├── .gitignore ├── lib ├── course_scraper │ ├── version.rb │ ├── course.rb │ ├── category.rb │ ├── spain.rb │ └── catalonia.rb └── course_scraper.rb ├── Gemfile ├── Rakefile ├── test ├── test_helper.rb └── course_scraper │ ├── spain_test.rb │ └── catalonia_test.rb ├── Readme.md ├── course_scraper.gemspec └── .rvmrc /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | .bundle 3 | Gemfile.lock 4 | pkg/* 5 | -------------------------------------------------------------------------------- /lib/course_scraper/version.rb: -------------------------------------------------------------------------------- 1 | module CourseScraper 2 | VERSION = "0.0.1" 3 | end 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "http://rubygems.org" 2 | 3 | # Specify your gem's dependencies in course_scraper.gemspec 4 | gemspec 5 | gem 'rake' 6 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | require 'bundler' 3 | Bundler::GemHelper.install_tasks 4 | 5 | require 'rake/testtask' 6 | Rake::TestTask.new do |t| 7 | t.libs << "test" 8 | t.test_files = FileList['./test/**/*_test.rb'] 9 | end 10 | 11 | task :default => [:test] 12 | -------------------------------------------------------------------------------- /lib/course_scraper.rb: -------------------------------------------------------------------------------- 1 | require_relative 'course_scraper/version' 2 | require_relative 'course_scraper/course' 3 | require_relative 'course_scraper/category' 4 | require_relative 'course_scraper/catalonia' 5 | require_relative 'course_scraper/spain' 6 | 7 | module CourseScraper 8 | end 9 | 10 | -------------------------------------------------------------------------------- 
/test/test_helper.rb: -------------------------------------------------------------------------------- 1 | gem 'minitest' 2 | require 'minitest/spec' 3 | require 'minitest/autorun' 4 | require 'mocha' 5 | require 'vcr' 6 | require_relative '../lib/course_scraper' 7 | 8 | VCR.config do |c| 9 | c.cassette_library_dir = "test/fixtures/cassettes" 10 | c.stub_with :webmock 11 | c.default_cassette_options = { :record => :new_episodes } 12 | end 13 | -------------------------------------------------------------------------------- /lib/course_scraper/course.rb: -------------------------------------------------------------------------------- 1 | module CourseScraper 2 | # Public: A vocational course. 3 | # 4 | class Course 5 | attr_reader :name, :type 6 | 7 | # Public: Initializes a new Course. 8 | # 9 | # name - the String name of the course 10 | # type - the Symbol type of the course (either :high or :medium). 11 | def initialize(name, type) 12 | @name = name 13 | @type = type 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/course_scraper/category.rb: -------------------------------------------------------------------------------- 1 | module CourseScraper 2 | # Public: A named category that holds a list of vocational courses. 3 | # 4 | class Category 5 | attr_reader :name, :courses 6 | 7 | # Public: Initializes a new Category. 8 | # 9 | # name - the String name of the category 10 | # courses - the Array of courses of the category; nil is stored as an empty Array 11 | def initialize(name, courses) 12 | @name = name 13 | @courses = courses || [] 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # course_scraper 2 | 3 | A course scraper that gets all the vocational training courses in Catalonia 4 | and Spain. 
5 | 6 | ## Install 7 | 8 | In your Gemfile: 9 | 10 | gem 'course_scraper', git: 'https://github.com/codegram/course_scraper.git' 11 | 12 | ## Usage 13 | 14 | ```ruby 15 | require 'course_scraper' 16 | catalan_categories = CourseScraper::Catalonia.scrape 17 | 18 | catalan_categories.each do |category| 19 | category.name 20 | # => "Administracio" 21 | category.courses.each do |course| 22 | course.name 23 | # => "Transport" 24 | course.type 25 | # => :medium 26 | end 27 | end 28 | 29 | spanish_categories = CourseScraper::Spain.scrape 30 | 31 | spanish_categories.each do |category| 32 | category.name 33 | # => "Administracion" 34 | category.courses.each do |course| 35 | course.name 36 | # => "Transporte" 37 | course.type 38 | # => :medium 39 | end 40 | end 41 | ``` 42 | -------------------------------------------------------------------------------- /course_scraper.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | $:.push File.expand_path("../lib", __FILE__) 3 | require "course_scraper/version" 4 | 5 | Gem::Specification.new do |s| 6 | s.name = "course_scraper" 7 | s.version = CourseScraper::VERSION 8 | s.authors = ["Josep M. 
Bach"] 9 | s.email = ["josep.m.bach@gmail.com"] 10 | s.homepage = "http://github.com/codegram/course_scraper" 11 | s.summary = %q{Gives you a list of all courses in Spanish vocational education.} 12 | s.description = %q{Gives you a list of all courses in Spanish vocational education.} 13 | 14 | s.rubyforge_project = "course_scraper" 15 | 16 | s.files = `git ls-files`.split("\n") 17 | s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n") 18 | s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) } 19 | s.require_paths = ["lib"] 20 | 21 | s.add_runtime_dependency "capybara" 22 | s.add_runtime_dependency "capybara-webkit" 23 | s.add_development_dependency "vcr" 24 | s.add_development_dependency "webmock" 25 | s.add_development_dependency "mocha" 26 | s.add_development_dependency "minitest" 27 | end 28 | -------------------------------------------------------------------------------- /.rvmrc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This is an RVM Project .rvmrc file, used to automatically load the ruby 4 | # development environment upon cd'ing into the directory 5 | 6 | # First we specify our desired ruby[@gemset], the @gemset name is optional. 7 | environment_id="1.9.3@course_scraper" 8 | 9 | # 10 | # First we attempt to load the desired environment directly from the environment 11 | # file. This is very fast and efficient compared to running through the entire 12 | # CLI and selector. If you want feedback on which environment was used then 13 | # insert the word 'use' after --create as this triggers verbose mode. 14 | # 15 | if [[ -d "${rvm_path:-$HOME/.rvm}/environments" \ 16 | && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]] 17 | then 18 | \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id" 19 | 20 | if [[ -s ".rvm/hooks/after_use" ]] 21 | then 22 | . 
".rvm/hooks/after_use" 23 | fi 24 | else 25 | # If the environment file has not yet been created, use the RVM CLI to select. 26 | if ! rvm --create "$environment_id" 27 | then 28 | echo "Failed to create RVM environment ''." 29 | fi 30 | fi 31 | 32 | # 33 | # If you use an RVM gemset file to install a list of gems (*.gems), you can have 34 | # it be automatically loaded. Uncomment the following and adjust the filename if 35 | # necessary. 36 | # 37 | # filename=".gems" 38 | # if [[ -s "$filename" ]] ; then 39 | # rvm gemset import "$filename" | grep -v already | grep -v listed | grep -v complete | sed '/^$/d' 40 | # fi 41 | 42 | # 43 | # If you use bundler and would like to run bundle each time you enter the 44 | # directory, you can uncomment the following code. 45 | # 46 | export PATH="./bin:$PATH" 47 | # 48 | -------------------------------------------------------------------------------- /test/course_scraper/spain_test.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | require_relative '../test_helper' 3 | 4 | module CourseScraper 5 | describe Spain do 6 | subject { Spain.new } 7 | 8 | before do 9 | subject.setup_capybara 10 | end 11 | 12 | describe ".scrape" do 13 | it 'delegates to an instance' do 14 | Spain.stubs(:new).returns subject 15 | subject.expects(:scrape) 16 | 17 | Spain.scrape 18 | end 19 | end 20 | 21 | describe "#visit_category_list" do 22 | it 'visit the category list' do 23 | subject.expects(:visit) 24 | subject.visit_category_list 25 | end 26 | end 27 | 28 | describe "#each_category" do 29 | it 'calls a block for each category' do 30 | names = [] 31 | urls = [] 32 | subject.each_category do |name, url| 33 | names << name 34 | urls << url 35 | end 36 | 37 | names = names[0..2] 38 | urls = urls[0..2] 39 | 40 | names.must_equal [ 41 | "Actividades Físicas y Deportivas", 42 | "Administración y Gestión", 43 | "Agraria" 44 | ] 45 | urls.must_equal [ 46 | 
"/todofp/formacion/que-y-como-estudiar/oferta-formativa/todos-los-estudios/actividades-fisico-deportivas.html", 47 | "/todofp/formacion/que-y-como-estudiar/oferta-formativa/todos-los-estudios/administracion-gestion.html", 48 | "/todofp/formacion/que-y-como-estudiar/oferta-formativa/todos-los-estudios/agraria.html" 49 | ] 50 | end 51 | end 52 | 53 | describe "#each_course" do 54 | it 'calls a block for each course in a category' do 55 | category_url = "http://todofp.es/todofp/formacion/que-y-como-estudiar/oferta-formativa/todos-los-estudios/actividades-fisico-deportivas.html" 56 | 57 | names = [] 58 | types = [] 59 | 60 | VCR.use_cassette(:spanish_courses) do 61 | subject.each_course(category_url) do |name, type| 62 | names << name 63 | types << type 64 | end 65 | end 66 | 67 | names[0].must_equal "Conducción de actividades físico-deportivas en el medio natural" 68 | names[1].must_equal "Animación de actividades físico-deportivas" 69 | 70 | types[0].must_equal :medium 71 | types[1].must_equal :high 72 | end 73 | end 74 | end 75 | end 76 | 77 | -------------------------------------------------------------------------------- /test/course_scraper/catalonia_test.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | require_relative '../test_helper' 3 | 4 | module CourseScraper 5 | describe Catalonia do 6 | subject { Catalonia.new } 7 | 8 | before do 9 | subject.setup_capybara 10 | end 11 | 12 | describe ".scrape" do 13 | it 'delegates to an instance' do 14 | Catalonia.stubs(:new).returns subject 15 | subject.expects(:scrape) 16 | 17 | Catalonia.scrape 18 | end 19 | end 20 | 21 | describe "#visit_category_list" do 22 | it 'visit the category list' do 23 | subject.expects(:visit) 24 | subject.visit_category_list 25 | end 26 | end 27 | 28 | describe "#each_category" do 29 | it 'calls a block for each category' do 30 | names = [] 31 | urls = [] 32 | subject.each_category do |name, url| 33 | names << name 34 | urls << 
url 35 | end 36 | 37 | names = names[0..2] 38 | urls = urls[0..2] 39 | 40 | names.must_equal [ 41 | "Activitats físiques i esportives", 42 | "Agrària", 43 | "Comerç i màrqueting" 44 | ] 45 | urls.must_equal [ 46 | "/portal/site/queestudiar/menuitem.796f7d19c318c94cd56a1c76b0c0e1a0/?vgnextoid=e01237a9f4f2b210VgnVCM2000009b0c1e0aRCRD&vgnextchannel=e01237a9f4f2b210VgnVCM2000009b0c1e0aRCRD&vgnextfmt=default", 47 | "/portal/site/queestudiar/menuitem.796f7d19c318c94cd56a1c76b0c0e1a0/?vgnextoid=d6e237a9f4f2b210VgnVCM2000009b0c1e0aRCRD&vgnextchannel=d6e237a9f4f2b210VgnVCM2000009b0c1e0aRCRD&vgnextfmt=default", 48 | "/portal/site/queestudiar/menuitem.796f7d19c318c94cd56a1c76b0c0e1a0/?vgnextoid=1d8337a9f4f2b210VgnVCM2000009b0c1e0aRCRD&vgnextchannel=1d8337a9f4f2b210VgnVCM2000009b0c1e0aRCRD&vgnextfmt=default" 49 | ] 50 | end 51 | end 52 | 53 | describe "#each_course" do 54 | it 'calls a block for each course in a category' do 55 | category_url = "http://www20.gencat.cat/portal/site/queestudiar/menuitem.796f7d19c318c94cd56a1c76b0c0e1a0/?vgnextoid=e01237a9f4f2b210VgnVCM2000009b0c1e0aRCRD&vgnextchannel=e01237a9f4f2b210VgnVCM2000009b0c1e0aRCRD&vgnextfmt=default://google.com" 56 | 57 | names = [] 58 | types = [] 59 | 60 | VCR.use_cassette(:catalan_courses) do 61 | subject.each_course(category_url) do |name, type| 62 | names << name 63 | types << type 64 | end 65 | end 66 | 67 | names[0].must_match /i esportives/ 68 | names[1].must_match /sicoesportives en el medi natural/ 69 | names[2].must_match /en el medi natural, perfil professional/ 70 | 71 | types[0].must_equal :high 72 | types[1].must_equal :medium 73 | types[2].must_equal :medium 74 | end 75 | end 76 | end 77 | end 78 | -------------------------------------------------------------------------------- /lib/course_scraper/spain.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | require 'capybara' 3 | require 'capybara/dsl' 4 | require 'capybara-webkit' 5 | 6 | module 
CourseScraper 7 | # Public: A scraper for all the vocational training courses in Spain. 8 | # 9 | # Examples 10 | # 11 | # categories = Spain.scrape 12 | # # => Array of CourseScraper::Category instances, each with nested courses 13 | # 14 | class Spain 15 | include Capybara::DSL 16 | 17 | # Public: Instantiates a new scraper and fires it to grab all the vocational 18 | # training courses in Spain. 19 | # 20 | # Returns the Array collection of CourseScraper::Category instances with nested Courses. 21 | def self.scrape 22 | new.scrape 23 | end 24 | 25 | # Internal: Sets the configuration for capybara to work with the Todofp 26 | # website. 27 | # 28 | # Returns nothing. 29 | def setup_capybara 30 | Capybara.run_server = false 31 | Capybara.current_driver = :webkit 32 | Capybara.app_host = 'http://todofp.es' 33 | end 34 | 35 | # Public: Scrapes the vocational training courses in Spain. 36 | # 37 | # Returns the Array collection of CourseScraper::Category instances with 38 | # nested Courses. 39 | def scrape 40 | setup_capybara 41 | 42 | categories = [] 43 | each_category do |name, href| 44 | categories << { name: name, url: href } 45 | end 46 | 47 | categories.map do |category| 48 | cat = Category.new(category[:name], []) 49 | each_course category[:url] do |name, type| 50 | cat.courses << Course.new(name, type) 51 | end 52 | cat 53 | end 54 | end 55 | 56 | # Internal: Visits the main page where the course categories are listed. 57 | # 58 | # Returns nothing. 59 | def visit_category_list 60 | visit '/todofp/formacion/que-y-como-estudiar/oferta-formativa/todos-los-estudios.html' 61 | end 62 | 63 | # Internal: Call a block for every category. 64 | # 65 | # Yields the String name of the category and its String URL. 66 | # 67 | # Returns nothing. 68 | def each_category(&block) 69 | visit_category_list 70 | 71 | links = [] 72 | 73 | within ".columnas-fp" do 74 | links = all('a') 75 | end 76 | 77 | links.each do |link| 78 | block.call link.text, link[:href] 79 | end 80 | end 81 | 82 | # Internal: Call a block for every course in a category URL. 
83 | # 84 | # category_url - the String category URL 85 | # 86 | # Yields the String name of the course and its Symbol type (:medium for items of the first list under .columnas-familiafp, :high for items of the second). 87 | # 88 | # Returns nothing. 89 | def each_course(category_url, &block) 90 | visit category_url 91 | 92 | courses = [] 93 | 94 | all('.columnas-familiafp ul:nth-of-type(1) li').each do |course| 95 | courses << [course.text, :medium] 96 | end 97 | 98 | all('.columnas-familiafp ul:nth-of-type(2) li').each do |course| 99 | courses << [course.text, :high] 100 | end 101 | 102 | courses.each do |course| 103 | block.call *course 104 | end 105 | end 106 | end 107 | end 108 | -------------------------------------------------------------------------------- /lib/course_scraper/catalonia.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'capybara/dsl' 3 | require 'capybara-webkit' 4 | 5 | module CourseScraper 6 | # Public: A scraper for all the vocational training courses in Catalonia. 7 | # 8 | # Examples 9 | # 10 | # categories = Catalonia.scrape 11 | # # => Array of CourseScraper::Category instances, each with nested courses 12 | # 13 | class Catalonia 14 | include Capybara::DSL 15 | 16 | # Public: Instantiates a new scraper and fires it to grab all the vocational 17 | # training courses in Catalonia. 18 | # 19 | # Returns the Array collection of CourseScraper::Category instances with nested Courses. 20 | def self.scrape 21 | new.scrape 22 | end 23 | 24 | # Internal: Sets the configuration for capybara to work with the Gencat 25 | # website. 26 | # 27 | # Returns nothing. 28 | def setup_capybara 29 | Capybara.run_server = false 30 | Capybara.current_driver = :webkit 31 | Capybara.app_host = 'http://www20.gencat.cat' 32 | end 33 | 34 | # Public: Scrapes the vocational training courses in Catalonia. 35 | # 36 | # Returns the Array collection of CourseScraper::Category instances with 37 | # nested Courses. 
38 | def scrape 39 | setup_capybara 40 | 41 | categories = [] 42 | each_category do |name, href| 43 | categories << { name: name, url: href } 44 | end 45 | 46 | categories.map do |category| 47 | cat = Category.new(category[:name], []) 48 | each_course category[:url] do |name, type| 49 | cat.courses << Course.new(name, type) 50 | end 51 | cat 52 | end 53 | end 54 | 55 | # Internal: Visits the main page where the course categories are listed. 56 | # 57 | # Returns nothing. 58 | def visit_category_list 59 | visit "/portal/site/queestudiar/menuitem.d7cfc336363a7af8e85c7273b0c0e1a0/?vgnextoid=0a8137a9f4f2b210VgnVCM2000009b0c1e0aRCRD&vgnextchannel=0a8137a9f4f2b210VgnVCM2000009b0c1e0aRCRD&vgnextfmt=default" 60 | end 61 | 62 | # Internal: Call a block for every category link found inside the .FW_bColEsquerraCos and .FW_bColDretaCos containers of the category list page. 63 | # 64 | # Yields the String name of the category and its String URL. 65 | # 66 | # Returns nothing. 67 | def each_category(&block) 68 | visit_category_list 69 | 70 | links = [] 71 | 72 | within ".FW_bColEsquerraCos" do 73 | links += all('a') 74 | end 75 | 76 | within ".FW_bColDretaCos" do 77 | links += all('a') 78 | end 79 | 80 | links.each do |link| 81 | block.call link.text, link[:href] 82 | end 83 | end 84 | 85 | # Internal: Call a block for every course in a category URL. 86 | # 87 | # category_url - the String category URL 88 | # 89 | # Yields the String name of the course and its Symbol type (:high when the text of the entry's second list item matches /superior/, :medium otherwise). 90 | # 91 | # Returns nothing. 92 | def each_course(category_url, &block) 93 | visit category_url 94 | 95 | courses = [] 96 | 97 | all('.CEDU_vertical').each do |ul| 98 | type = ul.find('li:nth-of-type(2)').text =~ /superior/ ? :high : :medium 99 | courses << [ul.find('li.titol').text, type] 100 | end 101 | 102 | courses.each do |course| 103 | block.call *course 104 | end 105 | end 106 | end 107 | end 108 | --------------------------------------------------------------------------------