├── log └── .keep ├── .rspec ├── app ├── mailers │ ├── .keep │ └── notification_mailer.rb ├── models │ ├── .keep │ ├── concerns │ │ └── .keep │ └── page.rb ├── assets │ ├── images │ │ └── .keep │ ├── stylesheets │ │ ├── global.sass │ │ ├── static.css.scss │ │ ├── bootstrap_and_overrides.css │ │ └── application.css │ └── javascripts │ │ ├── bootstrap.js.coffee │ │ ├── static.js.coffee │ │ └── application.js ├── controllers │ ├── concerns │ │ └── .keep │ ├── static_controller.rb │ └── application_controller.rb ├── views │ ├── notification_mailer │ │ ├── report.html.haml │ │ ├── notify_error.html.haml │ │ └── notify_success.html.haml │ ├── static │ │ └── index.html.haml │ └── layouts │ │ └── application.html.haml └── helpers │ ├── static_helper.rb │ ├── application_helper.rb │ └── string_helper.rb ├── lib ├── assets │ └── .keep ├── tasks │ ├── .keep │ ├── import_urls.rake │ ├── whois_records_query.rake │ └── scrape_urls.rake ├── graceful_shutdown.rb ├── parsed_link.rb ├── open_uri_scrape.rb └── page_parser.rb ├── public ├── favicon.ico ├── robots.txt ├── 500.html ├── 422.html └── 404.html ├── .ruby-gemset ├── .ruby-version ├── vendor ├── assets │ ├── javascripts │ │ └── .keep │ └── stylesheets │ │ └── .keep └── urls.txt ├── Procfile ├── bin ├── rake ├── bundle └── rails ├── spec ├── controllers │ └── static_controller_spec.rb ├── models │ └── page_spec.rb ├── mailers │ └── notification_spec.rb ├── helpers │ └── static_helper_spec.rb └── spec_helper.rb ├── config.ru ├── config ├── boot.rb ├── environment.rb ├── initializers │ ├── session_store.rb │ ├── filter_parameter_logging.rb │ ├── mime_types.rb │ ├── mailer.rb │ ├── backtrace_silencers.rb │ ├── wrap_parameters.rb │ ├── inflections.rb │ ├── secret_token.rb │ └── slug_extensions.rb ├── database.yml ├── locales │ ├── en.bootstrap.yml │ └── en.yml ├── application.rb ├── environments │ ├── development.rb │ ├── test.rb │ └── production.rb └── routes.rb ├── Rakefile ├── db ├── seeds.rb ├── migrate │ └── 
20131227162259_create_pages.rb └── schema.rb ├── .gitignore ├── Gemfile ├── README.md └── Gemfile.lock /log/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --color 2 | -------------------------------------------------------------------------------- /app/mailers/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/models/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/assets/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/tasks/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/assets/images/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.ruby-gemset: -------------------------------------------------------------------------------- 1 | link_scraper -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | ruby-2.0.0-p195 -------------------------------------------------------------------------------- 
/app/models/concerns/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/controllers/concerns/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vendor/assets/javascripts/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vendor/assets/stylesheets/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | worker: bundle exec rake pages:work 2 | -------------------------------------------------------------------------------- /app/views/notification_mailer/report.html.haml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/views/notification_mailer/notify_error.html.haml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/views/notification_mailer/notify_success.html.haml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/helpers/static_helper.rb: -------------------------------------------------------------------------------- 1 | module StaticHelper 2 | end 3 | -------------------------------------------------------------------------------- /app/views/static/index.html.haml: 
# Error used to unwind a long-running job when the process is asked to stop.
# The pages:work rake task raises it from its TERM signal handler and rescues
# it around the scrape loop, so the worker exits without a backtrace.
GracefulShutdown = Class.new(StandardError)
require "spec_helper"

# Specs for the notification mailer. The class under test is
# NotificationMailer (app/mailers/notification_mailer.rb); the previously
# referenced `Notification` constant is not defined anywhere in the app, so
# merely loading this file raised a NameError.
describe NotificationMailer do
  pending "add some examples to (or delete) #{__FILE__}"
end
# Set up gems listed in the Gemfile.
# Respect a BUNDLE_GEMFILE supplied by the caller; otherwise point at the
# Gemfile in the application root (one directory above config/).
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../../Gemfile', __FILE__)

# Only activate Bundler when that Gemfile is actually present.
# File.exist? is used because File.exists? is deprecated and no longer
# available on current Ruby versions.
require 'bundler/setup' if File.exist?(ENV['BUNDLE_GEMFILE'])
2 | 3 | # Configure sensitive parameters which will be filtered from the log file. 4 | Rails.application.config.filter_parameters += [:password] 5 | -------------------------------------------------------------------------------- /config/initializers/mime_types.rb: -------------------------------------------------------------------------------- 1 | # Be sure to restart your server when you modify this file. 2 | 3 | # Add new mime types for use in respond_to blocks: 4 | # Mime::Type.register "text/richtext", :rtf 5 | # Mime::Type.register_alias "text/html", :iphone 6 | -------------------------------------------------------------------------------- /app/assets/javascripts/static.js.coffee: -------------------------------------------------------------------------------- 1 | # Place all the behaviors and hooks related to the matching controller here. 2 | # All this logic will automatically be available in application.js. 3 | # You can use CoffeeScript in this file: http://coffeescript.org/ 4 | -------------------------------------------------------------------------------- /config/database.yml: -------------------------------------------------------------------------------- 1 | development: 2 | adapter: postgresql 3 | database: trackback_scraper-dev 4 | pool: 10 5 | timeout: 5000 6 | 7 | test: 8 | adapter: postgresql 9 | database: trackback_scraper-test 10 | pool: 10 11 | timeout: 5000 12 | -------------------------------------------------------------------------------- /app/controllers/application_controller.rb: -------------------------------------------------------------------------------- 1 | class ApplicationController < ActionController::Base 2 | # Prevent CSRF attacks by raising an exception. 3 | # For APIs, you may want to use :null_session instead. 
# String utilities that can be called directly on the module
# (StringHelper.coerce_to_utf8) or mixed into another object.
module StringHelper
  extend self

  # Returns a copy of +input+ that is guaranteed to be valid UTF-8.
  #
  # input - a String in any encoding; it is never mutated.
  #
  # If the bytes already form valid UTF-8 they are returned unchanged (only
  # re-tagged as UTF-8). Otherwise every invalid byte sequence is replaced
  # with U+FFFD while well-formed characters are preserved.
  def coerce_to_utf8(input)
    output = input.dup.force_encoding("UTF-8")

    return output if output.valid_encoding?

    # Round-trip through UTF-16 so that only the malformed sequences are
    # replaced. Treating the whole string as BINARY would replace every
    # non-ASCII byte, destroying valid multibyte characters as well. A direct
    # UTF-8 -> UTF-8 encode is avoided because older Rubies treat it as a no-op.
    output.encode("UTF-16BE", "UTF-8", invalid: :replace, undef: :replace).encode("UTF-8")
  end
end
8 | -------------------------------------------------------------------------------- /spec/helpers/static_helper_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | # Specs in this file have access to a helper object that includes 4 | # the StaticHelper. For example: 5 | # 6 | # describe StaticHelper do 7 | # describe "string concat" do 8 | # it "concats two strings with spaces" do 9 | # expect(helper.concat_strings("this","that")).to eq("this that") 10 | # end 11 | # end 12 | # end 13 | describe StaticHelper do 14 | pending "add some examples to (or delete) #{__FILE__}" 15 | end 16 | -------------------------------------------------------------------------------- /config/locales/en.bootstrap.yml: -------------------------------------------------------------------------------- 1 | # Sample localization file for English. Add more files in this directory for other locales. 2 | # See https://github.com/svenfuchs/rails-i18n/tree/master/rails%2Flocale for starting points. 3 | 4 | en: 5 | helpers: 6 | actions: "Actions" 7 | links: 8 | back: "Back" 9 | cancel: "Cancel" 10 | confirm: "Are you sure?" 11 | destroy: "Delete" 12 | new: "New" 13 | edit: "Edit" 14 | titles: 15 | edit: "Edit %{model}" 16 | save: "Save %{model}" 17 | new: "New %{model}" 18 | delete: "Delete %{model}" 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files for more about ignoring files. 2 | # 3 | # If you find yourself ignoring temporary files generated by your text editor 4 | # or operating system, you probably want to add a global ignore instead: 5 | # git config --global core.excludesfile '~/.gitignore_global' 6 | 7 | # Ignore bundler config. 8 | /.bundle 9 | 10 | # Ignore the default SQLite database. 
11 | /db/*.sqlite3 12 | /db/*.sqlite3-journal 13 | 14 | # Ignore all logfiles and tempfiles. 15 | /log/*.log 16 | /tmp 17 | vendor/all_urls.txt 18 | vendor/domain_lookups.csv 19 | .env 20 | -------------------------------------------------------------------------------- /config/initializers/wrap_parameters.rb: -------------------------------------------------------------------------------- 1 | # Be sure to restart your server when you modify this file. 2 | 3 | # This file contains settings for ActionController::ParamsWrapper which 4 | # is enabled by default. 5 | 6 | # Enable parameter wrapping for JSON. You can disable this by setting :format to an empty array. 7 | ActiveSupport.on_load(:action_controller) do 8 | wrap_parameters format: [:json] if respond_to?(:wrap_parameters) 9 | end 10 | 11 | # To enable root element in JSON for ActiveRecord objects. 12 | # ActiveSupport.on_load(:active_record) do 13 | # self.include_root_in_json = true 14 | # end 15 | -------------------------------------------------------------------------------- /app/assets/stylesheets/application.css: -------------------------------------------------------------------------------- 1 | /* 2 | * This is a manifest file that'll be compiled into application.css, which will include all the files 3 | * listed below. 4 | * 5 | * Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets, 6 | * or vendor/assets/stylesheets of plugins, if any, can be referenced here using a relative path. 7 | * 8 | * You're free to add application-wide styles to this file and they'll appear at the top of the 9 | * compiled file, but it's generally better to create a new file per style scope. 10 | * 11 | *= require_self 12 | *= require_tree . 
13 | */ 14 | -------------------------------------------------------------------------------- /config/initializers/inflections.rb: -------------------------------------------------------------------------------- 1 | # Be sure to restart your server when you modify this file. 2 | 3 | # Add new inflection rules using the following format. Inflections 4 | # are locale specific, and you may define rules for as many different 5 | # locales as you wish. All of these examples are active by default: 6 | # ActiveSupport::Inflector.inflections(:en) do |inflect| 7 | # inflect.plural /^(ox)$/i, '\1en' 8 | # inflect.singular /^(ox)en/i, '\1' 9 | # inflect.irregular 'person', 'people' 10 | # inflect.uncountable %w( fish sheep ) 11 | # end 12 | 13 | # These inflection rules are supported but not enabled by default: 14 | # ActiveSupport::Inflector.inflections(:en) do |inflect| 15 | # inflect.acronym 'RESTful' 16 | # end 17 | -------------------------------------------------------------------------------- /config/initializers/secret_token.rb: -------------------------------------------------------------------------------- 1 | # Be sure to restart your server when you modify this file. 2 | 3 | # Your secret key is used for verifying the integrity of signed cookies. 4 | # If you change this key, all old signed cookies will become invalid! 5 | 6 | # Make sure the secret is at least 30 characters and all random, 7 | # no regular words or you'll be exposed to dictionary attacks. 8 | # You can use `rake secret` to generate a secure secret key. 9 | 10 | # Make sure your secret_key_base is kept private 11 | # if you're sharing your code publicly. 
# Slug-related extensions to String.
class String
  # Builds a lowercase, dash-separated slug from the receiver.
  #
  # Steps, in order: trim and downcase, transliterate, normalise punctuation,
  # spell out a few symbols, drop selected punctuation, turn every remaining
  # run of non [a-z0-9-] characters into one dash, collapse repeated dashes,
  # and strip a leading/trailing dash.
  #
  # NOTE(review): convert_smart_punctuation is not defined in this file;
  # presumably it comes from the stringex gem listed in the Gemfile - confirm.
  def to_slug
    text = strip.downcase.transliterate
    text = text.convert_smart_punctuation
    text = text.convert_misc_characters.convert_dollar_signs
    text = text.gsub(/[`'()*,.#´]/, '')
    text = text.gsub(/[^a-z0-9\-]+/i, '-')
    text = text.gsub(/\-{2,}/, '-')
    text.gsub(/^\-|\-$/i, '').to_s
  end

  # Replaces whitespace followed by "$" with " s" when the dollar sign is
  # directly followed by a letter or digit.
  def convert_dollar_signs
    dollar_before_alnum = /\s+\$(?=[a-z0-9])/i
    gsub(dollar_before_alnum, ' s')
  end

  # Spells out a whitespace-surrounded "&" as " and " and "@" as " at ", and
  # turns runs of two or more dots into a single space.
  def convert_misc_characters
    substitutions = [
      [/\s+&\s+/, ' and '],
      [/\s+@\s+/, ' at '],
      [/[.]{2,}/, ' ']
      # A percent rule (/(\d)%(\s|$)/ => '\1 percent ') is intentionally left disabled.
    ]

    substitutions.inject(self) do |text, (pattern, replacement)|
      text.gsub(pattern, replacement)
    end
  end

  # Delegates to ActiveSupport's transliteration of the receiver.
  def transliterate
    ActiveSupport::Inflector.transliterate(self)
  end
end
# Mailer for scrape notifications: success, error, and CSV report emails.
class NotificationMailer < ActionMailer::Base
  default from: "notifications@link_scraper.com"

  # Recipient used when a caller does not supply one; read from the
  # environment once, when the class is loaded.
  DEFAULT_TO_ADDRESS = ENV['SEND_NOTIFICATIONS_TO_EMAIL']

  class << self
    # True only when both a default recipient and the Mailgun SMTP login are
    # present in the environment.
    def configured?
      return false unless DEFAULT_TO_ADDRESS.present?

      ENV['MAILGUN_SMTP_LOGIN'].present?
    end
  end

  # Announces a finished run; +scraped+ is interpolated into the subject.
  def notify_success(scraped)
    mail(to: DEFAULT_TO_ADDRESS, subject: "Finished #{scraped} pages")
  end

  # Announces a failure; the subject carries the first 50 characters of
  # +message+ followed by an ellipsis.
  def notify_error(message)
    excerpt = message.first(50)

    mail(to: DEFAULT_TO_ADDRESS, subject: "An error happened: #{excerpt}...")
  end

  # Sends +report+ as the attachment "report.csv".
  #
  # subject - subject line (default "Completed Report")
  # to      - recipient (default DEFAULT_TO_ADDRESS)
  def report(report, subject: "Completed Report", to: DEFAULT_TO_ADDRESS)
    attachments['report.csv'] = report

    mail(to: to, subject: subject)
  end
end
:largest_link_clump_size_fuzzy_match 21 | end 22 | 23 | add_index :pages, :url, unique: true 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Rap Genius Trackback Scraper 2 | 3 | This is the tool we used to scrape 178k URLs in 15 minutes in order to find which pages were hosting potentially spammy Rap Genius links. Given a list of URLs to scrape, it creates aggregate information that identifies the spammiest sites for manual review. 4 | 5 | For more details on the motivation and background for this repository, check out [the blog post on Rap Genius](http://news.rapgenius.com/Rap-genius-founders-rap-genius-is-back-on-google-lyrics) 6 | 7 | ### Setup 8 | 9 | You can run the scrape process using a set of sample data in vendor/urls.txt. To get started: 10 | 11 | ```sh 12 | $ bundle install && rake db:create db:migrate urls:import 13 | $ gem install foreman 14 | $ mkdir tmp 15 | $ foreman start worker 16 | ``` 17 | 18 | Then, once the pages have all been scraped (i.e., `Page.unscraped.count == 0`): 19 | 20 | ```ruby 21 | # from the console 22 | Page.write_report! 
desc 'scrape whois contacts'
task 'whois:scrape' do
  # Input: vendor/domain_lookups.csv. The first row is skipped and the
  # stripped first column of each remaining row is looked up.
  input_rows = CSV.parse(File.read(Rails.root.join('vendor/domain_lookups.csv')))
  domains = input_rows.drop(1).map { |row| row.first.strip }

  client = Whois::Client.new

  # Output: tmp/domain_lookups_output.csv with one row per input entry.
  CSV.open(Rails.root.join('tmp', 'domain_lookups_output.csv'), 'wb') do |csv|
    csv << %w(domain contact)

    domains.each do |entry|
      # Entries made only of digits and dots (e.g. IP addresses) are queried
      # as-is; anything else is reduced to its last two dot-separated labels.
      lookup_target = entry =~ /^(\d+\.?)+$/ ? entry : entry.split('.').last(2).join('.')

      attempts = 0
      email = nil

      begin
        email = client.lookup(lookup_target).technical_contact.try(:email)
      rescue Whois::WebInterfaceError, Whois::NoInterfaceError
        # Best effort: these errors are ignored and the row is written as 'Unknown'.
      rescue Timeout::Error, Whois::ConnectionError
        # Timeouts and connection errors are retried up to five times, then given up on.
        attempts += 1
        retry if attempts <= 5
      rescue => error
        puts "Error:"
        puts [entry, error].inspect
      end

      csv << [entry, email || 'Unknown']
    end
  end
end
desc 'delayed_job-like worker task for scraping pages'
task 'pages:work' => :environment do
  require 'graceful_shutdown'
  # Turn SIGTERM into an exception so the loop below unwinds and the task
  # returns normally.
  trap('TERM') { raise GracefulShutdown }

  # Pages per batch; override with the SCRAPE_BATCH_SIZE environment variable.
  batch_size = ENV.fetch('SCRAPE_BATCH_SIZE', 200).to_i

  begin
    loop do
      start = Time.now

      Rails.logger.info "Scraping batch of #{batch_size}"
      Page.scrape_batch(batch_size)
      Rails.logger.info "Done scraping batch of #{batch_size} at #{((Time.now.to_f - start.to_f) / batch_size.to_f).round(2)}seconds/page"

      # Back off when there is nothing left to scrape. Checking Page.count
      # only slept while the table was completely empty, so the worker spun
      # without pause once every page had been scraped.
      # NOTE(review): Page.unscraped is the "all done" check used in the
      # README; the Page model is not visible here - confirm the scope.
      sleep 10 if Page.unscraped.count.zero?
    end
  rescue GracefulShutdown
    # Deliberately empty: a TERM signal just ends the loop.
  end
end
:page_javascript 34 | -------------------------------------------------------------------------------- /config/application.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path('../boot', __FILE__) 2 | 3 | # Pick the frameworks you want: 4 | require "active_record/railtie" 5 | require "action_controller/railtie" 6 | require "action_mailer/railtie" 7 | require "sprockets/railtie" 8 | # require "rails/test_unit/railtie" 9 | 10 | # Require the gems listed in Gemfile, including any gems 11 | # you've limited to :test, :development, or :production. 12 | Bundler.require(:default, Rails.env) 13 | 14 | require 'csv' 15 | 16 | module TrackbackScraper 17 | class Application < Rails::Application 18 | # Settings in config/environments/* take precedence over those specified here. 19 | # Application configuration should go into files in config/initializers 20 | # -- all .rb files in that directory are automatically loaded. 21 | 22 | # Set Time.zone default to the specified zone and make Active Record auto-convert to this zone. 23 | # Run "rake -D time" for a list of tasks for finding time zone names. Default is UTC. 24 | # config.time_zone = 'Central Time (US & Canada)' 25 | 26 | # The default locale is :en and all translations from config/locales/*.rb,yml are auto loaded. 27 | # config.i18n.load_path += Dir[Rails.root.join('my', 'locales', '*.{rb,yml}').to_s] 28 | # config.i18n.default_locale = :de 29 | 30 | config.autoload_paths += ['lib'] 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /config/environments/development.rb: -------------------------------------------------------------------------------- 1 | TrackbackScraper::Application.configure do 2 | # Settings specified here will take precedence over those in config/application.rb. 3 | 4 | # In the development environment your application's code is reloaded on 5 | # every request. 
This slows down response time but is perfect for development 6 | # since you don't have to restart the web server when you make code changes. 7 | config.cache_classes = false 8 | 9 | # Do not eager load code on boot. 10 | config.eager_load = false 11 | 12 | # Show full error reports and disable caching. 13 | config.consider_all_requests_local = true 14 | config.action_controller.perform_caching = false 15 | 16 | # Raise an error if the mailer can't send; deliveries are performed over SMTP. 17 | config.action_mailer.raise_delivery_errors = true 18 | config.action_mailer.perform_deliveries = true 19 | config.action_mailer.delivery_method = :smtp 20 | 21 | # Print deprecation notices to the Rails logger. 22 | config.active_support.deprecation = :log 23 | 24 | # Raise an error on page load if there are pending migrations 25 | config.active_record.migration_error = :page_load 26 | 27 | # Debug mode disables concatenation and preprocessing of assets. 28 | # This option may cause significant delays in view rendering with a large 29 | # number of complex assets. 30 | config.assets.debug = true 31 | 32 | config.action_mailer.default_url_options = { host: 'link_scraper_dev.com' } 33 | end 34 | -------------------------------------------------------------------------------- /public/500.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 |If you are the application owner check the logs for more information.
56 | 57 | 58 | -------------------------------------------------------------------------------- /lib/parsed_link.rb: -------------------------------------------------------------------------------- 1 | class ParsedLink 2 | include StringHelper 3 | 4 | attr_accessor :rg_link, :song_page_link, :annotation_link, :lyrics_text_link, :rg_text_format, :inner_text, :href 5 | alias_method :rg_link?, :rg_link 6 | alias_method :song_page_link?, :song_page_link 7 | alias_method :annotation_link?, :annotation_link 8 | alias_method :lyrics_text_link?, :lyrics_text_link 9 | alias_method :rg_text_format?, :rg_text_format 10 | 11 | def initialize(link) 12 | return unless self.href = link['href'].presence 13 | self.href = coerce_to_utf8(href) 14 | 15 | begin 16 | parsed = Addressable::URI.parse(href) 17 | rescue Addressable::URI::InvalidURIError 18 | return 19 | end 20 | 21 | return unless parsed.host =~ /(?:^|\.)rapgenius.com\z/i 22 | self.rg_link = true 23 | 24 | path, self.inner_text = coerce_to_utf8(parsed.path), coerce_to_utf8(link.inner_text) 25 | 26 | if path =~ /-lyrics\z/i 27 | self.song_page_link = true 28 | 29 | if inner_text =~ /\sLyrics\z/ 30 | self.lyrics_text_link = true 31 | 32 | artist, title = link.inner_text.split(/\s+–\s+/) 33 | return unless artist.present? && title.present? 34 | 35 | title.chomp!(" Lyrics") 36 | 37 | self.rg_text_format = true if "/#{artist.to_slug}-#{title.to_slug}".downcase == parsed.path.dup.chomp("-lyrics").downcase 38 | end 39 | elsif path =~ %r(\A/\d+(\z|/)) 40 | self.annotation_link = true 41 | end 42 | end 43 | 44 | def self.parse(link) 45 | new(link) if link 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /public/422.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |Maybe you tried to change something you didn't have access to.
55 |If you are the application owner check the logs for more information.
57 | 58 | 59 | -------------------------------------------------------------------------------- /public/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |You may have mistyped the address or the page may have moved.
55 |If you are the application owner check the logs for more information.
57 | 58 | 59 | -------------------------------------------------------------------------------- /config/environments/test.rb: -------------------------------------------------------------------------------- 1 | TrackbackScraper::Application.configure do 2 | # Settings specified here will take precedence over those in config/application.rb. 3 | 4 | # The test environment is used exclusively to run your application's 5 | # test suite. You never need to work with it otherwise. Remember that 6 | # your test database is "scratch space" for the test suite and is wiped 7 | # and recreated between test runs. Don't rely on the data there! 8 | config.cache_classes = true 9 | 10 | # Do not eager load code on boot. This avoids loading your whole application 11 | # just for the purpose of running a single test. If you are using a tool that 12 | # preloads Rails for running tests, you may have to set it to true. 13 | config.eager_load = false 14 | 15 | # Configure static asset server for tests with Cache-Control for performance. 16 | config.serve_static_assets = true 17 | config.static_cache_control = "public, max-age=3600" 18 | 19 | # Show full error reports and disable caching. 20 | config.consider_all_requests_local = true 21 | config.action_controller.perform_caching = false 22 | 23 | # Raise exceptions instead of rendering exception templates. 24 | config.action_dispatch.show_exceptions = false 25 | 26 | # Disable request forgery protection in test environment. 27 | config.action_controller.allow_forgery_protection = false 28 | 29 | # Tell Action Mailer not to deliver emails to the real world. 30 | # The :test delivery method accumulates sent emails in the 31 | # ActionMailer::Base.deliveries array. 32 | config.action_mailer.delivery_method = :test 33 | 34 | # Print deprecation notices to the stderr. 
35 | config.active_support.deprecation = :stderr 36 | end 37 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # This file is copied to spec/ when you run 'rails generate rspec:install' 2 | ENV["RAILS_ENV"] ||= 'test' 3 | require File.expand_path("../../config/environment", __FILE__) 4 | require 'rspec/rails' 5 | require 'rspec/autorun' 6 | 7 | # Requires supporting ruby files with custom matchers and macros, etc, 8 | # in spec/support/ and its subdirectories. 9 | Dir[Rails.root.join("spec/support/**/*.rb")].each { |f| require f } 10 | 11 | # Checks for pending migrations before tests are run. 12 | # If you are not using ActiveRecord, you can remove this line. 13 | ActiveRecord::Migration.check_pending! if defined?(ActiveRecord::Migration) 14 | 15 | RSpec.configure do |config| 16 | # ## Mock Framework 17 | # 18 | # If you prefer to use mocha, flexmock or RR, uncomment the appropriate line: 19 | # 20 | # config.mock_with :mocha 21 | # config.mock_with :flexmock 22 | # config.mock_with :rr 23 | 24 | # Remove this line if you're not using ActiveRecord or ActiveRecord fixtures 25 | config.fixture_path = "#{::Rails.root}/spec/fixtures" 26 | 27 | # If you're not using ActiveRecord, or you'd prefer not to run each of your 28 | # examples within a transaction, remove the following line or assign false 29 | # instead of true. 30 | config.use_transactional_fixtures = true 31 | 32 | # If true, the base class of anonymous controllers will be inferred 33 | # automatically. This will be the default behavior in future versions of 34 | # rspec-rails. 35 | config.infer_base_class_for_anonymous_controllers = false 36 | 37 | # Run specs in random order to surface order dependencies. If you find an 38 | # order dependency and want to debug it, you can fix the order by providing 39 | # the seed, which is printed after each run. 
40 | # --seed 1234 41 | config.order = "random" 42 | end 43 | -------------------------------------------------------------------------------- /config/routes.rb: -------------------------------------------------------------------------------- 1 | TrackbackScraper::Application.routes.draw do 2 | root to: 'static#index' 3 | # The priority is based upon order of creation: first created -> highest priority. 4 | # See how all your routes lay out with "rake routes". 5 | 6 | # You can have the root of your site routed with "root" 7 | # root 'welcome#index' 8 | 9 | # Example of regular route: 10 | # get 'products/:id' => 'catalog#view' 11 | 12 | # Example of named route that can be invoked with purchase_url(id: product.id) 13 | # get 'products/:id/purchase' => 'catalog#purchase', as: :purchase 14 | 15 | # Example resource route (maps HTTP verbs to controller actions automatically): 16 | # resources :products 17 | 18 | # Example resource route with options: 19 | # resources :products do 20 | # member do 21 | # get 'short' 22 | # post 'toggle' 23 | # end 24 | # 25 | # collection do 26 | # get 'sold' 27 | # end 28 | # end 29 | 30 | # Example resource route with sub-resources: 31 | # resources :products do 32 | # resources :comments, :sales 33 | # resource :seller 34 | # end 35 | 36 | # Example resource route with more complex sub-resources: 37 | # resources :products do 38 | # resources :comments 39 | # resources :sales do 40 | # get 'recent', on: :collection 41 | # end 42 | # end 43 | 44 | # Example resource route with concerns: 45 | # concern :toggleable do 46 | # post 'toggle' 47 | # end 48 | # resources :posts, concerns: :toggleable 49 | # resources :photos, concerns: :toggleable 50 | 51 | # Example resource route within a namespace: 52 | # namespace :admin do 53 | # # Directs /admin/products/* to Admin::ProductsController 54 | # # (app/controllers/admin/products_controller.rb) 55 | # resources :products 56 | # end 57 | end 58 | 
-------------------------------------------------------------------------------- /db/schema.rb: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | # This file is auto-generated from the current state of the database. Instead 3 | # of editing this file, please use the migrations feature of Active Record to 4 | # incrementally modify your database, and then regenerate this schema definition. 5 | # 6 | # Note that this schema.rb definition is the authoritative source for your 7 | # database schema. If you need to create the application database on another 8 | # system, you should be using db:schema:load, not running all the migrations 9 | # from scratch. The latter is a flawed and unsustainable approach (the more migrations 10 | # you'll amass, the slower it'll run and the greater likelihood for issues). 11 | # 12 | # It's strongly recommended that you check this file into your version control system. 13 | 14 | ActiveRecord::Schema.define(version: 20131227162259) do 15 | 16 | # These are extensions that must be enabled in order to support this database 17 | enable_extension "plpgsql" 18 | 19 | create_table "pages", force: true do |t| 20 | t.string "url", limit: 2000 21 | t.text "links" 22 | t.integer "count_of_links_to_rg_song_pages" 23 | t.integer "count_of_links_with_rg_format" 24 | t.datetime "created_at" 25 | t.datetime "updated_at" 26 | t.integer "error_code" 27 | t.text "error_message" 28 | t.integer "total_links_to_rg" 29 | t.boolean "fetched", default: false, null: false 30 | t.integer "count_of_links_with_text_ending_in_lyrics" 31 | t.integer "count_of_annotation_links" 32 | t.datetime "locked_at" 33 | t.integer "count_of_link_clumps" 34 | t.integer "largest_link_clump_size" 35 | t.integer "count_of_link_clumps_fuzzy_match" 36 | t.integer "largest_link_clump_size_fuzzy_match" 37 | end 38 | 39 | add_index "pages", ["url"], name: "index_pages_on_url", unique: true, using: :btree 40 | 41 | end 42 | 
-------------------------------------------------------------------------------- /lib/open_uri_scrape.rb: -------------------------------------------------------------------------------- 1 | require 'open-uri' 2 | 3 | class OpenUriScrape 4 | class FakeTyphoeusResponse < Struct.new(:success, :body, :code, :timed_out, :return_message, :total_time_ms) 5 | alias_method :success?, :success 6 | alias_method :timed_out?, :timed_out 7 | 8 | def total_time 9 | (total_time_ms || 0) / 1000.0 10 | end 11 | end 12 | 13 | attr_reader :batch_size, :queue, :processed, :concurrency 14 | 15 | def initialize(batch_size) 16 | @batch_size = batch_size 17 | @queue, @processed, = Queue.new, Queue.new 18 | @concurrency = ENV.fetch('HTTP_CONCURRENCY', 200).to_i 19 | end 20 | 21 | def scrape_batch 22 | pages = Page.reserve_batch_for_scraping(batch_size) 23 | pages.each { |p| queue << p } 24 | 25 | concurrency.times do 26 | Thread.new do 27 | until queue.empty? 28 | page = queue.pop 29 | processed << [page, fetch_response(page.url)] 30 | end 31 | end 32 | end 33 | 34 | pages.count.times do 35 | page, response = processed.pop 36 | 37 | page.scraped!(response) 38 | yield(response) if block_given? 39 | end 40 | rescue GracefulShutdown 41 | queue.clear 42 | pages.each { |p| p.unlock! if p.locked? } 43 | raise 44 | rescue => e 45 | Rails.logger.error([e.message] + e.backtrace) 46 | NotificationMailer.notify_error(e.message).deliver! if NotificationMailer.configured? 
47 | 48 | raise e 49 | end 50 | 51 | private 52 | 53 | def fetch_response(url) 54 | result, error = nil, nil 55 | 56 | total_time = Benchmark.ms do 57 | begin 58 | Timeout.timeout(ENV.fetch('HTTP_TIMEOUT', 20).to_i) do 59 | result = open(url) 60 | end 61 | rescue => e 62 | error = e 63 | end 64 | end 65 | 66 | raise error if error 67 | 68 | FakeTyphoeusResponse.new(true, result.read, result.status.first.to_i, false, '', total_time) 69 | rescue Timeout::Error => e 70 | FakeTyphoeusResponse.new(false, nil, 0, true, nil, total_time) 71 | rescue OpenURI::HTTPError => e 72 | FakeTyphoeusResponse.new(false, nil, e.message.split.first.to_i, false, e.message, total_time) 73 | rescue => e 74 | FakeTyphoeusResponse.new(false, nil, 0, false, e.inspect, total_time) 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /lib/page_parser.rb: -------------------------------------------------------------------------------- 1 | class PageParser 2 | def initialize(body) 3 | @body = body 4 | end 5 | 6 | def parse_and_find_rg_links 7 | aggregates = {count_of_links_to_rg_song_pages: 0, 8 | count_of_links_with_text_ending_in_lyrics: 0, 9 | count_of_links_with_rg_format: 0, 10 | count_of_annotation_links: 0} 11 | links_to_rg = {} 12 | 13 | doc = Nokogiri::HTML.fragment(body) 14 | doc.css('a').each do |link| 15 | parsed = ParsedLink.parse(link) 16 | next unless parsed.rg_link? 17 | 18 | links_to_rg[parsed.inner_text] = parsed.href 19 | aggregates[:count_of_links_to_rg_song_pages] += 1 if parsed.song_page_link? 20 | aggregates[:count_of_links_with_text_ending_in_lyrics] += 1 if parsed.lyrics_text_link? 21 | aggregates[:count_of_links_with_rg_format] += 1 if parsed.rg_text_format? 22 | aggregates[:count_of_annotation_links] += 1 if parsed.annotation_link? 
23 | end 24 | 25 | {links: links_to_rg, 26 | total_links_to_rg: links_to_rg.count}.merge(aggregates).merge(identify_link_clumps) 27 | end 28 | 29 | private 30 | attr_reader :body 31 | 32 | def identify_link_clumps 33 | link_clumps_by_rg_text_format = identify_link_clumps_with(:rg_text_format?) 34 | link_clumps_fuzzy_match = identify_link_clumps_with(:lyrics_text_link?) 35 | 36 | {count_of_link_clumps: link_clumps_by_rg_text_format[:count], 37 | largest_link_clump_size: link_clumps_by_rg_text_format[:largest], 38 | count_of_link_clumps_fuzzy_match: link_clumps_fuzzy_match[:count], 39 | largest_link_clump_size_fuzzy_match: link_clumps_fuzzy_match[:largest]} 40 | end 41 | 42 | def adjacent_rg_text_format_link(link, method) 43 | return unless next_link = next_anchor_sibling(link) 44 | next_link['data-seen-already'] = true 45 | next_link if ParsedLink.parse(next_link).try(method) 46 | end 47 | 48 | def identify_link_clumps_with(method) 49 | largest_clump_size, current_clump_size, number_of_clumps = 0, 0, 0 50 | 51 | doc = Nokogiri::HTML.fragment(body) 52 | 53 | while link = doc.css('a:not([data-seen-already])').first 54 | link['data-seen-already'] = 'true' 55 | 56 | parsed = ParsedLink.parse(link) 57 | next unless parsed.try(method) 58 | 59 | current_clump_size = 1 60 | 61 | current_link = link 62 | while current_link = adjacent_rg_text_format_link(current_link, method) 63 | current_clump_size += 1 64 | end 65 | 66 | if current_clump_size > 1 67 | number_of_clumps += 1 68 | largest_clump_size = current_clump_size if current_clump_size > largest_clump_size 69 | end 70 | end 71 | 72 | {count: number_of_clumps, largest: largest_clump_size} 73 | end 74 | 75 | def next_anchor_sibling(node) 76 | sibling = node.next_sibling 77 | sibling = sibling.next_sibling while sibling.try(:name).to_s == 'br' 78 | 79 | sibling if sibling.try(:name) == 'a' 80 | end 81 | end 82 | -------------------------------------------------------------------------------- 
/config/environments/production.rb: -------------------------------------------------------------------------------- 1 | TrackbackScraper::Application.configure do 2 | # Settings specified here will take precedence over those in config/application.rb. 3 | 4 | # Code is not reloaded between requests. 5 | config.cache_classes = true 6 | 7 | # Eager load code on boot. This eager loads most of Rails and 8 | # your application in memory, allowing both thread web servers 9 | # and those relying on copy on write to perform better. 10 | # Rake tasks automatically ignore this option for performance. 11 | config.eager_load = true 12 | 13 | # Full error reports are disabled and caching is turned on. 14 | config.consider_all_requests_local = false 15 | config.action_controller.perform_caching = true 16 | 17 | # Enable Rack::Cache to put a simple HTTP cache in front of your application 18 | # Add `rack-cache` to your Gemfile before enabling this. 19 | # For large-scale production use, consider using a caching reverse proxy like nginx, varnish or squid. 20 | # config.action_dispatch.rack_cache = true 21 | 22 | # Disable Rails's static asset server (Apache or nginx will already do this). 23 | config.serve_static_assets = false 24 | 25 | # Compress JavaScripts and CSS. 26 | config.assets.js_compressor = :uglifier 27 | # config.assets.css_compressor = :sass 28 | 29 | # Do not fallback to assets pipeline if a precompiled asset is missed. 30 | config.assets.compile = false 31 | 32 | # Generate digests for assets URLs. 33 | config.assets.digest = true 34 | 35 | # Version of your assets, change this if you want to expire all your assets. 36 | config.assets.version = '1.0' 37 | 38 | # Specifies the header that your server uses for sending files. 
39 | # config.action_dispatch.x_sendfile_header = "X-Sendfile" # for apache 40 | # config.action_dispatch.x_sendfile_header = 'X-Accel-Redirect' # for nginx 41 | 42 | # Force all access to the app over SSL, use Strict-Transport-Security, and use secure cookies. 43 | # config.force_ssl = true 44 | 45 | # Set to :debug to see everything in the log. 46 | config.log_level = :info 47 | 48 | # Prepend all log lines with the following tags. 49 | # config.log_tags = [ :subdomain, :uuid ] 50 | 51 | # Use a different logger for distributed setups. 52 | # config.logger = ActiveSupport::TaggedLogging.new(SyslogLogger.new) 53 | 54 | # Use a different cache store in production. 55 | # config.cache_store = :mem_cache_store 56 | 57 | # Enable serving of images, stylesheets, and JavaScripts from an asset server. 58 | # config.action_controller.asset_host = "http://assets.example.com" 59 | 60 | # Precompile additional assets. 61 | # application.js, application.css, and all non-JS/CSS in app/assets folder are already added. 62 | # config.assets.precompile += %w( search.js ) 63 | 64 | # Ignore bad email addresses and do not raise email delivery errors. 65 | # Set this to true and configure the email server for immediate delivery to raise delivery errors. 66 | # config.action_mailer.raise_delivery_errors = false 67 | 68 | # Enable locale fallbacks for I18n (makes lookups for any locale fall back to 69 | # the I18n.default_locale when a translation can not be found). 70 | config.i18n.fallbacks = true 71 | 72 | # Send deprecation notices to registered listeners. 73 | config.active_support.deprecation = :notify 74 | 75 | # Disable automatic flushing of the log to improve performance. 76 | # config.autoflush_log = false 77 | 78 | # Use default logging formatter so that PID and timestamp are not suppressed. 
79 | config.log_formatter = ::Logger::Formatter.new 80 | end 81 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | actionmailer (4.0.2) 5 | actionpack (= 4.0.2) 6 | mail (~> 2.5.4) 7 | actionpack (4.0.2) 8 | activesupport (= 4.0.2) 9 | builder (~> 3.1.0) 10 | erubis (~> 2.7.0) 11 | rack (~> 1.5.2) 12 | rack-test (~> 0.6.2) 13 | activemodel (4.0.2) 14 | activesupport (= 4.0.2) 15 | builder (~> 3.1.0) 16 | activerecord (4.0.2) 17 | activemodel (= 4.0.2) 18 | activerecord-deprecated_finders (~> 1.0.2) 19 | activesupport (= 4.0.2) 20 | arel (~> 4.0.0) 21 | activerecord-deprecated_finders (1.0.3) 22 | activesupport (4.0.2) 23 | i18n (~> 0.6, >= 0.6.4) 24 | minitest (~> 4.2) 25 | multi_json (~> 1.3) 26 | thread_safe (~> 0.1) 27 | tzinfo (~> 0.3.37) 28 | addressable (2.3.5) 29 | arel (4.0.1) 30 | atomic (1.1.14) 31 | builder (3.1.4) 32 | coderay (1.1.0) 33 | coffee-rails (4.0.1) 34 | coffee-script (>= 2.2.0) 35 | railties (>= 4.0.0, < 5.0) 36 | coffee-script (2.2.0) 37 | coffee-script-source 38 | execjs 39 | coffee-script-source (1.6.3) 40 | decent_exposure (2.3.0) 41 | decent_generators (0.0.1) 42 | rails (~> 4.0.0) 43 | diff-lcs (1.2.5) 44 | dotenv (0.9.0) 45 | dotenv-rails (0.9.0) 46 | dotenv (= 0.9.0) 47 | erubis (2.7.0) 48 | ethon (0.6.2) 49 | ffi (>= 1.3.0) 50 | mime-types (~> 1.18) 51 | execjs (2.0.2) 52 | factory_girl (4.3.0) 53 | activesupport (>= 3.0.0) 54 | ffi (1.9.3) 55 | fivemat (1.2.1) 56 | haml (4.0.4) 57 | tilt 58 | haml-rails (0.5.3) 59 | actionpack (>= 4.0.1) 60 | activesupport (>= 4.0.1) 61 | haml (>= 3.1, < 5.0) 62 | railties (>= 4.0.1) 63 | hike (1.2.3) 64 | i18n (0.6.9) 65 | jbuilder (1.5.3) 66 | activesupport (>= 3.0.0) 67 | multi_json (>= 1.2.0) 68 | jquery-rails (3.0.4) 69 | railties (>= 3.0, < 5.0) 70 | thor (>= 0.14, < 2.0) 71 | json (1.8.1) 72 | 
librato-logreporter (0.2.1) 73 | mail (2.5.4) 74 | mime-types (~> 1.16) 75 | treetop (~> 1.4.8) 76 | method_source (0.8.2) 77 | mime-types (1.25.1) 78 | mini_portile (0.5.2) 79 | minitest (4.7.5) 80 | multi_json (1.8.2) 81 | nokogiri (1.6.1) 82 | mini_portile (~> 0.5.0) 83 | pg (0.17.1) 84 | polyglot (0.3.3) 85 | pry (0.9.12.4) 86 | coderay (~> 1.0) 87 | method_source (~> 0.8) 88 | slop (~> 3.4) 89 | pry-rails (0.3.2) 90 | pry (>= 0.9.10) 91 | rack (1.5.2) 92 | rack-test (0.6.2) 93 | rack (>= 1.0) 94 | rails (4.0.2) 95 | actionmailer (= 4.0.2) 96 | actionpack (= 4.0.2) 97 | activerecord (= 4.0.2) 98 | activesupport (= 4.0.2) 99 | bundler (>= 1.3.0, < 2.0) 100 | railties (= 4.0.2) 101 | sprockets-rails (~> 2.0.0) 102 | railties (4.0.2) 103 | actionpack (= 4.0.2) 104 | activesupport (= 4.0.2) 105 | rake (>= 0.8.7) 106 | thor (>= 0.18.1, < 2.0) 107 | rake (10.1.1) 108 | rspec (2.14.1) 109 | rspec-core (~> 2.14.0) 110 | rspec-expectations (~> 2.14.0) 111 | rspec-mocks (~> 2.14.0) 112 | rspec-core (2.14.7) 113 | rspec-expectations (2.14.4) 114 | diff-lcs (>= 1.1.3, < 2.0) 115 | rspec-mocks (2.14.4) 116 | rspec-rails (2.14.0) 117 | actionpack (>= 3.0) 118 | activesupport (>= 3.0) 119 | railties (>= 3.0) 120 | rspec-core (~> 2.14.0) 121 | rspec-expectations (~> 2.14.0) 122 | rspec-mocks (~> 2.14.0) 123 | sass (3.2.13) 124 | sass-rails (4.0.1) 125 | railties (>= 4.0.0, < 5.0) 126 | sass (>= 3.1.10) 127 | sprockets-rails (~> 2.0.0) 128 | shoulda-matchers (2.4.0) 129 | activesupport (>= 3.0.0) 130 | slop (3.4.7) 131 | sprockets (2.10.1) 132 | hike (~> 1.2) 133 | multi_json (~> 1.0) 134 | rack (~> 1.0) 135 | tilt (~> 1.1, != 1.3.0) 136 | sprockets-rails (2.0.1) 137 | actionpack (>= 3.0) 138 | activesupport (>= 3.0) 139 | sprockets (~> 2.8) 140 | stringex (2.1.2) 141 | thor (0.18.1) 142 | thread_safe (0.1.3) 143 | atomic 144 | tilt (1.4.1) 145 | treetop (1.4.15) 146 | polyglot 147 | polyglot (>= 0.3.1) 148 | turbolinks (2.1.0) 149 | coffee-rails 150 | twitter-bootstrap-rails 
(2.2.8) 151 | actionpack (>= 3.1) 152 | execjs 153 | rails (>= 3.1) 154 | railties (>= 3.1) 155 | typhoeus (0.6.7) 156 | ethon (~> 0.6.2) 157 | tzinfo (0.3.38) 158 | uglifier (2.4.0) 159 | execjs (>= 0.3.0) 160 | json (>= 1.8.0) 161 | whois (3.4.2) 162 | 163 | PLATFORMS 164 | ruby 165 | 166 | DEPENDENCIES 167 | addressable 168 | coffee-rails (~> 4.0.0) 169 | decent_exposure 170 | decent_generators 171 | dotenv-rails 172 | factory_girl 173 | fivemat 174 | haml 175 | haml-rails 176 | jbuilder (~> 1.2) 177 | jquery-rails 178 | librato-logreporter 179 | nokogiri 180 | pg 181 | pry 182 | pry-rails 183 | rails 184 | rspec 185 | rspec-rails 186 | sass-rails (~> 4.0.0) 187 | shoulda-matchers 188 | stringex 189 | turbolinks 190 | twitter-bootstrap-rails 191 | typhoeus 192 | uglifier (>= 1.3.0) 193 | whois 194 | -------------------------------------------------------------------------------- /app/models/page.rb: -------------------------------------------------------------------------------- 1 | class Page < ActiveRecord::Base 2 | scope :locked, -> { where("locked_at IS NOT NULL") } 3 | scope :not_locked, -> { where(locked_at: nil) } 4 | def locked? 5 | locked_at? 6 | end 7 | 8 | scope :unscraped, -> { where(fetched: false).not_locked } 9 | scope :scraped, -> { where(fetched: true) } 10 | scope :errored, -> { scraped.where('error_code IS NOT NULL OR error_message IS NOT NULL') } 11 | def errored? 12 | error_code.present? || error_message.present? 13 | end 14 | 15 | scope :timeout, -> { scraped.where("error_message = 'timeout'") } 16 | 17 | scope :not_errored, -> { scraped.where(error_code: nil, error_message: nil) } 18 | 19 | serialize :links, Hash 20 | 21 | def self.forget_everything! 
22 | update_all(reset_attributes.merge(updated_at: Time.now)) 23 | end 24 | 25 | def self.reset_attributes 26 | { 27 | fetched: false, 28 | locked_at: nil, 29 | links: nil, 30 | total_links_to_rg: nil, 31 | count_of_links_to_rg_song_pages: nil, 32 | count_of_links_with_rg_format: nil, 33 | count_of_annotation_links: nil, 34 | count_of_links_with_text_ending_in_lyrics: nil, 35 | error_code: nil, 36 | error_message: nil, 37 | count_of_link_clumps_fuzzy_match: nil, 38 | largest_link_clump_size_fuzzy_match: nil, 39 | count_of_link_clumps: nil, 40 | largest_link_clump_size: nil 41 | } 42 | end 43 | delegate :reset_attributes, to: 'self.class' 44 | 45 | def self.error_code_summary 46 | scraped.where('error_code IS NOT NULL').pluck(:error_code).each_with_object(Hash.new(0)) do |code, h| 47 | h[code] += 1 48 | end 49 | end 50 | 51 | def self.reserve_batch_for_scraping(limit) 52 | pages_subquery = unscraped.limit(limit).order(:id).select(:id).lock(true).to_sql 53 | db_time_now = Time.now.utc 54 | 55 | find_by_sql [<<-SQL, db_time_now, db_time_now] 56 | UPDATE pages SET locked_at = ?, updated_at = ? 57 | WHERE id IN (#{pages_subquery}) 58 | RETURNING * 59 | SQL 60 | end 61 | 62 | def self.hydra 63 | @hydra ||= Typhoeus::Hydra.new(max_concurrency: ENV.fetch('HTTP_CONCURRENCY', 200).to_i) 64 | end 65 | delegate :hydra, to: 'self.class' 66 | 67 | def self.scrape_batch(batch_size) 68 | pages = reserve_batch_for_scraping(batch_size) 69 | 70 | pages.each do |page| 71 | hydra.queue(request = page.new_request) 72 | 73 | request.on_complete do |response| 74 | page.scraped!(response) 75 | 76 | yield(response) if block_given? 77 | end 78 | end 79 | 80 | hydra.run 81 | rescue GracefulShutdown 82 | hydra.abort 83 | pages.each { |p| p.unlock! if p.locked? } 84 | raise 85 | rescue => e 86 | Rails.logger.error([e.message] + e.backtrace) 87 | NotificationMailer.notify_error(e.message).deliver! if NotificationMailer.configured? 
88 | 89 | raise e 90 | end 91 | 92 | def self.scrape_batch_with_open_uri(batch_size) 93 | OpenUriScrape.new(batch_size).scrape_batch 94 | end 95 | 96 | CSV_COLUMNS = %w(url count_of_link_clumps count_of_link_clumps_fuzzy_match count_of_links_with_rg_format count_of_links_with_text_ending_in_lyrics count_of_links_to_rg_song_pages count_of_annotation_links total_links_to_rg largest_link_clump_size largest_link_clump_size_fuzzy_match) 97 | def self.write_report!(file_path = Rails.root.join('tmp/report.csv')) 98 | File.open(file_path, 'wb') do |file| 99 | file.write(generate_report) 100 | end 101 | end 102 | 103 | def self.generate_report(limit = nil) 104 | CSV.generate do |csv| 105 | csv << CSV_COLUMNS 106 | 107 | not_errored.limit(limit).order((CSV_COLUMNS - ['url']).map { |c| "#{c} desc" }.join(', ')).each do |page| 108 | csv << CSV_COLUMNS.map { |c| page.__send__(c) } 109 | end 110 | end 111 | end 112 | 113 | def self.send_email_report 114 | raise unless NotificationMailer.configured? 115 | 116 | NotificationMailer.report(generate_report).deliver! 117 | end 118 | 119 | def self.send_abbreviated_report(limit = 20_000) 120 | raise unless NotificationMailer.configured? 121 | 122 | NotificationMailer.report(generate_report(limit), subject: "Abbreviated report").deliver! 123 | end 124 | 125 | def scraped!(response, raise_errors = false) 126 | Librato.measure('scrape.request.time', response.total_time * 1000) unless response.total_time.zero? 127 | 128 | return handle_scrape_error(response) unless response.success? 
129 | 130 | begin 131 | scraped_attributes = parse_and_find_rg_links(response.body) 132 | rescue => e 133 | Librato.increment('scrape.error') 134 | tap(&:mark_fetched).update_attributes!(error_message: { :exception => e }.to_yaml) 135 | raise e if raise_errors 136 | else 137 | Librato.increment('scrape.success') 138 | 139 | tap(&:mark_fetched).update_attributes!(scraped_attributes.merge!(error_code: nil, error_message: nil)) 140 | end 141 | end 142 | 143 | def handle_scrape_error(response) 144 | Librato.increment('scrape.error') 145 | 146 | mark_fetched 147 | 148 | if response.timed_out? 149 | Librato.increment('scrape.error.timeout') 150 | update_attributes!(error_message: "timeout") 151 | elsif response.code == 0 152 | Librato.increment('scrape.error.unknown') 153 | update_attributes!(error_message: response.return_message) 154 | else 155 | Librato.increment('scrape.error.http') 156 | update_attributes!(error_message: nil, error_code: response.code) 157 | end 158 | end 159 | 160 | def scrape! 161 | hydra.queue(request = new_request) 162 | request.on_complete { |r| scraped!(r, :raise_errors) } 163 | hydra.run 164 | end 165 | 166 | def new_request 167 | Typhoeus::Request.new(url, followlocation: true, timeout: ENV.fetch('HTTP_TIMEOUT', 20).to_i, headers: request_headers) 168 | end 169 | 170 | def request_headers 171 | { 'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36', 172 | 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9', 173 | 'Cache-Control' => 'max-age=0', 'Accept-Language' => 'en-US,en;q=0.8'} 174 | end 175 | 176 | def mark_fetched 177 | self.attributes = reset_attributes 178 | 179 | self.fetched = true 180 | self.locked_at = nil 181 | end 182 | 183 | def unlock! 184 | self.locked_at = nil 185 | save! 
186 | end 187 | 188 | private 189 | 190 | def parse_and_find_rg_links(body) 191 | Librato.measure('parse_and_find_rg_links') do 192 | PageParser.new(body).parse_and_find_rg_links 193 | end 194 | end 195 | end 196 | -------------------------------------------------------------------------------- /vendor/urls.txt: -------------------------------------------------------------------------------- 1 | http://www.huffingtonpost.com/2013/09/10/kanye-west-ray-j-hit-it-first_n_3900572.html 2 | http://www.huffingtonpost.com/2013/10/14/eminem-rap-god-debuts-marshall-mathers-lp-2_n_4098815.html 3 | http://www.huffingtonpost.com/2013/09/16/rapper-common-chicago_n_3922182.html 4 | http://www.refinedhype.com/hyped/entry/no-sense-lil-wayne-dedication-5 5 | http://thefreshheir.com/2013/11/26/new-music-sxmplelife-lloyd-banks-ft-50-cent-on-fire-bootleg/ 6 | http://thefreshheir.com/2013/12/03/video-tupac-shakur-on-life-and-death-animated-interview-1994/ 7 | http://www.huffingtonpost.com/2013/10/29/kanye-west-interview_n_4175351.html 8 | http://www.mostlyjunkfood.com/the-many-nicknames-of-tyler-the-creator/ 9 | http://www.huffingtonpost.com/2013/12/06/kanye-kid-china-photo_n_4399924.html 10 | http://www.mostlyjunkfood.com/i-think-im-turning-japanese-raps-infatuation-with-benihana/ 11 | http://thefreshheir.com/2013/11/26/rare-photo-of-kanye-west-as-a-child-surfaces/ 12 | http://www.mostlyjunkfood.com/justin-timberlakes-the-2020-experience-album-art-tracklist/ 13 | http://thefreshheir.com/2013/11/26/video-ne-hip-hop-interviews-natural/ 14 | http://www.vibe.com/article/new-music-casey-veggies-and-rockie-fresh-aladdin 15 | http://www.refinedhype.com/hyped/entry/classic-hate-juicy 16 | http://www.refinedhype.com/hyped/entry/kanye-quote-post 17 | http://www.mostlyjunkfood.com/review-drake-nothing/ 18 | http://www.huffingtonpost.com/2013/11/25/childish-gambino-sweatpants_n_4338141.html 19 | 
http://www.mostlyjunkfood.com/stream-commons-the-dreamer-the-believer-album-before-it-hits-stores-december-20th/ 20 | http://www.huffingtonpost.com/2013/10/20/kanye-west-yeezus-tour-seattle-kendrick-lamar_n_4132372.html 21 | http://www.huffingtonpost.com/2013/10/21/arcade-fire-afterlife-video_n_4139415.html 22 | http://www.mostlyjunkfood.com/danny-brown-xxx-review/ 23 | http://www.huffingtonpost.com/2013/12/20/lamborghini-huracan--lp-610-4-sports-car_n_4480302.html?utm 24 | http://thefreshheir.com/2013/11/28/new-music-childish-gambino-telegraph-ave/ 25 | http://www.huffingtonpost.com/2013/03/29/8-things-women-need-to-bitches-to-do-books_n_2979633.html 26 | http://www.mostlyjunkfood.com/lists-suck-10-cosby-show-references-in-rap-songs/ 27 | http://www.huffingtonpost.com/2013/12/12/kanye-west-returns-to-south-park_n_4433298.html 28 | http://www.huffingtonpost.com/2013/06/24/kanye-west-i-am-a-god-fashion-week-diss_n_3490688.html 29 | http://www.refinedhype.com/hyped/entry/fotw-drake-gets-emotional-chinese 30 | http://www.mostlyjunkfood.com/pointcounterpoint-yellow-album-dom-kennedy-review/ 31 | http://www.refinedhype.com/hyped/entry/craziest-lines-2-chainz-boats-2 32 | http://www.vibe.com/article/new-video-awkword-throw-away-key 33 | http://www.huffingtonpost.com/2013/11/27/eminem-rap-god-video_n_4351130.html 34 | http://www.refinedhype.com/hyped/entry/vagina-rap 35 | http://www.huffingtonpost.com/2013/11/04/ben-stiller-something-about-mary-hair-gel_n_4212720.html 36 | http://www.mostlyjunkfood.com/listen-kendrick-lamar-featuring-dr-dre-the-recipe/ 37 | http://www.mostlyjunkfood.com/jay-electronica-call-of-duty-mw3-ft-mobb-deep/ 38 | http://www.vibe.com/article/watch-t-pain-raps-without-auto-tune-name-drops-hello-kitty-work-video 39 | http://www.refinedhype.com/hyped/entry/da-real-lambo-lebrons-mom 40 | http://www.mostlyjunkfood.com/frank-ocean-acura-integurl-video/ 41 | http://www.mostlyjunkfood.com/review-mellowhype-numbers/ 42 | 
http://www.mostlyjunkfood.com/mp3-flux-pavilion-do-or-die-ft-childish-gambino/ 43 | http://thefreshheir.com/2013/12/03/cant-tell-me-shit-remix/ 44 | http://www.mostlyjunkfood.com/mp3-heems-killing-time/ 45 | http://www.refinedhype.com/hyped/entry/fotw-paris-hilton-weezy 46 | http://www.huffingtonpost.com/2013/10/27/paris-hilton-miley-cyrus-halloween_n_4167824.html 47 | http://www.vibe.com/article/review-lupe-fiascos-peace-papercup-jayzus-proves-hes-best-lyricist-alive 48 | http://www.mostlyjunkfood.com/drake-headlines/ 49 | http://www.vibe.com/article/eminem-rap-god-lyrics 50 | http://www.huffingtonpost.com/2013/11/21/law-and-order-svu-line_n_4316568.html 51 | http://www.mostlyjunkfood.com/return-of-the-mac-top-10-steve-jobs-inspired-rap-lyrics/ 52 | http://www.vibe.com/article/lecrae-talks-gravity-album-bridging-gaps-and-jeremy-lin 53 | http://www.mostlyjunkfood.com/the-curious-case-of-sad-kanye-the-grey-sweatshirt/ 54 | http://www.vibe.com/article/rick-ross-says-hes-good-standing-reebok 55 | http://thefreshheir.com/2013/11/26/video-napoleon-lv-dreamcatcher/ 56 | http://www.refinedhype.com/hyped/entry/bite-or-not-jay-z-biggie/ 57 | http://www.mostlyjunkfood.com/lil-b-calls-the-game-irrelevant-video/ 58 | http://www.refinedhype.com/hyped/entry/jay-z-pound-cake-wack 59 | http://www.vibe.com/article/drakes-sophomore-album-too-emo-men 60 | http://www.vibe.com/article/opinion-what-does-kendrick-lamar-vs-drake-mean-j-cole 61 | http://www.huffingtonpost.com/2013/12/16/2-chainz-u-da-realest-video_n_4452612.html 62 | http://thefreshheir.com/2013/12/04/new-music-love-mansuy-white/ 63 | http://www.refinedhype.com/hyped/entry/migos-brokeanese 64 | http://www.huffingtonpost.com/2013/12/09/ryan-seacrest-vh1-docu-series-white-female-rappers_n_4413515.html 65 | http://www.mostlyjunkfood.com/drake-free-spirit-ft-rick-ross/ 66 | http://www.mostlyjunkfood.com/soul-khan-speeding-bullets-video/ 67 | 
http://www.mostlyjunkfood.com/the-outlawz-actually-smoked-tupacs-ashes-in-a-blunt-video/ 68 | http://thefreshheir.com/2013/11/28/video-chance-the-rapper-performs-new-song-in-chicago/ 69 | http://www.huffingtonpost.com/2013/03/28/rick-ross-uoeno-lyrics-rapper-responds_n_2974891.html 70 | http://www.mostlyjunkfood.com/tyler-the-creator-goblin/ 71 | http://www.mostlyjunkfood.com/the-roots-%e2%80%93-make-my-ft-big-k-r-i-t/ 72 | http://www.huffingtonpost.com/2013/12/10/ariana-grande-santa-baby-liz-gillies_n_4417929.html 73 | http://www.mostlyjunkfood.com/odd-future-at-hollywood-palladium-93011/ 74 | http://www.huffingtonpost.com/2013/12/09/justin-bieber-confident-music-mondays-journals_n_4411851.html 75 | http://www.refinedhype.com/hyped/entry/lil-dicky-white-dude 76 | http://www.mostlyjunkfood.com/kendrick-lamar-rigamortus-ft-busta-rhymes-remix/ 77 | http://www.mostlyjunkfood.com/five-music-tips-for-unsigned-artists/ 78 | http://www.refinedhype.com/hyped/entry/rap-stats-rap-nerd-orgasm 79 | http://thefreshheir.com/2013/11/20/video-martin-ky-polaroid/ 80 | http://www.huffingtonpost.com/2013/10/10/kanye-west-gone-billboard-top-20-eight-years_n_4076965.html 81 | http://www.refinedhype.com/hyped/entry/lupe-fiasco-put-em-down 82 | http://thefreshheir.com/2013/11/26/video-wara-from-the-nbhd-98-rocafella/ 83 | http://www.huffingtonpost.com/2012/11/15/robert-deniro-jay-z-actor-scolds-rapper_n_2137459.html 84 | http://www.huffingtonpost.com/2013/02/21/emmett-tills-family-responds-lil-wayne-lyric-open-letter_n_2733765.html 85 | http://www.huffingtonpost.com/2013/12/20/lamborghini-huracan--lp-610-4-sports-car_n_4480302.html 86 | http://www.mostlyjunkfood.com/outkast-to-reunite-on-remixes-for-frank-oceans-pink-matter-t-i-s-sorry/ 87 | http://www.mostlyjunkfood.com/ofwgkta-odd-future-tape-vol-2-concert-review/ 88 | http://www.huffingtonpost.com/2013/08/13/kendrick-lamar-control-big-sean_n_3748466.html 89 | 
http://www.huffingtonpost.com/2013/10/20/mase-now-we-even-rappers-first-album-in-years_n_4131795.html 90 | http://www.huffingtonpost.com/2013/11/19/kanye-west-gigli-moment_n_4303434.html 91 | http://www.refinedhype.com/hyped/entry/breaking-down-jay-z-100-bill-flow 92 | http://www.huffingtonpost.com/2013/10/27/lorde-royals-youngest-artist-british-singles-chart_n_4168113.html 93 | http://www.mostlyjunkfood.com/shotsfired-nothing-sames-hypothetical-hypothetical-subliminals/ 94 | http://www.huffingtonpost.com/2013/09/15/drake-jay-z-pound-cake-paris-morton-music_n_3932410.html 95 | http://www.refinedhype.com/hyped/entry/j-coles-control-response 96 | http://www.vibe.com/article/take-look-adolescent-kanye-west-china-photo 97 | http://www.huffingtonpost.com/2013/06/29/kanye-west-black-skinhead_n_3522146.html 98 | http://www.refinedhype.com/hyped/entry/nmpa-rapgenius 99 | http://thefreshheir.com/2013/11/25/new-music-dom-mclennon-wheredidmymindgo-demo/ 100 | http://thefreshheir.com/2013/11/28/video-young-thug-some-more/ 101 | --------------------------------------------------------------------------------