├── .drone.yml ├── .env.test ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .ruby-version ├── Gemfile ├── Gemfile.lock ├── LICENSE.md ├── Procfile ├── README.md ├── Rakefile ├── bin ├── console ├── pigo_darwin_arm64 ├── pigo_darwin_x86_64 ├── pigo_linux_x86_64 └── rake ├── lib ├── crawler │ ├── image │ │ ├── cache.rb │ │ ├── cascade │ │ │ └── facefinder │ │ ├── download.rb │ │ ├── download │ │ │ ├── default.rb │ │ │ ├── instagram.rb │ │ │ ├── vimeo.rb │ │ │ └── youtube.rb │ │ ├── download_cache.rb │ │ ├── helpers.rb │ │ ├── image_processor.rb │ │ ├── initializers │ │ │ ├── constants.rb │ │ │ ├── down.rb │ │ │ ├── librato.rb │ │ │ ├── sidekiq.rb │ │ │ ├── storage.rb │ │ │ └── worker_stat.rb │ │ ├── jobs │ │ │ ├── find_image.rb │ │ │ ├── process_image.rb │ │ │ └── upload_image.rb │ │ ├── meta_images.rb │ │ ├── meta_images_cache.rb │ │ └── timer.rb │ └── refresher │ │ ├── cache.rb │ │ ├── feed.rb │ │ ├── feed_status.rb │ │ ├── http_cache.rb │ │ ├── initializers │ │ ├── redis.rb │ │ └── sidekiq.rb │ │ ├── jobs │ │ ├── feed_downloader.rb │ │ └── feed_status_update.rb │ │ ├── redirect_cache.rb │ │ └── throttle.rb ├── image.rb └── refresher.rb └── test ├── cache_test.rb ├── download ├── default.rb ├── instagram.rb ├── vimeo.rb └── youtube.rb ├── download_cache_test.rb ├── download_test.rb ├── feed_downloader_test.rb ├── feed_status_test.rb ├── feed_status_update_test.rb ├── feed_test.rb ├── image_test.rb ├── jobs ├── find_image_test.rb ├── process_image_test.rb └── upload_image_test.rb ├── meta_images_cache_test.rb ├── meta_images_test.rb ├── redirect_cache_test.rb ├── redirect_test.rb ├── support └── www │ ├── atom.xml │ ├── feed.json │ ├── html.html │ └── image.jpeg ├── test_helper.rb ├── throttle_test.rb └── timer_test.rb /.drone.yml: -------------------------------------------------------------------------------- 1 | kind: pipeline 2 | name: default 3 | 4 | steps: 5 | - name: test 6 | image: ubuntu:16.04 7 | commands: 8 | - apt-get update 9 | - apt-get install -y software-properties-common 10 | - apt-add-repository ppa:brightbox/ruby-ng 11 | - apt update 12 | - apt-get install -y ruby2.5 ruby2.5-dev build-essential curl git libidn11-dev libpq-dev libreadline-dev libxml2-dev libxslt1-dev libcurl4-openssl-dev libssl-dev zlib1g-dev libffi-dev redis-server 13 | - systemctl restart redis-server.service 14 | - gem install bundler -v "1.16.5" 15 | - bundle install --jobs=8 --retry=2 16 | - rake 17 | -------------------------------------------------------------------------------- /.env.test: -------------------------------------------------------------------------------- 1 | AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID 2 | AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY 3 | AWS_S3_BUCKET=s3-bucket 4 | FACEBOOK_ACCESS_TOKEN=FACEBOOK_ACCESS_TOKEN 5 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | os: [ubuntu-22.04] 11 | ruby: [3.1] 12 | 13 | runs-on: ${{ matrix.os }} 14 | 15 | services: 16 | redis: 17 | image: redis 18 | options: >- 19 | --health-cmd "redis-cli ping" 20 | --health-interval 10s 21 | --health-timeout 5s 22 | --health-retries 5 23 | ports: 24 | - 6379:6379 25 | 26 | steps: 27 | - uses: actions/checkout@v2 28 | 29 | - name: Install dependencies 30 | run: sudo apt-get --yes install libidn11-dev libvips 31 | 32 | - uses: 
ruby/setup-ruby@v1 33 | with: 34 | ruby-version: ${{ matrix.ruby }} 35 | bundler-cache: true 36 | 37 | - name: Run tests 38 | run: bundle exec rake -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | vendor/bundle 5 | tmp 6 | .env 7 | coverage/ 8 | log/dump.rdb -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | 3.1.2 -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | git_source(:github) { |name| "https://github.com/#{name}.git" } 3 | 4 | gem "down", github: "feedbin/down", branch: "normalize" 5 | gem "unf_ext" 6 | 7 | gem "sax-machine", github: "feedbin/sax-machine", branch: "feedbin" 8 | gem "feedjira", github: "feedbin/feedjira", branch: "f2" 9 | gem "http", github: "feedbin/http", branch: "feedbin" 10 | gem "feedkit", github: "feedbin/feedkit", branch: "master" 11 | 12 | gem "bundler" 13 | gem "addressable" 14 | gem "connection_pool" 15 | gem "dotenv" 16 | gem "fog-aws" 17 | gem "image_processing" 18 | gem "librato-metrics", "~> 1.6.2" 19 | gem "librato-rack" 20 | gem "mime-types" 21 | gem "nokogiri" 22 | gem "rake" 23 | gem "redis" 24 | gem "resolv" 25 | gem "ruby-vips" 26 | gem "sidekiq" 27 | 28 | group :development do 29 | gem "foreman" 30 | gem "standard" 31 | end 32 | 33 | group :test do 34 | gem "minitest" 35 | gem "webmock" 36 | end 37 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GIT 2 | remote: https://github.com/feedbin/down.git 3 | revision: df69f03b65453bbbabc99678c3f6a3a1c4d1a0b2 4 | branch: normalize 5 | specs: 6 | down (5.3.1) 7 | addressable (~> 2.8) 8 | 9 | GIT 10 | remote: https://github.com/feedbin/feedjira.git 11 | revision: da48b21c09604c797854e095504626ac25d0892a 12 | branch: f2 13 | specs: 14 | feedjira (2.1.2) 15 | loofah (>= 2.0) 16 | sax-machine (>= 1.0) 17 | 18 | GIT 19 | remote: https://github.com/feedbin/feedkit.git 20 | revision: 68aa2b996f98c9d61cc4bc69388c10a2431efe47 21 | branch: master 22 | specs: 23 | feedkit (0.1.0) 24 | addressable (~> 2.8.0) 25 | feedjira (~> 2.0) 26 | http (~> 4.4) 27 | rchardet (~> 1.8.0) 28 | twitter (~> 7.0) 29 | twitter-text (~> 3.1.0) 30 | 31 | GIT 32 | remote: https://github.com/feedbin/http.git 33 | revision: 334b230d2033177ebf535e8277e21408cd8fdfcf 34 | branch: feedbin 35 | specs: 36 | http (4.4.1) 37 | addressable (~> 2.3) 38 | http-cookie (~> 1.0) 39 | http-form_data (~> 2.2) 40 | http-parser (~> 1.2.0) 41 | 42 | GIT 43 | remote: https://github.com/feedbin/sax-machine.git 44 | revision: 80c6e3b9eb4ab0ac16a0eeaac13d10a713d10423 45 | branch: feedbin 46 | specs: 47 | sax-machine (1.3.2) 48 | 49 | GEM 50 | remote: https://rubygems.org/ 51 | specs: 52 | addressable (2.8.1) 53 | public_suffix (>= 2.0.2, < 6.0) 54 | aggregate (0.2.3) 55 | ast (2.4.2) 56 | buftok (0.2.0) 57 | builder (3.2.4) 58 | connection_pool (2.2.5) 59 | crack (0.4.5) 60 | rexml 61 | crass (1.0.6) 62 | domain_name (0.5.20190701) 63 | unf (>= 0.0.5, < 1.0.0) 64 | dotenv (2.8.1) 65 | equalizer (0.0.11) 66 | excon (0.92.4) 67 | faraday (1.10.2) 68 | faraday-em_http (~> 1.0) 69 | 
faraday-em_synchrony (~> 1.0) 70 | faraday-excon (~> 1.1) 71 | faraday-httpclient (~> 1.0) 72 | faraday-multipart (~> 1.0) 73 | faraday-net_http (~> 1.0) 74 | faraday-net_http_persistent (~> 1.0) 75 | faraday-patron (~> 1.0) 76 | faraday-rack (~> 1.0) 77 | faraday-retry (~> 1.0) 78 | ruby2_keywords (>= 0.0.4) 79 | faraday-em_http (1.0.0) 80 | faraday-em_synchrony (1.0.0) 81 | faraday-excon (1.1.0) 82 | faraday-httpclient (1.0.1) 83 | faraday-multipart (1.0.4) 84 | multipart-post (~> 2) 85 | faraday-net_http (1.0.1) 86 | faraday-net_http_persistent (1.2.0) 87 | faraday-patron (1.0.0) 88 | faraday-rack (1.0.0) 89 | faraday-retry (1.0.3) 90 | ffi (1.15.5) 91 | ffi-compiler (1.0.1) 92 | ffi (>= 1.0.0) 93 | rake 94 | fog-aws (3.15.0) 95 | fog-core (~> 2.1) 96 | fog-json (~> 1.1) 97 | fog-xml (~> 0.1) 98 | fog-core (2.3.0) 99 | builder 100 | excon (~> 0.71) 101 | formatador (>= 0.2, < 2.0) 102 | mime-types 103 | fog-json (1.2.0) 104 | fog-core 105 | multi_json (~> 1.10) 106 | fog-xml (0.1.4) 107 | fog-core 108 | nokogiri (>= 1.5.11, < 2.0.0) 109 | foreman (0.87.2) 110 | formatador (1.1.0) 111 | hashdiff (1.0.1) 112 | hetchy (1.0.0) 113 | http-cookie (1.0.5) 114 | domain_name (~> 0.5) 115 | http-form_data (2.3.0) 116 | http-parser (1.2.3) 117 | ffi-compiler (>= 1.0, < 2.0) 118 | http_parser.rb (0.6.0) 119 | idn-ruby (0.1.4) 120 | image_processing (1.12.2) 121 | mini_magick (>= 4.9.5, < 5) 122 | ruby-vips (>= 2.0.17, < 3) 123 | json (2.6.2) 124 | librato-metrics (1.6.2) 125 | aggregate (~> 0.2.2) 126 | faraday (>= 0.7, < 2.0) 127 | multi_json 128 | librato-rack (1.1.1) 129 | hetchy (~> 1.0) 130 | librato-metrics (~> 1.6) 131 | loofah (2.18.0) 132 | crass (~> 1.0.2) 133 | nokogiri (>= 1.5.9) 134 | memoizable (0.4.2) 135 | thread_safe (~> 0.3, >= 0.3.1) 136 | mime-types (3.4.1) 137 | mime-types-data (~> 3.2015) 138 | mime-types-data (3.2022.0105) 139 | mini_magick (4.11.0) 140 | mini_portile2 (2.8.0) 141 | minitest (5.16.3) 142 | multi_json (1.15.0) 143 | multipart-post (2.2.3) 144 | naught (1.1.0) 145 | nokogiri (1.13.8) 146 | mini_portile2 (~> 2.8.0) 147 | racc (~> 1.4) 148 | parallel (1.22.1) 149 | parser (3.1.2.1) 150 | ast (~> 2.4.1) 151 | public_suffix (5.0.0) 152 | racc (1.6.0) 153 | rack (2.2.4) 154 | rainbow (3.1.1) 155 | rake (13.0.6) 156 | rchardet (1.8.0) 157 | redis (5.0.4) 158 | redis-client (>= 0.7.4) 159 | redis-client (0.8.0) 160 | connection_pool 161 | regexp_parser (2.5.0) 162 | resolv (0.2.1) 163 | rexml (3.2.5) 164 | rubocop (1.35.1) 165 | json (~> 2.3) 166 | parallel (~> 1.10) 167 | parser (>= 3.1.2.1) 168 | rainbow (>= 2.2.2, < 4.0) 169 | regexp_parser (>= 1.8, < 3.0) 170 | rexml (>= 3.2.5, < 4.0) 171 | rubocop-ast (>= 1.20.1, < 2.0) 172 | ruby-progressbar (~> 1.7) 173 | unicode-display_width (>= 1.4.0, < 3.0) 174 | rubocop-ast (1.21.0) 175 | parser (>= 3.1.1.0) 176 | rubocop-performance (1.14.3) 177 | rubocop (>= 1.7.0, < 2.0) 178 | rubocop-ast (>= 0.4.0) 179 | ruby-progressbar (1.11.0) 180 | ruby-vips (2.1.4) 181 | ffi (~> 1.12) 182 | ruby2_keywords (0.0.5) 183 | sidekiq (6.5.5) 184 | connection_pool (>= 2.2.2) 185 | rack (~> 2.0) 186 | redis (>= 4.5.0) 187 | simple_oauth (0.3.1) 188 | standard (1.16.1) 189 | rubocop (= 1.35.1) 190 | rubocop-performance (= 1.14.3) 191 | thread_safe (0.3.6) 192 | twitter (7.0.0) 193 | addressable (~> 2.3) 194 | buftok (~> 0.2.0) 195 | equalizer (~> 0.0.11) 196 | http (~> 4.0) 197 | http-form_data (~> 2.0) 198 | http_parser.rb (~> 0.6.0) 199 | memoizable (~> 0.4.0) 200 | multipart-post (~> 2.0) 201 | naught (~> 1.0) 202 | simple_oauth (~> 
0.3.0) 203 | twitter-text (3.1.0) 204 | idn-ruby 205 | unf (~> 0.1.0) 206 | unf (0.1.4) 207 | unf_ext 208 | unf_ext (0.0.8.2) 209 | unicode-display_width (2.2.0) 210 | webmock (3.18.1) 211 | addressable (>= 2.8.0) 212 | crack (>= 0.3.2) 213 | hashdiff (>= 0.4.0, < 2.0.0) 214 | 215 | PLATFORMS 216 | ruby 217 | 218 | DEPENDENCIES 219 | addressable 220 | bundler 221 | connection_pool 222 | dotenv 223 | down! 224 | feedjira! 225 | feedkit! 226 | fog-aws 227 | foreman 228 | http! 229 | image_processing 230 | librato-metrics (~> 1.6.2) 231 | librato-rack 232 | mime-types 233 | minitest 234 | nokogiri 235 | rake 236 | redis 237 | resolv 238 | ruby-vips 239 | sax-machine! 240 | sidekiq 241 | standard 242 | unf_ext 243 | webmock 244 | 245 | BUNDLED WITH 246 | 2.3.21 247 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License 2 | =============== 3 | 4 | Copyright 2013 [Ben Ubois](mailto:ben@feedbin.com) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 11 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | crawler_images_parallel: bundle exec sidekiq --concurrency 4 --queue image_parallel_critical,2 --queue image_parallel -q image_parallel_$HOSTNAME --require ./lib/image.rb 2 | crawler_images_serial: bundle exec sidekiq --concurrency 1 --queue image_serial_critical_$HOSTNAME,2 --queue image_serial_$HOSTNAME --require ./lib/image.rb 3 | crawler_feeds_parallel: bundle exec sidekiq --concurrency 40 --queue feed_downloader_critical,2 --queue feed_downloader --require ./lib/refresher.rb 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ‼️ Crawler functionality has been merged into the main Feedbin app. 3 | ``` 4 | 5 | Crawler 6 | ========= 7 | 8 | Crawler is a service meant to be run in combination with [Feedbin](https://github.com/feedbin/feedbin). Crawler refreshes feeds and processes image thumbnails. It is a separate service so it can be as lightweight and parallel as possible. 
9 | 10 | ### Requirements 11 | 12 | * libvips 8.6+ 13 | * Ruby 3.1 14 | * An AWS S3 bucket 15 | * Redis shared with the main Feedbin instance 16 | 17 | ### Environment variables 18 | 19 | * `AWS_ACCESS_KEY_ID` - Your AWS access key ID 20 | * `AWS_SECRET_ACCESS_KEY` - Your AWS secret access key 21 | * `AWS_S3_BUCKET_IMAGES` (or `AWS_S3_BUCKET` if not set) - The bucket to upload the thumbnails to 22 | * `REDIS_URL` - The URL of the Redis instance used by the main Feedbin instance 23 | * `FACEBOOK_ACCESS_TOKEN` - Needed to access Instagram images 24 | 25 | Optional variables; you might need these for non-AWS providers: 26 | 27 | * `AWS_S3_REGION` - The AWS region of your bucket 28 | * `AWS_S3_HOST` - The domain of your endpoint 29 | * `AWS_S3_ENDPOINT` - Same as `AWS_S3_HOST`, but including the scheme and port 30 | * `AWS_S3_PATH_STYLE` - Needs to be set to `true` for Minio 31 | 32 | You can also use Minio or another S3-compatible service by editing the parameters in [lib/crawler/image/initializers/storage.rb](lib/crawler/image/initializers/storage.rb). The Minio cookbook has [an example](https://github.com/minio/cookbook/blob/master/docs/fog-aws-for-ruby-with-minio.md) with the necessary parameters. 33 | 34 | ### Setup 35 | Clone the repo and install dependencies: 36 | ``` 37 | git clone https://github.com/feedbin/crawler.git 38 | cd crawler 39 | bundle 40 | ``` 41 | 42 | Start the processes with `bundle exec foreman start`. 43 | 44 | You may need to adjust the `ENTRY_IMAGE_HOST` environment variable of the main Feedbin instance if you want to use a reverse proxy in front of S3 or an alternative file server. The variable replaces the hostname clients use to fetch the images, but the path can't be changed. 45 | 46 | Crawler needs access to the same Redis instance as the main Feedbin instance (`REDIS_URL` environment variable). 47 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "rake/testtask" 2 | 3 | path = __dir__ 4 | 5 | Rake::TestTask.new(:test) do |test| 6 | test.libs = [] 7 | test.ruby_opts = ["-W1"] 8 | test.pattern = "test/**/*_test.rb" 9 | end 10 | 11 | task default: :test 12 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require_relative "../lib/image" 5 | require_relative "../lib/refresher" 6 | 7 | # You can add fixtures and/or initialization code here to make experimenting 8 | # with your gem easier. You can also use a different console, if you like. 9 | 10 | # (If you use this, don't forget to add pry to your Gemfile!) 
11 | # require "pry" 12 | # Pry.start 13 | 14 | require "irb" 15 | IRB.start(__FILE__) 16 | -------------------------------------------------------------------------------- /bin/pigo_darwin_arm64: -------------------------------------------------------------------------------- 1 | pigo_darwin_x86_64 -------------------------------------------------------------------------------- /bin/pigo_darwin_x86_64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feedbin/crawler/9135bb3b506e07453db2002ceb8bb6491826990f/bin/pigo_darwin_x86_64 -------------------------------------------------------------------------------- /bin/pigo_linux_x86_64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feedbin/crawler/9135bb3b506e07453db2002ceb8bb6491826990f/bin/pigo_linux_x86_64 -------------------------------------------------------------------------------- /bin/rake: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | bundle exec rake -------------------------------------------------------------------------------- /lib/crawler/image/cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Cache 6 | def self.read(*args) 7 | new.read(*args) 8 | end 9 | 10 | def self.delete(*args) 11 | new.delete(*args) 12 | end 13 | 14 | def self.increment(key, **args) 15 | new.increment(key, **args) 16 | end 17 | 18 | def self.count(*args) 19 | new.count(*args) 20 | end 21 | 22 | def self.write(key, value, **args) 23 | new.write(key, value, **args) 24 | end 25 | 26 | def read(key) 27 | @read ||= begin 28 | value = Sidekiq.redis do |redis| 29 | redis.get key 30 | end 31 | JSON.load(value)&.transform_keys(&:to_sym) || {} 32 | end 33 | end 34 | 35 | def write(key, values, options: {}) 36 | values = values.compact 37 | unless values.empty? 
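# Skip the Redis write when nothing remains after compacting; the expiry below is still applied when :expires_in is given.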
38 | Sidekiq.redis do |redis| 39 | redis.set(key, JSON.dump(values)) 40 | end 41 | end 42 | write_key_expiry(key, options) 43 | end 44 | 45 | def delete(*keys) 46 | Sidekiq.redis { |redis| redis.unlink(*keys) } 47 | end 48 | 49 | def increment(key, options: {}) 50 | count = Sidekiq.redis { |redis| redis.incr(key) } 51 | write_key_expiry(key, options) 52 | count 53 | end 54 | 55 | def count(key) 56 | Sidekiq.redis { |redis| redis.get(key) }.to_i 57 | end 58 | 59 | def write_key_expiry(key, options) 60 | if options[:expires_in] 61 | Sidekiq.redis do |redis| 62 | redis.expire key, options[:expires_in] 63 | end 64 | end 65 | end 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /lib/crawler/image/cascade/facefinder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feedbin/crawler/9135bb3b506e07453db2002ceb8bb6491826990f/lib/crawler/image/cascade/facefinder -------------------------------------------------------------------------------- /lib/crawler/image/download.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Download 6 | attr_reader :path 7 | 8 | def initialize(url, minimum_size: 20_000) 9 | @url = url 10 | @valid = false 11 | @minimum_size = minimum_size 12 | end 13 | 14 | def self.download!(url, **args) 15 | klass = find_download_provider(url) || Download::Default 16 | instance = klass.new(url, **args) 17 | instance.download 18 | instance 19 | end 20 | 21 | def image_url 22 | @url 23 | end 24 | 25 | def download_file(url) 26 | @file = Down.download(url, max_size: 10 * 1024 * 1024, timeout_options: {read_timeout: 20, write_timeout: 5, connect_timeout: 5}) 27 | @path = @file.path 28 | end 29 | 30 | def persist! 31 | unless @path == persisted_path 32 | FileUtils.mv @path, persisted_path 33 | @path = persisted_path 34 | end 35 | persisted_path 36 | end 37 | 38 | def delete! 39 | @file.respond_to?(:close) && @file.close 40 | @file.respond_to?(:unlink) && @file.unlink 41 | @path && File.unlink(@path) 42 | rescue Errno::ENOENT 43 | end 44 | 45 | def persisted_path 46 | @persisted_path ||= File.join(Dir.tmpdir, ["image_original_", SecureRandom.hex].join) 47 | end 48 | 49 | def valid? 50 | valid = @file && @file.content_type&.start_with?("image") 51 | valid &&= @file.size >= @minimum_size unless @minimum_size.nil? 
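# Valid only when the response has an image content type and, if a minimum size is configured, the file is at least that many bytes.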
52 | valid 53 | end 54 | 55 | def provider_identifier 56 | self.class.recognize_url?(@url) 57 | end 58 | 59 | def self.recognize_url?(src_url) 60 | if supported_urls.find { src_url.to_s =~ _1 } 61 | Regexp.last_match[1] 62 | else 63 | false 64 | end 65 | end 66 | 67 | def self.find_download_provider(url) 68 | download_providers.detect { |klass| klass.recognize_url?(url) } 69 | end 70 | 71 | def self.download_providers 72 | [ 73 | Download::Youtube, 74 | Download::Instagram, 75 | Download::Vimeo 76 | ] 77 | end 78 | 79 | def self.supported_urls 80 | [] 81 | end 82 | end 83 | end 84 | end -------------------------------------------------------------------------------- /lib/crawler/image/download/default.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Download::Default < Download 6 | def self.recognize_url?(*args) 7 | true 8 | end 9 | 10 | def download 11 | download_file(image_url) 12 | rescue Down::Error => exception 13 | end 14 | end 15 | end 16 | end -------------------------------------------------------------------------------- /lib/crawler/image/download/instagram.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Download::Instagram < Download 6 | def self.supported_urls 7 | [ 8 | %r{.*?//www\.instagram\.com/p/(.*?)(/|#|\?|$)}, 9 | %r{.*?//instagram\.com/p/(.*?)(/|#|\?|$)} 10 | ] 11 | end 12 | 13 | def download 14 | download_file(image_url) 15 | rescue Down::Error => exception 16 | end 17 | 18 | def image_url 19 | data.dig("thumbnail_url") 20 | end 21 | 22 | private 23 | 24 | OEMBED_URL = "https://graph.facebook.com/v9.0/instagram_oembed" 25 | 26 | def data 27 | @data ||= begin 28 | options = { 29 | params: { 30 | access_token: ENV["FACEBOOK_ACCESS_TOKEN"], 31 | url: "https://instagram.com/p/#{provider_identifier}", 32 | fields: "thumbnail_url" 33 | } 34 | } 35 | JSON.load(HTTP.get(OEMBED_URL, options).to_s) 36 | end 37 | end 38 | end 39 | end 40 | end -------------------------------------------------------------------------------- /lib/crawler/image/download/vimeo.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Download::Vimeo < Download 6 | def self.supported_urls 7 | [ 8 | %r{.*?//vimeo\.com/video/(.*?)(#|\?|$)}, 9 | %r{.*?//vimeo\.com/(.*?)(#|\?|$)}, 10 | %r{.*?//player\.vimeo\.com/video/(.*?)(#|\?|$)} 11 | ] 12 | end 13 | 14 | def download 15 | download_file(image_url) 16 | rescue Down::Error => exception 17 | end 18 | 19 | def image_url 20 | data.dig("thumbnail_url").gsub(/_\d+.jpg/, ".jpg") 21 | end 22 | 23 | private 24 | 25 | OEMBED_URL = "https://vimeo.com/api/oembed.json" 26 | 27 | def data 28 | @data ||= begin 29 | options = { 30 | params: { 31 | url: "https://vimeo.com/#{provider_identifier}" 32 | } 33 | } 34 | JSON.load(HTTP.get(OEMBED_URL, options).to_s) 35 | end 36 | end 37 | end 38 | end 39 | end -------------------------------------------------------------------------------- /lib/crawler/image/download/youtube.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Download::Youtube < Download 6 | attr_reader :image_url 7 | 8 | def self.supported_urls 9 | [ 10 | 
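# Each pattern captures the video id as its first group; recognize_url? returns that capture as the provider identifier.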
%r{.*?//www\.youtube-nocookie\.com/embed/(.*?)(\?|$)}, 11 | %r{.*?//www\.youtube\.com/embed/(.*?)(\?|$)}, 12 | %r{.*?//www\.youtube\.com/user/.*?#\w/\w/\w/\w/(.+)\b}, 13 | %r{.*?//www\.youtube\.com/v/(.*?)(#|\?|$)}, 14 | %r{.*?//www\.youtube\.com/watch\?v=(.*?)(&|#|$)}, 15 | %r{.*?//youtube-nocookie\.com/embed/(.*?)(\?|$)}, 16 | %r{.*?//youtube\.com/embed/(.*?)(\?|$)}, 17 | %r{.*?//youtu\.be/(.+)} 18 | ] 19 | end 20 | 21 | def download 22 | ["maxresdefault", "hqdefault"].each do |option| 23 | @image_url = "https://i.ytimg.com/vi/#{provider_identifier}/#{option}.jpg" 24 | download_file(@image_url) 25 | break 26 | rescue Down::Error => exception 27 | end 28 | end 29 | end 30 | end 31 | end -------------------------------------------------------------------------------- /lib/crawler/image/download_cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class DownloadCache 6 | include Helpers 7 | 8 | attr_reader :storage_url 9 | 10 | def initialize(url, public_id:, preset_name:) 11 | @url = url 12 | @public_id = public_id 13 | @preset_name = preset_name 14 | @storage_url = nil 15 | end 16 | 17 | def self.copy(url, **args) 18 | instance = new(url, **args) 19 | instance.copy 20 | instance 21 | end 22 | 23 | def copy 24 | @storage_url = copy_image unless storage_url.nil? || storage_url == false 25 | end 26 | 27 | def copied? 28 | !!@storage_url 29 | end 30 | 31 | def storage_url 32 | @storage_url ||= cache[:storage_url] 33 | end 34 | 35 | def image_url 36 | @image_url ||= cache[:image_url] 37 | end 38 | 39 | def download? 40 | !previously_attempted? && storage_url != false 41 | end 42 | 43 | def previously_attempted? 44 | !cache.empty? 45 | end 46 | 47 | def save(storage_url:, image_url:) 48 | @cache = {storage_url: storage_url, image_url: image_url} 49 | Cache.write(cache_key, @cache, options: {expires_in: 7 * 24 * 60 * 60}) 50 | end 51 | 52 | def cache 53 | @cache ||= Cache.read(cache_key) 54 | end 55 | 56 | def cache_key 57 | "image_download_#{@preset_name}_#{Digest::SHA1.hexdigest(@url)}" 58 | end 59 | 60 | def copy_image 61 | url = URI.parse(storage_url) 62 | source_object_name = url.path[1..-1] 63 | Fog::Storage.new(STORAGE_OPTIONS).copy_object(AWS_S3_BUCKET_IMAGES, source_object_name, AWS_S3_BUCKET_IMAGES, image_name, storage_options) 64 | final_url = url.path = "/#{image_name}" 65 | url.to_s 66 | rescue Excon::Error::NotFound 67 | false 68 | end 69 | end 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /lib/crawler/image/helpers.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | module Helpers 6 | AWS_S3_BUCKET_IMAGES = ENV["AWS_S3_BUCKET_IMAGES"] || ENV["AWS_S3_BUCKET"] 7 | 8 | def preset 9 | OpenStruct.new(IMAGE_PRESETS[@preset_name.to_sym]) 10 | end 11 | 12 | def send_to_feedbin(original_url:, storage_url:) 13 | Sidekiq::Client.push( 14 | "args" => [@public_id, { 15 | "original_url" => original_url, 16 | "processed_url" => storage_url, 17 | "width" => preset.width, 18 | "height" => preset.height 19 | }], 20 | "class" => preset.job_class, 21 | "queue" => "default" 22 | ) 23 | end 24 | 25 | def image_name 26 | File.join(@public_id[0..6], "#{@public_id}.jpg") 27 | end 28 | 29 | def storage_options 30 | { 31 | "Cache-Control" => "max-age=315360000, public", 32 | "Expires" => "Sun, 29 Jun 2036 17:48:34 GMT", 33 | 
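# The storage class falls back to REDUCED_REDUNDANCY unless AWS_S3_STORAGE_CLASS is set.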
"x-amz-storage-class" => ENV["AWS_S3_STORAGE_CLASS"] || "REDUCED_REDUNDANCY", 34 | "x-amz-acl" => "public-read" 35 | } 36 | end 37 | end 38 | end 39 | end -------------------------------------------------------------------------------- /lib/crawler/image/image_processor.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class ImageProcessor 6 | attr_reader :path 7 | 8 | def initialize(file, target_width:, target_height:) 9 | @file = file 10 | @target_width = target_width 11 | @target_height = target_height 12 | end 13 | 14 | def valid? 15 | source.avg && width >= @target_width && height >= @target_height 16 | rescue ::Vips::Error 17 | false 18 | end 19 | 20 | def height 21 | source.height 22 | end 23 | 24 | def width 25 | source.width 26 | end 27 | 28 | def color 29 | hex = nil 30 | file = ImageProcessing::Vips 31 | .source(source) 32 | .resize_to_fill(1, 1, sharpen: false) 33 | .custom { |image| 34 | image.tap do |data| 35 | hex = data.getpoint(0, 0).map { |value| "%02x" % value }.join 36 | end 37 | }.call 38 | file.unlink 39 | hex 40 | end 41 | 42 | def source 43 | @source ||= Vips::Image.new_from_file(@file) 44 | end 45 | 46 | def pipeline(resized_width, resized_height) 47 | ImageProcessing::Vips 48 | .source(source) 49 | .resize_to_fill(resized_width, resized_height) 50 | .convert("jpg") 51 | .saver(strip: true, quality: 90) 52 | end 53 | 54 | def fill_crop 55 | pipeline(@target_width, @target_height).call(destination: persisted_path) 56 | persisted_path 57 | end 58 | 59 | def smart_crop 60 | return fill_crop if resize_too_small? || resize_just_right? 61 | 62 | image = pipeline(resized.width, resized.height) 63 | 64 | if resized.width > @target_width 65 | axis = "x" 66 | contraint = @target_width 67 | max = resized.width - @target_width 68 | else 69 | axis = "y" 70 | contraint = @target_height 71 | max = resized.height - @target_height 72 | end 73 | 74 | if center = average_face_position(axis, image.call) 75 | point = {"x" => 0, "y" => 0} 76 | point[axis] = (center.to_f - contraint.to_f / 2.0).floor 77 | 78 | if point[axis] < 0 79 | point[axis] = 0 80 | elsif point[axis] > max 81 | point[axis] = max 82 | end 83 | 84 | image = image.crop(point["x"], point["y"], @target_width, @target_height) 85 | else 86 | image = image.resize_to_fill(@target_width, @target_height, crop: :attention) 87 | end 88 | 89 | image.call(destination: persisted_path) 90 | persisted_path 91 | end 92 | 93 | def resized 94 | @resized ||= begin 95 | resized_width = @target_width.to_f 96 | 97 | width_proportion = width.to_f / height.to_f 98 | height_proportion = height.to_f / width.to_f 99 | 100 | resized_height = resized_width * height_proportion 101 | 102 | if resized_height < @target_height 103 | resized_height = @target_height.to_f 104 | resized_width = resized_height * width_proportion 105 | end 106 | OpenStruct.new({width: resized_width.to_i, height: resized_height.to_i}) 107 | end 108 | end 109 | 110 | def average_face_position(axis, file) 111 | params = { 112 | pigo: Shellwords.escape(PIGO), 113 | image: Shellwords.escape(file.path), 114 | cascade: Shellwords.escape(CASCADE) 115 | } 116 | command = "%s -in %s -out empty -cf %s -scale 1.2 -json -" 117 | out, _, status = Open3.capture3(command % params) 118 | begin 119 | File.unlink(file) 120 | rescue 121 | Errno::ENOENT 122 | end 123 | 124 | faces = if status.success? 125 | JSON.load(out) 126 | end 127 | 128 | return nil if faces.nil? 
129 | 130 | result = faces.flat_map { |face| face.dig("face") }.map do |face| 131 | face[axis] + face["size"] / 2 132 | end 133 | 134 | (result.sum(0.0) / result.size).to_i 135 | end 136 | 137 | def persisted_path 138 | @persisted_path ||= File.join(Dir.tmpdir, ["image_processed_", SecureRandom.hex, ".jpg"].join) 139 | end 140 | 141 | def resize_too_small? 142 | resized.width < @target_width || resized.height < @target_height 143 | end 144 | 145 | def resize_just_right? 146 | resized.width == @target_width && resized.height == @target_height 147 | end 148 | end 149 | end 150 | end 151 | -------------------------------------------------------------------------------- /lib/crawler/image/initializers/constants.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Crawler 3 | module Image 4 | pigo_name = "pigo_#{Etc.uname[:sysname].downcase}_#{Etc.uname[:machine]}" 5 | CASCADE = File.expand_path("../cascade/facefinder", __dir__) 6 | PIGO = File.expand_path("../../../../bin/#{pigo_name}", __dir__) 7 | raise "Architecture not supported. Add #{pigo_name} to ./bin from https://github.com/esimov/pigo" unless File.executable?(PIGO) 8 | 9 | IMAGE_PRESETS = { 10 | primary: { 11 | width: 542, 12 | height: 304, 13 | minimum_size: 20_000, 14 | crop: :smart_crop, 15 | job_class: "EntryImage" 16 | }, 17 | twitter: { 18 | width: 542, 19 | height: 304, 20 | minimum_size: 10_000, 21 | crop: :smart_crop, 22 | job_class: "TwitterLinkImage" 23 | }, 24 | youtube: { 25 | width: 542, 26 | height: 304, 27 | minimum_size: nil, 28 | crop: :fill_crop, 29 | job_class: "EntryImage" 30 | }, 31 | podcast: { 32 | width: 200, 33 | height: 200, 34 | minimum_size: nil, 35 | crop: :fill_crop, 36 | job_class: "ItunesImage" 37 | }, 38 | podcast_feed: { 39 | width: 200, 40 | height: 200, 41 | minimum_size: nil, 42 | crop: :fill_crop, 43 | job_class: "ItunesFeedImage" 44 | } 45 | } 46 | end 47 | end -------------------------------------------------------------------------------- /lib/crawler/image/initializers/down.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | Down.backend :http 4 | -------------------------------------------------------------------------------- /lib/crawler/image/initializers/librato.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | Librato.tracker.start! 
4 | -------------------------------------------------------------------------------- /lib/crawler/image/initializers/sidekiq.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | Sidekiq.configure_server do |config| 4 | config.server_middleware do |chain| 5 | chain.add WorkerStat 6 | end 7 | config.redis = {id: "image-server-#{::Process.pid}"} 8 | end 9 | 10 | Sidekiq.configure_client do |config| 11 | config.redis = {id: "image-client-#{::Process.pid}"} 12 | end 13 | 14 | Sidekiq.strict_args!(false) 15 | -------------------------------------------------------------------------------- /lib/crawler/image/initializers/storage.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | STORAGE_OPTIONS = { 6 | provider: "AWS", 7 | aws_access_key_id: ENV["AWS_ACCESS_KEY_ID"], 8 | aws_secret_access_key: ENV["AWS_SECRET_ACCESS_KEY"] 9 | } 10 | STORAGE_OPTIONS[:region] = ENV["AWS_S3_REGION"] if ENV["AWS_S3_REGION"] 11 | STORAGE_OPTIONS[:host] = ENV["AWS_S3_HOST"] if ENV["AWS_S3_HOST"] 12 | STORAGE_OPTIONS[:endpoint] = ENV["AWS_S3_ENDPOINT"] if ENV["AWS_S3_ENDPOINT"] 13 | STORAGE_OPTIONS[:path_style] = ENV["AWS_S3_PATH_STYLE"] if ENV["AWS_S3_PATH_STYLE"] 14 | end 15 | end -------------------------------------------------------------------------------- /lib/crawler/image/initializers/worker_stat.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | class WorkerStat 4 | def call(worker, item, queue) 5 | title = "worker.#{worker.class}" 6 | Librato.increment "#{title}.count" 7 | Librato.timing title, percentile: [95] do 8 | yield 9 | end 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /lib/crawler/image/jobs/find_image.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class FindImage 6 | include Sidekiq::Worker 7 | include Helpers 8 | sidekiq_options queue: "image_parallel", retry: false 9 | 10 | def perform(public_id, preset_name, candidate_urls, entry_url = nil) 11 | @public_id = public_id 12 | @preset_name = preset_name 13 | @entry_url = entry_url 14 | @candidate_urls = combine_urls(candidate_urls) 15 | timer = Timer.new(45) 16 | count = 0 17 | 18 | while original_url = @candidate_urls.shift 19 | count += 1 20 | 21 | if count > 10 22 | Sidekiq.logger.info "Exceeded count limit: public_id=#{@public_id} count=#{count}" 23 | break 24 | end 25 | 26 | if timer.expired? 27 | Sidekiq.logger.info "Exceeded total time limit: public_id=#{@public_id} elapsed_time=#{timer.elapsed}" 28 | break 29 | end 30 | 31 | Sidekiq.logger.info "Candidate: public_id=#{@public_id} original_url=#{original_url} count=#{count}" 32 | 33 | download_cache = DownloadCache.copy(original_url, public_id: @public_id, preset_name: @preset_name) 34 | if download_cache.copied? 35 | send_to_feedbin(original_url: download_cache.image_url, storage_url: download_cache.storage_url) 36 | Sidekiq.logger.info "Copied image: public_id=#{@public_id} image_url=#{download_cache.image_url} storage_url=#{download_cache.storage_url}" 37 | break 38 | elsif download_cache.download? 
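# No cached result yet, so attempt the download; a valid image ends the candidate loop.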
39 | break if download_image(original_url, download_cache) 40 | else 41 | Sidekiq.logger.info "Skipping download: public_id=#{@public_id} original_url=#{original_url}" 42 | end 43 | 44 | end 45 | end 46 | 47 | def download_image(original_url, download_cache) 48 | found = false 49 | download = Download.download!(original_url, minimum_size: preset.minimum_size) 50 | if download.valid? 51 | found = true 52 | ProcessImage.perform_async(@public_id, @preset_name, download.persist!, original_url, download.image_url, @candidate_urls) 53 | Sidekiq.logger.info "Download valid: public_id=#{@public_id} image_url=#{download.image_url}" 54 | else 55 | download.delete! 56 | download_cache.save(storage_url: false, image_url: false) 57 | Sidekiq.logger.info "Download invalid: public_id=#{@public_id} original_url=#{original_url}" 58 | end 59 | found 60 | rescue => exception 61 | download.delete! 62 | Sidekiq.logger.info "Download failed: exception=#{exception.inspect} original_url=#{original_url}" 63 | false 64 | end 65 | 66 | def combine_urls(candidate_urls) 67 | return candidate_urls unless @entry_url 68 | 69 | if Download.find_download_provider(@entry_url) 70 | page_urls = [@entry_url] 71 | Sidekiq.logger.info "Recognized URL: public_id=#{@public_id} entry_url=#{@entry_url}" 72 | else 73 | page_urls = MetaImages.find_urls(@entry_url) 74 | Sidekiq.logger.info "MetaImages: public_id=#{@public_id} count=#{page_urls&.length || 0} entry_url=#{@entry_url}" 75 | end 76 | page_urls ||= [] 77 | page_urls.concat(candidate_urls) 78 | end 79 | end 80 | 81 | class FindImageCritical 82 | include Sidekiq::Worker 83 | sidekiq_options queue: "image_parallel_critical", retry: false 84 | def perform(*args) 85 | FindImage.new.perform(*args) 86 | end 87 | end 88 | end 89 | end -------------------------------------------------------------------------------- /lib/crawler/image/jobs/process_image.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class ProcessImage 6 | include Sidekiq::Worker 7 | include Helpers 8 | sidekiq_options queue: "image_serial_#{Socket.gethostname}", retry: false 9 | 10 | def perform(public_id, preset_name, original_path, original_url, image_url, candidate_urls) 11 | @preset_name = preset_name 12 | Sidekiq.logger.info "ProcessImage: public_id=#{public_id} original_url=#{original_url}" 13 | image = ImageProcessor.new(original_path, target_width: preset.width, target_height: preset.height) 14 | if image.valid? 15 | processed_path = image.send(preset.crop) 16 | UploadImage.perform_async(public_id, @preset_name, processed_path, original_url, image_url) 17 | else 18 | FindImageCritical.perform_async(public_id, @preset_name, candidate_urls) unless candidate_urls.empty? 
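# An image that fails validation falls back to the remaining candidate URLs via the critical queue.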
19 | end 20 | begin 21 | File.unlink(original_path) 22 | rescue Errno::ENOENT 23 | end 24 | end 25 | end 26 | 27 | class ProcessImageCritical 28 | include Sidekiq::Worker 29 | sidekiq_options queue: "image_serial_critical_#{Socket.gethostname}", retry: false 30 | def perform(*args) 31 | ProcessImage.new.perform(*args) 32 | end 33 | end 34 | end 35 | end -------------------------------------------------------------------------------- /lib/crawler/image/jobs/upload_image.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class UploadImage 6 | include Sidekiq::Worker 7 | include Helpers 8 | sidekiq_options queue: "image_parallel_#{Socket.gethostname}", retry: false 9 | 10 | def perform(public_id, preset_name, image_path, original_url, image_url) 11 | @public_id = public_id 12 | @preset_name = preset_name 13 | @original_url = original_url 14 | @image_path = image_path 15 | 16 | storage_url = upload 17 | send_to_feedbin(original_url: image_url, storage_url: storage_url) 18 | begin 19 | File.unlink(image_path) 20 | rescue Errno::ENOENT 21 | end 22 | 23 | DownloadCache.new(@original_url, public_id: @public_id, preset_name: @preset_name).save(storage_url: storage_url, image_url: image_url) 24 | Sidekiq.logger.info "UploadImage: public_id=#{@public_id} original_url=#{@original_url} storage_url=#{storage_url}" 25 | end 26 | 27 | def upload 28 | File.open(@image_path) do |file| 29 | response = Fog::Storage.new(STORAGE_OPTIONS).put_object(AWS_S3_BUCKET_IMAGES, image_name, file, storage_options) 30 | URI::HTTPS.build( 31 | host: response.data[:host], 32 | path: response.data[:path] 33 | ).to_s 34 | end 35 | end 36 | end 37 | end 38 | end -------------------------------------------------------------------------------- /lib/crawler/image/meta_images.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class MetaImages 6 | def initialize(url) 7 | @url = url 8 | end 9 | 10 | def self.find_urls(url) 11 | new(url).find_urls 12 | rescue Addressable::URI::InvalidURIError 13 | [] 14 | end 15 | 16 | def find_urls 17 | if cache.urls 18 | cache.urls 19 | elsif needs_download? 20 | download 21 | end 22 | end 23 | 24 | def download 25 | urls = [] 26 | file = Down.download(parsed_url, max_size: 5 * 1024 * 1024) 27 | urls = parse(file) 28 | rescue Down::Error => exception 29 | Sidekiq.logger.info "PageImages: exception=#{exception.inspect} url=#{@url}" 30 | urls 31 | ensure 32 | cache.save({checked: true, urls: urls}) 33 | cache.has_meta!(!urls.empty?) 34 | end 35 | 36 | def parse(file) 37 | Nokogiri.HTML5(file.read).search("meta[property='twitter:image'], meta[property='og:image']").map do |element| 38 | url = element["content"]&.strip 39 | next if url.nil? 40 | next if url == "" 41 | Addressable::URI.join(parsed_url, url) 42 | end.compact 43 | end 44 | 45 | def needs_download? 46 | !cache.checked? && cache.has_meta? 47 | end 48 | 49 | def cache 50 | @cache ||= MetaImagesCache.new(parsed_url) 51 | end 52 | 53 | def parsed_url 54 | @parsed_url ||= begin 55 | parsed = Addressable::URI.parse(@url) 56 | raise Addressable::URI::InvalidURIError if parsed.host.nil? 
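# URLs without a host are rejected here; find_urls rescues the error and returns an empty list.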
57 | parsed 58 | end 59 | end 60 | end 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /lib/crawler/image/meta_images_cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class MetaImagesCache 6 | def initialize(url) 7 | @url = url 8 | end 9 | 10 | def urls 11 | url_cache[:urls] 12 | end 13 | 14 | def checked? 15 | !!url_cache[:checked] 16 | end 17 | 18 | def has_meta!(result) 19 | @host_cache = {has_meta: result} 20 | Cache.write(host_cache_key, @host_cache, options: {expires_in: 24 * 60 * 60}) 21 | end 22 | 23 | def has_meta? 24 | host_cache[:has_meta].nil? ? true : host_cache[:has_meta] 25 | end 26 | 27 | def save(data) 28 | @url_cache = data 29 | Cache.write(url_cache_key, data, options: {expires_in: 24 * 60 * 60}) 30 | end 31 | 32 | def url_cache 33 | @url_cache ||= Cache.read(url_cache_key) 34 | end 35 | 36 | def host_cache 37 | @host_cache ||= Cache.read(host_cache_key) 38 | end 39 | 40 | def host_cache_key 41 | "image_host_#{Digest::SHA1.hexdigest(@url.host)}" 42 | end 43 | 44 | def url_cache_key 45 | "image_url_#{Digest::SHA1.hexdigest(@url)}" 46 | end 47 | end 48 | end 49 | end -------------------------------------------------------------------------------- /lib/crawler/image/timer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Timer 6 | def initialize(timeout = 0) 7 | start 8 | @deadline = now + timeout.to_f 9 | end 10 | 11 | def expired? 12 | now > @deadline 13 | end 14 | 15 | def now 16 | ::Process.clock_gettime(::Process::CLOCK_MONOTONIC) 17 | end 18 | 19 | def elapsed 20 | (now - start).ceil(2) 21 | end 22 | 23 | def start 24 | @start ||= now 25 | end 26 | end 27 | end 28 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class Cache 6 | def self.read(*args) 7 | new.read(*args) 8 | end 9 | 10 | def self.delete(*args) 11 | new.delete(*args) 12 | end 13 | 14 | def self.increment(key, **args) 15 | new.increment(key, **args) 16 | end 17 | 18 | def self.count(*args) 19 | new.count(*args) 20 | end 21 | 22 | def self.write(key, value, **args) 23 | new.write(key, value, **args) 24 | end 25 | 26 | def read(key) 27 | @read ||= begin 28 | hash = Sidekiq.redis do |redis| 29 | redis.hgetall key 30 | end 31 | hash.transform_keys(&:to_sym) 32 | end 33 | end 34 | 35 | def write(key, values, options: {}) 36 | values = values.compact 37 | unless values.empty? 
38 | Sidekiq.redis do |redis| 39 | redis.mapped_hmset(key, values) 40 | end 41 | end 42 | write_key_expiry(key, options) 43 | end 44 | 45 | def delete(*keys) 46 | Sidekiq.redis {|redis| redis.unlink(*keys) } 47 | end 48 | 49 | def increment(key, options: {}) 50 | count = Sidekiq.redis {|redis| redis.incr(key) } 51 | write_key_expiry(key, options) 52 | count 53 | end 54 | 55 | def count(key) 56 | Sidekiq.redis {|redis| redis.get(key) }.to_i 57 | end 58 | 59 | def write_key_expiry(key, options) 60 | if options[:expires_in] 61 | Sidekiq.redis do |redis| 62 | redis.expire key, options[:expires_in] 63 | end 64 | end 65 | end 66 | end 67 | end 68 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/feed.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class Feed 6 | extend Forwardable 7 | 8 | def_delegators :http_cache, :etag, :last_modified, :checksum, :save 9 | def_delegators :feed_status, :ok?, :log_download!, :downloaded_at 10 | 11 | def_delegator :feed_status, :count, :attempt_count 12 | def_delegator :feed_status, :error!, :download_error 13 | def_delegator :redirect_cache, :read, :redirect 14 | 15 | attr_accessor :redirects 16 | 17 | def initialize(feed_id) 18 | @feed_id = feed_id 19 | @redirects = [] 20 | end 21 | 22 | def next_attempt 23 | Time.at(feed_status.next_retry).utc.iso8601 24 | end 25 | 26 | def download_success 27 | feed_status.clear! unless last_error && last_error["class"] == "Feedkit::NotFeed" 28 | redirect_cache.save(redirects) 29 | end 30 | 31 | def last_error 32 | feed_status.attempt_log.first 33 | end 34 | 35 | def redirect_cache 36 | @redirect_cache ||= RedirectCache.new(@feed_id) 37 | end 38 | 39 | def feed_status 40 | @feed_status ||= FeedStatus.new(@feed_id) 41 | end 42 | 43 | def http_cache 44 | @http_cache ||= HTTPCache.new(@feed_id) 45 | end 46 | 47 | def inspect 48 | "#<#{self.class}:#{object_id.to_s(16)} @feed_id=#{@feed_id} next_attempt=#{next_attempt} redirect=#{redirect.inspect} http_cache=#{http_cache.cached} last_error=#{last_error.inspect}>" 49 | end 50 | end 51 | end 52 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/feed_status.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | 4 | module Crawler 5 | module Refresher 6 | class FeedStatus 7 | def initialize(feed_id) 8 | @feed_id = feed_id 9 | end 10 | 11 | def self.clear!(*args) 12 | new(*args).clear! 13 | end 14 | 15 | def clear! 16 | Cache.delete(cache_key, errors_cache_key, log_cache_key) 17 | end 18 | 19 | def error!(exception, formatted: false) 20 | @count = count + 1 21 | Cache.write(cache_key, { 22 | count: @count, 23 | failed_at: Time.now.to_i 24 | }) 25 | exception = formatted ? exception : error_json(exception) 26 | Sidekiq.redis do |redis| 27 | redis.pipelined do |pipeline| 28 | pipeline.lpush(errors_cache_key, exception) 29 | pipeline.ltrim(errors_cache_key, 0, 25) 30 | end 31 | end 32 | end 33 | 34 | def log_download! 35 | @downloaded_at = Time.now.to_i 36 | Cache.write(log_cache_key, { 37 | downloaded_at: @downloaded_at 38 | }) 39 | @downloaded_at 40 | end 41 | 42 | def downloaded_at 43 | @downloaded_at ||= log_cache[:downloaded_at] && log_cache[:downloaded_at].to_i 44 | end 45 | 46 | def ok? 
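# Ready for another download attempt once the backoff deadline (failed_at + backoff) has passed.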
47 | Time.now.to_i > next_retry 48 | end 49 | 50 | def next_retry 51 | failed_at + backoff 52 | end 53 | 54 | def backoff 55 | multiplier = [count, 8].max 56 | multiplier = [multiplier, 23].min 57 | multiplier ** 4 58 | end 59 | 60 | def count 61 | @count ||= cached[:count].to_i 62 | end 63 | 64 | def failed_at 65 | cached[:failed_at].to_i 66 | end 67 | 68 | def attempt_log 69 | @attempt_log ||= begin 70 | Sidekiq.redis do |redis| 71 | redis.lrange(errors_cache_key, 0, -1) 72 | end.map do |json| 73 | data = JSON.load(json) 74 | data["date"] = Time.at(data["date"]) 75 | data 76 | end 77 | end 78 | end 79 | 80 | def error_json(exception) 81 | status = exception.respond_to?(:response) ? exception.response.status.code : nil 82 | JSON.dump({date: Time.now.to_i, class: exception.class, message: exception.message, status: status}) 83 | end 84 | 85 | def log_cache 86 | @log_cache ||= Cache.read(log_cache_key) 87 | end 88 | 89 | def cached 90 | @cached ||= Cache.read(cache_key) 91 | end 92 | 93 | def cache_key 94 | "refresher_status_#{@feed_id}" 95 | end 96 | 97 | def errors_cache_key 98 | "refresher_errors_#{@feed_id}" 99 | end 100 | 101 | def log_cache_key 102 | "refresher_log_#{@feed_id}" 103 | end 104 | end 105 | end 106 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/http_cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class HTTPCache 6 | def initialize(feed_id) 7 | @feed_id = feed_id 8 | end 9 | 10 | def save(response) 11 | data = { 12 | etag: response.etag, 13 | last_modified: response.last_modified, 14 | checksum: response.checksum 15 | } 16 | Cache.write(cache_key, data, options: {expires_in: 8 * 60 * 60}) 17 | end 18 | 19 | def etag 20 | cached[:etag] 21 | end 22 | 23 | def last_modified 24 | cached[:last_modified] 25 | end 26 | 27 | def checksum 28 | cached[:checksum] 29 | end 30 | 31 | def cached 32 | @cached ||= begin 33 | Cache.read(cache_key) 34 | end 35 | end 36 | 37 | def cache_key 38 | "refresher_http_#{@feed_id}" 39 | end 40 | end 41 | end 42 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/initializers/redis.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | $redis = ConnectionPool.new(size: 3, timeout: 5) { 4 | if ENV["REDIS_ID_URL"] 5 | Redis.new(url: ENV["REDIS_ID_URL"]) 6 | else 7 | Redis.new 8 | end 9 | } 10 | -------------------------------------------------------------------------------- /lib/crawler/refresher/initializers/sidekiq.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | Sidekiq.configure_server do |config| 4 | config.redis = {id: "refresher-server-#{::Process.pid}"} 5 | end 6 | 7 | Sidekiq.configure_client do |config| 8 | config.redis = {id: "refresher-client-#{::Process.pid}"} 9 | end 10 | 11 | Sidekiq.strict_args!(false) -------------------------------------------------------------------------------- /lib/crawler/refresher/jobs/feed_downloader.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedDownloader 6 | include Sidekiq::Worker 7 | 8 | sidekiq_options queue: :feed_downloader, retry: false, backtrace: false 9 | 10 | def perform(feed_id, 
feed_url, subscribers, critical = false) 11 | @feed_id = feed_id 12 | @feed_url = feed_url 13 | @subscribers = subscribers 14 | @critical = critical 15 | @feed = Feed.new(feed_id) 16 | 17 | throttle = Throttle.new(@feed_url, @feed.downloaded_at) 18 | if @critical 19 | download 20 | elsif throttle.throttled? 21 | Sidekiq.logger.info "Throttled downloaded_at=#{Time.at(@feed.downloaded_at)} url=#{@feed_url}" 22 | elsif @feed.ok? 23 | download 24 | end 25 | end 26 | 27 | def download 28 | @feed.log_download! 29 | @response = begin 30 | request 31 | rescue Feedkit::ZlibError 32 | request(auto_inflate: false) 33 | end 34 | 35 | not_modified = @response.not_modified?(@feed.checksum) 36 | Sidekiq.logger.info "Downloaded modified=#{!not_modified} http_status=\"#{@response.status}\" url=#{@feed_url}" 37 | parse unless not_modified 38 | @feed.download_success 39 | rescue Feedkit::Error => exception 40 | @feed.download_error(exception) 41 | Sidekiq.logger.info "Feedkit::Error: attempts=#{@feed.attempt_count} exception=#{exception.inspect} id=#{@feed_id} url=#{@feed_url}" 42 | end 43 | 44 | def request(auto_inflate: true) 45 | parsed_url = Feedkit::BasicAuth.parse(@feed_url) 46 | url = @feed.redirect ? @feed.redirect : parsed_url.url 47 | Sidekiq.logger.info "Redirect: from=#{@feed_url} to=#{@feed.redirect} id=#{@feed_id}" if @feed.redirect 48 | Feedkit::Request.download(url, 49 | on_redirect: on_redirect, 50 | username: parsed_url.username, 51 | password: parsed_url.password, 52 | last_modified: @feed.last_modified, 53 | etag: @feed.etag, 54 | auto_inflate: auto_inflate, 55 | user_agent: "Feedbin feed-id:#{@feed_id} - #{@subscribers} subscribers" 56 | ) 57 | end 58 | 59 | def on_redirect 60 | proc do |from, to| 61 | @feed.redirects.push Redirect.new(@feed_id, status: from.status.code, from: from.uri.to_s, to: to.uri.to_s) 62 | end 63 | end 64 | 65 | def parse 66 | @response.persist! 67 | job_id = Sidekiq::Client.push( 68 | "args" => [@feed_id, @feed_url, @response.path, @response.encoding.to_s], 69 | "class" => @critical ? "FeedParserCritical" : "FeedParser", 70 | "queue" => @critical ? 
"feed_parser_critical_#{Socket.gethostname}" : "feed_parser_#{Socket.gethostname}", 71 | "retry" => false 72 | ) 73 | Sidekiq.logger.info "Parse enqueued job_id: #{job_id} path=#{@response.path}" 74 | @feed.save(@response) 75 | end 76 | end 77 | 78 | class FeedDownloaderCritical 79 | include Sidekiq::Worker 80 | sidekiq_options queue: :feed_downloader_critical, retry: false 81 | def perform(*args) 82 | FeedDownloader.new.perform(*args, true) 83 | end 84 | end 85 | end 86 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/jobs/feed_status_update.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedStatusUpdate 6 | include Sidekiq::Worker 7 | sidekiq_options queue: :feed_downloader_critical 8 | 9 | def perform(feed_id, exception = nil) 10 | if exception 11 | FeedStatus.new(feed_id).error!(exception, formatted: true) 12 | else 13 | FeedStatus.clear!(feed_id) 14 | end 15 | end 16 | 17 | end 18 | end 19 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/redirect_cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class RedirectCache 6 | 7 | # 4 redirect/hr 24hrs a day for 6 days 8 | PERSIST_AFTER = 4 * 24 * 6 9 | 10 | attr_reader :redirects 11 | 12 | def initialize(feed_id) 13 | @feed_id = feed_id 14 | end 15 | 16 | def save(redirects) 17 | @redirects = redirects 18 | Cache.write(stable_key, {to: @redirects.last.to}) if redirect_stable? 19 | end 20 | 21 | def redirect_stable? 22 | return false if @redirects.empty? 23 | return false unless @redirects.all?(&:permanent?) 24 | Cache.increment(counter_key, options: {expires_in: 72 * 60 * 60}) > PERSIST_AFTER 25 | end 26 | 27 | def read 28 | @read ||= Cache.read(stable_key)[:to] 29 | end 30 | 31 | def delete 32 | Cache.delete(stable_key) 33 | end 34 | 35 | def counter_key 36 | @counter_key ||= begin 37 | "refresher_redirect_tmp_" + Digest::SHA1.hexdigest(@redirects.map(&:cache_key).join) 38 | end 39 | end 40 | 41 | def stable_key 42 | @stable_key ||= begin 43 | "refresher_redirect_stable_#{@feed_id}" 44 | end 45 | end 46 | end 47 | 48 | class Redirect 49 | PERMANENT_REDIRECTS = [301, 308].to_set.freeze 50 | 51 | attr_reader :from, :to 52 | 53 | def initialize(feed_id, status:, from:, to:) 54 | @feed_id = feed_id 55 | @status = status 56 | @from = from 57 | @to = to 58 | end 59 | 60 | def permanent? 61 | PERMANENT_REDIRECTS.include?(@status) 62 | end 63 | 64 | def cache_key 65 | @cache_key ||= Digest::SHA1.hexdigest([@feed_id, @status, @from, @to].join) 66 | end 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /lib/crawler/refresher/throttle.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class Throttle 6 | 7 | TIMEOUT = 60 * 30 8 | 9 | def initialize(feed_url, last_download) 10 | @feed_url = feed_url 11 | @last_download = last_download 12 | end 13 | 14 | def self.throttled?(*args) 15 | new(*args).throttled? 16 | end 17 | 18 | def throttled? 19 | throttled_hosts.include?(host) && downloaded_recently? 20 | end 21 | 22 | def downloaded_recently? 23 | return false if @last_download.nil? 
24 | (Time.now.to_i - @last_download) < random_timeout 25 | end 26 | 27 | def random_timeout 28 | rand(TIMEOUT..(TIMEOUT * 2)) 29 | end 30 | 31 | def throttled_hosts 32 | ENV["THROTTLED_HOSTS"]&.split(",") || [] 33 | end 34 | 35 | def host 36 | Addressable::URI.heuristic_parse(@feed_url).host.split(".").last(2).join(".") 37 | rescue 38 | nil 39 | end 40 | end 41 | end 42 | end -------------------------------------------------------------------------------- /lib/image.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | $LOAD_PATH.unshift File.expand_path(File.dirname(File.dirname(__FILE__))) 4 | 5 | $stdout.sync = true 6 | 7 | require "bundler/setup" 8 | require "dotenv" 9 | Dotenv.load(".env", ".env.test") 10 | 11 | require "socket" 12 | require "etc" 13 | require "net/http" 14 | require "securerandom" 15 | require "time" 16 | require "uri" 17 | require "etc" 18 | require "digest" 19 | 20 | require "addressable" 21 | require "dotenv" 22 | require "down" 23 | require "fog/aws" 24 | require "http" 25 | require "image_processing/vips" 26 | require "json" 27 | require "librato-rack" 28 | require "mime-types" 29 | require "open3" 30 | require "redis" 31 | require "shellwords" 32 | require "sidekiq" 33 | 34 | require "lib/crawler/image/initializers/constants" 35 | require "lib/crawler/image/initializers/down" 36 | require "lib/crawler/image/initializers/librato" 37 | require "lib/crawler/image/initializers/worker_stat" 38 | require "lib/crawler/image/initializers/sidekiq" 39 | require "lib/crawler/image/initializers/storage" 40 | 41 | require "lib/crawler/image/helpers" 42 | require "lib/crawler/image/timer" 43 | require "lib/crawler/image/cache" 44 | require "lib/crawler/image/meta_images" 45 | require "lib/crawler/image/meta_images_cache" 46 | require "lib/crawler/image/download_cache" 47 | require "lib/crawler/image/download" 48 | require "lib/crawler/image/download/default" 49 | require "lib/crawler/image/download/instagram" 50 | require "lib/crawler/image/download/vimeo" 51 | require "lib/crawler/image/download/youtube" 52 | require "lib/crawler/image/image_processor" 53 | require "lib/crawler/image/jobs/find_image" 54 | require "lib/crawler/image/jobs/process_image" 55 | require "lib/crawler/image/jobs/upload_image" 56 | -------------------------------------------------------------------------------- /lib/refresher.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path(File.dirname(File.dirname(__FILE__))) 2 | 3 | $stdout.sync = true 4 | 5 | require "bundler/setup" 6 | require "dotenv" 7 | 8 | if ENV["ENV_PATH"] 9 | Dotenv.load ENV["ENV_PATH"] 10 | else 11 | Dotenv.load 12 | end 13 | 14 | require "digest/sha1" 15 | require "date" 16 | require "socket" 17 | require "time" 18 | require "forwardable" 19 | require "json" 20 | 21 | require "sidekiq" 22 | require "connection_pool" 23 | require "redis" 24 | require "feedkit" 25 | 26 | require "lib/crawler/refresher/initializers/sidekiq" 27 | require "lib/crawler/refresher/initializers/redis" 28 | 29 | require "lib/crawler/refresher/cache" 30 | require "lib/crawler/refresher/feed_status" 31 | require "lib/crawler/refresher/redirect_cache" 32 | require "lib/crawler/refresher/http_cache" 33 | require "lib/crawler/refresher/feed" 34 | require "lib/crawler/refresher/throttle" 35 | require "lib/crawler/refresher/jobs/feed_downloader" 36 | require "lib/crawler/refresher/jobs/feed_status_update" 37 | 
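# Usage sketch (added commentary, not part of the original file). Assuming
# FeedDownloader is a Sidekiq worker like FeedDownloaderCritical, a refresh is
# enqueued with the argument order used throughout the tests
# (feed_id, feed_url, subscribers); the values below are placeholders:
#
#   Crawler::Refresher::FeedDownloader.perform_async(1, "https://example.com/atom.xml", 10)
#   Crawler::Refresher::FeedDownloaderCritical.perform_async(1, "https://example.com/atom.xml", 10)
#
#   # Passing no exception argument clears a feed's stored error state:
#   Crawler::Refresher::FeedStatusUpdate.perform_async(1)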
-------------------------------------------------------------------------------- /test/cache_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class CacheTest < Minitest::Test 6 | def setup 7 | flush 8 | end 9 | 10 | def test_should_delete 11 | cache_key = "cache_key" 12 | Cache.increment(cache_key) 13 | assert_equal(1, Cache.count(cache_key)) 14 | Cache.delete(cache_key) 15 | assert_equal(0, Cache.count(cache_key)) 16 | end 17 | 18 | def test_should_increment 19 | assert_equal(1, Cache.increment("cache_key")) 20 | end 21 | 22 | def test_should_get_count 23 | cache_key = "cache_key" 24 | assert_equal(0, Cache.count(cache_key)) 25 | Cache.increment(cache_key) 26 | assert_equal(1, Cache.count(cache_key)) 27 | end 28 | 29 | def test_should_cache_values 30 | cache_key = "cache_key" 31 | Cache.write(cache_key, { 32 | etag: nil, 33 | last_modified: "last_modified", 34 | }) 35 | 36 | values = Cache.read(cache_key) 37 | 38 | assert_equal("last_modified", values[:last_modified]) 39 | assert_nil(values[:etag]) 40 | end 41 | 42 | def test_should_cache_values_with_exiration 43 | cache_key = "cache_key" 44 | 45 | Cache.write(cache_key, { 46 | key: "value", 47 | }, 48 | options: {expires_in: 1} 49 | ) 50 | 51 | result = Sidekiq.redis do |redis| 52 | redis.ttl(cache_key) 53 | end 54 | 55 | assert_equal(1, result) 56 | end 57 | end 58 | end 59 | end -------------------------------------------------------------------------------- /test/download/default.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | 5 | class Download::DefaultTest < Minitest::Test 6 | def test_should_download_valid_image 7 | url = "http://example.com/image.jpg" 8 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 9 | download = Download.download!(url, minimum_size: 8) 10 | assert_instance_of Download::Default, download 11 | end 12 | end 13 | end 14 | end -------------------------------------------------------------------------------- /test/download/instagram.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | 5 | class Download::InstagramTest < Minitest::Test 6 | def test_should_download_valid_image 7 | url = "http://example.com/image.jpg" 8 | stub_request(:get, /graph\.facebook\.com/).to_return(body: {thumbnail_url: url}.to_json) 9 | 10 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 11 | download = Download.download!("https://www.instagram.com/p/CMGfYFaJoF7/", minimum_size: 8) 12 | assert download.valid? 
13 | assert_instance_of Download::Instagram, download 14 | end 15 | end 16 | end 17 | end -------------------------------------------------------------------------------- /test/download/vimeo.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | class Download::VimeoTest < Minitest::Test 5 | def test_should_download_valid_image 6 | url = "http://example.com/image.jpg" 7 | stub_request(:get, /vimeo\.com\/api/).to_return(body: {thumbnail_url: url}.to_json) 8 | 9 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 10 | download = Download.download!("https://player.vimeo.com/video/CMGfYFaJoF7/", minimum_size: 8) 11 | assert download.valid? 12 | assert_instance_of Download::Vimeo, download 13 | end 14 | end 15 | end 16 | end -------------------------------------------------------------------------------- /test/download/youtube.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | class Download::YoutubeTest < Minitest::Test 5 | def test_should_download_valid_image 6 | id = SecureRandom.hex 7 | 8 | max_url = "https://i.ytimg.com/vi/#{id}/maxresdefault.jpg" 9 | hq_url = "https://i.ytimg.com/vi/#{id}/hqdefault.jpg" 10 | 11 | stub_request(:get, max_url).to_return(status: 404) 12 | stub_request(:get, hq_url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 13 | 14 | download = Download.download!("https://www.youtube.com/watch?v=#{id}", minimum_size: 8) 15 | assert download.valid? 16 | 17 | assert_instance_of Download::Youtube, download 18 | assert_requested :get, max_url 19 | assert_requested :get, hq_url 20 | end 21 | 22 | def test_should_stop_at_first_image 23 | id = SecureRandom.hex 24 | 25 | max_url = "https://i.ytimg.com/vi/#{id}/maxresdefault.jpg" 26 | hq_url = "https://i.ytimg.com/vi/#{id}/hqdefault.jpg" 27 | 28 | stub_request(:get, max_url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 29 | stub_request(:get, hq_url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 30 | 31 | download = Download.download!("https://www.youtube.com/watch?v=#{id}", minimum_size: 8) 32 | assert download.valid? 
33 | 34 | assert_instance_of Download::Youtube, download 35 | assert_requested :get, max_url 36 | refute_requested :get, hq_url 37 | end 38 | end 39 | end 40 | end -------------------------------------------------------------------------------- /test/download_cache_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class DownloadCacheTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_save_url 10 | image_url = "http://example.com/example/example.jpg" 11 | storage_url = "http://s3.com/example/example.jpg" 12 | public_id = SecureRandom.hex 13 | 14 | cache = DownloadCache.new(image_url, public_id: public_id, preset_name: "primary") 15 | cache.save(storage_url: storage_url, image_url: image_url) 16 | 17 | cache = DownloadCache.new(image_url, public_id: public_id, preset_name: "primary") 18 | assert_equal(storage_url, cache.storage_url) 19 | end 20 | 21 | def test_should_copy_existing_image 22 | image_url = "http://example.com/example/example.jpg" 23 | storage_url = "http://s3.com/example/example.jpg" 24 | public_id = SecureRandom.hex 25 | 26 | stub_request(:put, /.*\.s3\.amazonaws\.com/).to_return(status: 200, body: aws_copy_body) 27 | 28 | cache = DownloadCache.new(image_url, public_id: public_id, preset_name: "primary") 29 | refute cache.copied? 30 | 31 | cache.save(storage_url: storage_url, image_url: image_url) 32 | cache.copy 33 | 34 | assert cache.copied? 35 | assert cache.storage_url.include?(public_id) 36 | end 37 | 38 | def test_should_fail_to_copy_missing_image 39 | image_url = "http://example.com/example/example.jpg" 40 | storage_url = "http://s3.com/example/example.jpg" 41 | public_id = SecureRandom.hex 42 | s3_host = /.*\.s3\.amazonaws\.com/ 43 | 44 | stub_request(:put, s3_host).to_return(status: 404) 45 | 46 | cache = DownloadCache.new(image_url, public_id: public_id, preset_name: "primary") 47 | cache.save(storage_url: storage_url, image_url: image_url) 48 | cache.copy 49 | refute cache.copied? 50 | assert_requested :put, s3_host 51 | end 52 | end 53 | end 54 | end -------------------------------------------------------------------------------- /test/download_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class DownloadTest < Minitest::Test 5 | def test_should_download_valid_image 6 | url = "http://example.com/image.jpg" 7 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 8 | download = Download.download!(url, minimum_size: 8) 9 | assert download.valid? 10 | end 11 | 12 | def test_should_be_too_small 13 | url = "http://example.com/image.jpg" 14 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "1234567") 15 | download = Download.download!(url, minimum_size: 8) 16 | refute download.valid? 17 | end 18 | 19 | def test_should_ignore_size 20 | url = "http://example.com/image.jpg" 21 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "1") 22 | download = Download.download!(url, minimum_size: nil) 23 | assert download.valid? 24 | end 25 | 26 | def test_should_be_wrong_content_type 27 | url = "http://example.com/image.jpg" 28 | stub_request(:get, url).to_return(headers: {content_type: "text/html"}) 29 | download = Download.download!(url, minimum_size: nil) 30 | refute download.valid? 
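# Note (added): taken together, the tests above cover the two checks that make a
# Download valid here: the response must carry an image Content-Type and, when
# minimum_size is given, the body must be at least that many bytes.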
31 | end 32 | 33 | def test_should_persist_file 34 | url = "http://example.com/image.jpg" 35 | body = "body" 36 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: body) 37 | download = Download.download!(url) 38 | path = download.path 39 | download.persist! 40 | refute path == download.path 41 | FileUtils.rm download.path 42 | end 43 | end 44 | end 45 | end -------------------------------------------------------------------------------- /test/feed_downloader_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedDownloaderTest < Minitest::Test 6 | def setup 7 | flush 8 | end 9 | 10 | def test_should_schedule_feed_parser 11 | url = "http://example.com/atom.xml" 12 | stub_request_file("atom.xml", url) 13 | 14 | assert_equal 0, Sidekiq::Queues["feed_parser_#{Socket.gethostname}"].size 15 | FeedDownloader.new.perform(1, url, 10) 16 | assert_equal 1, Sidekiq::Queues["feed_parser_#{Socket.gethostname}"].size 17 | 18 | FeedDownloader.new.perform(1, url, 10) 19 | assert_equal 1, Sidekiq::Queues["feed_parser_#{Socket.gethostname}"].size 20 | end 21 | 22 | def test_should_schedule_critical_feed_parser 23 | url = "http://example.com/atom.xml" 24 | stub_request_file("atom.xml", url) 25 | 26 | assert_equal 0, Sidekiq::Queues["feed_parser_critical_#{Socket.gethostname}"].size 27 | FeedDownloaderCritical.new.perform(1, url, 10) 28 | assert_equal 1, Sidekiq::Queues["feed_parser_critical_#{Socket.gethostname}"].size 29 | end 30 | 31 | def test_should_send_user_agent 32 | url = "http://example.com/atom.xml" 33 | stub_request_file("atom.xml", url).with(headers: {"User-Agent" => "Feedbin feed-id:1 - 10 subscribers"}) 34 | FeedDownloader.new.perform(1, url, 10) 35 | end 36 | 37 | def test_should_send_authorization 38 | username = "username" 39 | password = "password" 40 | url = "http://#{username}:#{password}@example.com/atom.xml" 41 | 42 | stub_request(:get, "http://example.com/atom.xml").with(headers: {"Authorization" => "Basic #{Base64.strict_encode64("#{username}:#{password}")}"}) 43 | FeedDownloader.new.perform(1, url, 10) 44 | end 45 | 46 | def test_should_use_saved_redirect 47 | feed_id = 1 48 | url_one = "http://example.com/one" 49 | url_two = "http://example.com/two" 50 | 51 | redirect_cache = RedirectCache.new(feed_id) 52 | Cache.write(redirect_cache.stable_key, {to: url_two}) 53 | 54 | stub_request(:get, url_two) 55 | FeedDownloader.new.perform(feed_id, url_one, 10) 56 | end 57 | 58 | def test_should_use_saved_redirect_with_basic_auth 59 | feed_id = 1 60 | username = "username" 61 | password = "password" 62 | url_one = "http://#{username}:#{password}@example.com/one" 63 | url_two = "http://example.com/two" 64 | 65 | redirect_cache = RedirectCache.new(feed_id) 66 | Cache.write(redirect_cache.stable_key, {to: url_two}) 67 | 68 | stub_request(:get, url_two).with(headers: {"Authorization" => "Basic #{Base64.strict_encode64("#{username}:#{password}")}"}) 69 | FeedDownloader.new.perform(feed_id, url_one, 10) 70 | end 71 | 72 | def test_should_do_nothing_if_not_modified 73 | feed_id = 1 74 | etag = "etag" 75 | last_modified = "last_modified" 76 | Cache.write("refresher_http_#{feed_id}", { 77 | etag: etag, 78 | last_modified: last_modified, 79 | checksum: nil 80 | }) 81 | 82 | url = "http://example.com/atom.xml" 83 | stub_request(:get, url).with(headers: {"If-None-Match" => etag, "If-Modified-Since" => last_modified}).to_return(status: 304) 84 | 
FeedDownloader.new.perform(feed_id, url, 10) 85 | assert_equal 0, Sidekiq::Queues["feed_parser_critical_#{Socket.gethostname}"].size 86 | end 87 | 88 | def test_should_not_be_ok_after_error 89 | feed_id = 1 90 | 91 | url = "http://example.com/atom.xml" 92 | stub_request(:get, url).to_return(status: 429) 93 | 94 | FeedDownloader.new.perform(feed_id, url, 10) 95 | 96 | refute FeedStatus.new(feed_id).ok?, "Should not be ok?" 97 | end 98 | 99 | def test_should_follow_redirects 100 | first_url = "http://www.example.com" 101 | last_url = "#{first_url}/final" 102 | 103 | response = { 104 | status: 301, 105 | headers: { 106 | "Location" => "/final" 107 | } 108 | } 109 | stub_request(:get, first_url).to_return(response) 110 | stub_request(:get, last_url) 111 | 112 | FeedDownloader.new.perform(1, first_url, 10) 113 | end 114 | end 115 | end 116 | end -------------------------------------------------------------------------------- /test/feed_status_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedStatusTest < Minitest::Test 6 | 7 | def setup 8 | flush 9 | end 10 | 11 | def test_should_not_be_ok 12 | feed_id = 1 13 | FeedStatus.new(feed_id).error!(Exception.new) 14 | refute FeedStatus.new(feed_id).ok?, "ok? should be false." 15 | end 16 | 17 | def test_should_be_ok 18 | feed_id = 1 19 | FeedStatus.new(feed_id).error!(Exception.new) 20 | FeedStatus.clear!(feed_id) 21 | assert FeedStatus.new(feed_id).ok?, "ok? should be true." 22 | end 23 | 24 | def test_should_get_count 25 | feed_id = 1 26 | FeedStatus.new(feed_id).error!(Exception.new) 27 | FeedStatus.new(feed_id).error!(Exception.new) 28 | assert_equal(2, FeedStatus.new(feed_id).count) 29 | end 30 | 31 | def test_should_be_ok_after_timeout 32 | feed_id = 1 33 | 34 | FeedStatus.new(feed_id).error!(Exception.new) 35 | 36 | one_hour = 60 * 60 37 | one_hour_from_now = Time.now.to_i + one_hour 38 | two_hours_ago = Time.now.to_i - one_hour - one_hour 39 | 40 | feed_status = FeedStatus.new(feed_id) 41 | 42 | assert feed_status.next_retry > one_hour_from_now 43 | 44 | Cache.write(feed_status.cache_key, { failed_at: two_hours_ago }) 45 | 46 | assert FeedStatus.new(feed_id).ok?, "Status should be ok after rewinding failed_at" 47 | end 48 | 49 | def test_should_save_last_download 50 | feed_id = 1 51 | now = Time.now.to_i 52 | FeedStatus.new(1).log_download! 53 | difference = FeedStatus.new(1).downloaded_at - now 54 | assert difference <= 1 55 | end 56 | end 57 | end 58 | end -------------------------------------------------------------------------------- /test/feed_status_update_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedStatusUpdateTest < Minitest::Test 6 | 7 | def setup 8 | flush 9 | end 10 | 11 | def test_should_clear_status 12 | feed_id = 1 13 | FeedStatus.new(feed_id).error!(Exception.new) 14 | refute FeedStatus.new(feed_id).ok?, "ok? should be false." 15 | FeedStatusUpdate.new.perform(feed_id) 16 | assert FeedStatus.new(feed_id).ok?, "ok? should be true." 
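# Note (added): calling FeedStatusUpdate#perform without an exception argument
# clears the stored error via FeedStatus.clear!; passing a formatted exception
# records a new failure instead, which the next test exercises.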
17 | end 18 | 19 | def test_should_record_error 20 | feed_id = 1 21 | exception = Exception.new 22 | formatted_exception = JSON.dump({date: Time.now.to_i, class: exception.class, message: exception.message, status: nil}) 23 | FeedStatusUpdate.new.perform(feed_id, formatted_exception) 24 | status = FeedStatus.new(feed_id) 25 | refute status.ok?, "ok? should be false." 26 | assert_equal exception.class.name, status.attempt_log.first["class"] 27 | end 28 | 29 | end 30 | end 31 | end -------------------------------------------------------------------------------- /test/feed_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedTest < Minitest::Test 6 | 7 | def setup 8 | flush 9 | end 10 | 11 | def test_should_be_ok 12 | feed_id = 1 13 | feed = Feed.new(feed_id) 14 | feed.download_error(Exception.new) 15 | 16 | feed = Feed.new(feed_id) 17 | feed.download_success 18 | 19 | feed = Feed.new(feed_id) 20 | assert feed.ok? 21 | end 22 | 23 | def test_should_not_be_ok 24 | feed_id = 1 25 | feed = Feed.new(feed_id) 26 | feed.download_error(Feedkit::NotFeed.new) 27 | 28 | feed = Feed.new(feed_id) 29 | feed.download_success 30 | 31 | feed = Feed.new(feed_id) 32 | assert_equal("Feedkit::NotFeed", feed.last_error["class"]) 33 | refute feed.ok? 34 | end 35 | end 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /test/image_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class ImageTest < Minitest::Test 5 | def test_should_get_image_size 6 | file = File.expand_path("support/www/image.jpeg", __dir__) 7 | image = ImageProcessor.new(file, target_width: 542, target_height: 304) 8 | assert_equal(image.width, 640) 9 | assert_equal(image.height, 828) 10 | assert_equal(542, image.resized.width) 11 | assert_equal(701, image.resized.height) 12 | end 13 | 14 | def test_should_get_face_location 15 | file = support_file("image.jpeg") 16 | image = ImageProcessor.new(file, target_width: 542, target_height: 304) 17 | 18 | assert_equal(462, image.average_face_position("y", File.new(file))) 19 | end 20 | 21 | def test_should_crop 22 | file = File.expand_path("support/www/image.jpeg", __dir__) 23 | image = ImageProcessor.new(file, target_width: 542, target_height: 304) 24 | cropped_path = image.smart_crop 25 | assert cropped_path.include?(".jpg") 26 | FileUtils.rm cropped_path 27 | end 28 | 29 | def test_should_return_same_size_image 30 | file = File.expand_path("support/www/image.jpeg", __dir__) 31 | image = ImageProcessor.new(file, target_width: 640, target_height: 828) 32 | cropped_path = image.smart_crop 33 | assert cropped_path.include?(".jpg") 34 | end 35 | end 36 | end 37 | end -------------------------------------------------------------------------------- /test/jobs/find_image_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | class FindImageTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_copy_image 10 | image_url = "https://i.ytimg.com/vi/id/maxresdefault.jpg" 11 | original_url = "https://www.youtube.com/watch?v=id" 12 | 13 | stub_request_file("image.jpeg", image_url, headers: {content_type: "image/jpeg"}) 14 | stub_request(:put, /.*\.s3\.amazonaws\.com/).to_return(status: 200, 
body: aws_copy_body) 15 | 16 | Sidekiq::Testing.inline! do 17 | FindImage.perform_async(SecureRandom.hex, "primary", [original_url]) 18 | end 19 | 20 | FindImage.new.perform(SecureRandom.hex, "primary", [original_url]) 21 | assert_equal(image_url, EntryImage.jobs.first["args"][1]["original_url"]) 22 | end 23 | 24 | def test_should_process_an_image 25 | image_url = "http://example.com/image.jpg" 26 | page_url = "http://example.com/article" 27 | urls = [image_url] 28 | 29 | stub_request_file("html.html", page_url) 30 | stub_request_file("image.jpeg", image_url, headers: {content_type: "image/jpeg"}) 31 | 32 | stub_request(:get, "http://example.com/image/og_image.jpg").to_return(status: 404) 33 | stub_request(:get, "http://example.com/image/twitter_image.jpg").to_return(status: 404) 34 | 35 | stub_request(:put, /.*\.s3\.amazonaws\.com/).to_return(status: 200, body: aws_copy_body) 36 | 37 | Sidekiq::Testing.inline! do 38 | FindImage.perform_async(SecureRandom.hex, "primary", urls, page_url) 39 | end 40 | 41 | assert_requested :get, "http://example.com/image/og_image.jpg" 42 | assert_requested :get, "http://example.com/image/twitter_image.jpg" 43 | 44 | assert_equal 0, EntryImage.jobs.size 45 | FindImage.new.perform(SecureRandom.hex, "primary", urls, nil) 46 | assert_equal 1, EntryImage.jobs.size 47 | end 48 | 49 | def test_should_enqueue_recognized_image 50 | url = "https://i.ytimg.com/vi/id/maxresdefault.jpg" 51 | image_url = "http://example.com/image.jpg" 52 | 53 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: ("lorem " * 3_500)) 54 | 55 | assert_equal 0, ProcessImage.jobs.size 56 | FindImage.new.perform(SecureRandom.hex, "primary", [image_url], "https://www.youtube.com/watch?v=id") 57 | assert_equal 1, ProcessImage.jobs.size 58 | 59 | effective_image_url = ProcessImage.jobs.first["args"][4] 60 | 61 | assert_equal(url, effective_image_url) 62 | 63 | assert_requested :get, url 64 | refute_requested :get, image_url 65 | end 66 | 67 | def test_should_try_all_urls 68 | urls = [ 69 | "http://example.com/image_1.jpg", 70 | "http://example.com/image_2.jpg", 71 | "http://example.com/image_3.jpg" 72 | ] 73 | 74 | urls.each do |url| 75 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: ("lorem " * 3_500)) 76 | end 77 | 78 | Sidekiq::Testing.inline! 
do 79 | FindImage.perform_async(SecureRandom.hex, "primary", urls, nil) 80 | end 81 | 82 | assert_requested :get, urls[0] 83 | assert_requested :get, urls[1] 84 | assert_requested :get, urls[2] 85 | end 86 | end 87 | end 88 | end -------------------------------------------------------------------------------- /test/jobs/process_image_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | class ProcessImageTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_enqueue_upload 10 | public_id = SecureRandom.hex 11 | path = support_file("image.jpeg") 12 | url = "http://example.com/image.jpg" 13 | 14 | assert_equal 0, UploadImage.jobs.size 15 | ProcessImage.new.perform(public_id, "primary", path, url, url, []) 16 | assert_equal 1, UploadImage.jobs.size 17 | 18 | assert_equal(public_id, UploadImage.jobs.first["args"].first) 19 | assert_equal(url, UploadImage.jobs.first["args"].last) 20 | end 21 | 22 | def test_should_enqueue_find 23 | public_id = SecureRandom.hex 24 | path = Tempfile.new.path 25 | url = "http://example.com/image.jpg" 26 | all_urls = ["http://example.com/image_2.jpg", "http://example.com/image_3.jpg"] 27 | 28 | assert_equal 0, FindImageCritical.jobs.size 29 | ProcessImage.new.perform(public_id, "primary", path, url, url, all_urls) 30 | assert_equal 1, FindImageCritical.jobs.size 31 | 32 | assert_equal([public_id, "primary", all_urls], FindImageCritical.jobs.first["args"]) 33 | end 34 | end 35 | end 36 | end -------------------------------------------------------------------------------- /test/jobs/upload_image_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | class UploadImageTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_upload 10 | public_id = SecureRandom.hex 11 | path = support_file("image.jpeg") 12 | url = "http://example.com/image.jpg" 13 | 14 | stub_request(:put, /.*\.s3\.amazonaws\.com/) 15 | 16 | assert_equal 0, EntryImage.jobs.size 17 | UploadImage.new.perform(public_id, "primary", path, url, url) 18 | assert_equal 1, EntryImage.jobs.size 19 | 20 | download_cache = DownloadCache.new(url, public_id: public_id, preset_name: "primary") 21 | assert_equal("https:", download_cache.storage_url) 22 | end 23 | end 24 | end 25 | end -------------------------------------------------------------------------------- /test/meta_images_cache_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class MetaImagesCacheTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_save_urls 10 | urls = ["one", "two"] 11 | cache = MetaImagesCache.new(Addressable::URI.parse("http://example.com/article")) 12 | cache.save({checked: true, urls: urls}) 13 | 14 | assert_equal(urls, cache.urls) 15 | end 16 | 17 | def test_should_save_checked_status 18 | cache = MetaImagesCache.new(Addressable::URI.parse("http://example.com/article")) 19 | refute cache.checked? 20 | 21 | cache.save({checked: true, urls: []}) 22 | assert cache.checked? 23 | end 24 | 25 | def test_should_save_meta_presence 26 | cache = MetaImagesCache.new(Addressable::URI.parse("http://example.com/article")) 27 | assert cache.has_meta? 28 | 29 | cache.has_meta!(false) 30 | refute cache.has_meta? 
31 | 32 | cache.has_meta!(true) 33 | assert cache.has_meta? 34 | end 35 | end 36 | end 37 | end -------------------------------------------------------------------------------- /test/meta_images_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class MetaImagesTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_find_urls 10 | url = "http://example.com/" 11 | stub_request_file("html.html", url) 12 | urls = MetaImages.find_urls(url) 13 | assert urls.length == 2 14 | assert urls.map(&:to_s).include?("http://example.com/image/og_image.jpg") 15 | assert urls.map(&:to_s).include?("http://example.com/image/twitter_image.jpg") 16 | end 17 | 18 | def test_should_not_download_file 19 | url = "http://example.com/" 20 | stub_request(:get, url).to_return(status: 404) 21 | urls = MetaImages.find_urls(url) 22 | assert urls.empty? 23 | end 24 | 25 | def test_should_be_invalid_url 26 | url = "http://invalid\\.com" 27 | assert_raises(Addressable::URI::InvalidURIError) do 28 | MetaImages.new(url).find_urls 29 | end 30 | end 31 | 32 | def test_should_be_invalid_no_host 33 | url = "invalid" 34 | assert_raises(Addressable::URI::InvalidURIError) do 35 | MetaImages.new(url).find_urls 36 | end 37 | end 38 | 39 | def test_should_determine_download_status 40 | url = "http://example.com/" 41 | stub_request_file("html.html", url) 42 | urls = MetaImages.new(url) 43 | assert urls.needs_download? 44 | urls.find_urls 45 | 46 | urls = MetaImages.new(url) 47 | assert !urls.needs_download? 48 | end 49 | 50 | def test_should_not_download_from_site_with_no_meta 51 | url = Addressable::URI.parse("http://example.com/article") 52 | cache = MetaImagesCache.new(url) 53 | cache.has_meta!(false) 54 | 55 | urls = MetaImages.new(url) 56 | assert !urls.needs_download? 
57 | end 58 | end 59 | end 60 | end -------------------------------------------------------------------------------- /test/redirect_cache_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class RedirectCacheTest < Minitest::Test 6 | 7 | def setup 8 | flush 9 | end 10 | 11 | def test_should_collapse_stable_redirects 12 | feed_id = 2 13 | 14 | redirect1 = Redirect.new(feed_id, status: 301, from: "http://example.com", to: "http://example.com/second") 15 | redirect2 = Redirect.new(feed_id, status: 301, from: "http://example.com/second", to: "http://example.com/third") 16 | redirect3 = Redirect.new(feed_id, status: 301, from: "http://example.com/third", to: "http://example.com/final") 17 | 18 | (RedirectCache::PERSIST_AFTER).times do 19 | RedirectCache.new(feed_id).save([redirect1, redirect2]) 20 | end 21 | 22 | assert_nil RedirectCache.new(feed_id).read 23 | 24 | RedirectCache.new(feed_id).save([redirect1, redirect2]) 25 | 26 | assert_equal(redirect2.to, RedirectCache.new(feed_id).read) 27 | 28 | (RedirectCache::PERSIST_AFTER + 1).times do 29 | RedirectCache.new(feed_id).save([redirect2, redirect3]) 30 | end 31 | 32 | assert_equal(redirect3.to, RedirectCache.new(feed_id).read) 33 | end 34 | 35 | def test_should_not_temporary_redirects 36 | redirect1 = Redirect.new(1, status: 302, from: "http://example.com", to: "http://example.com/second") 37 | assert_nil RedirectCache.new(1).save([redirect1]) 38 | end 39 | 40 | def test_should_not_save_empty_redirects 41 | assert_nil RedirectCache.new(1).save([]) 42 | end 43 | end 44 | end 45 | end -------------------------------------------------------------------------------- /test/redirect_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class RedirectTest < Minitest::Test 6 | def test_should_have_cache_key 7 | redirect = Redirect.new(1, status: 301, from: "http://example.com", to: "http://example.com/final") 8 | assert_equal("3981c0f11e525f3f0f4498a238f448957ff1929c", redirect.cache_key) 9 | end 10 | end 11 | end 12 | end -------------------------------------------------------------------------------- /test/support/www/atom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Feedbin 4 | A fast, simple RSS feed reader that delivers a great reading experience. 5 | 6 | 7 | 8 | 2016-06-03T13:25:02-05:00 9 | https://feedbin.com 10 | 11 | Feedbin 12 | support@feedbin.com 13 | 14 | 15 | 16 | 17 | Subscribe to Email Newsletters in Feedbin 18 | 19 | Ben Ubois 20 | 21 | 22 | 23 | 2016-02-03T15:37:25-06:00 24 | 25 | 26 | 2016-02-03T00:00:00-06:00 27 | 28 | /2016/02/03/subscribe-to-email-newsletters-in-feedbin 29 | You can now receive email newsletters in Feedbin. Newsletter Subscriptions To use this feature, go to the settings page and find your secret Feedbin email address. Use this email address whenever you sign up for an email newsletter. Anything sent to it will show up as a feed in Feedbin,... 
30 | <p>You can now receive email newsletters in Feedbin.</p> 31 | 32 | <figure> 33 | <a href="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2016-02-03/email-large-ce9a0b27d895b4cb89429bb1d5773e0d1a394ef225dab5c0ced8c4419ed05bae.png"><img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2016-02-03/email-large-ce9a0b27d895b4cb89429bb1d5773e0d1a394ef225dab5c0ced8c4419ed05bae.png" /></a> 34 | <figcaption>Newsletter Subscriptions</figcaption> 35 | </figure> 36 | 37 | <p>To use this feature, go to the <a href="https://feedbin.com/settings">settings</a> page and find your secret Feedbin email address. Use this email address whenever you sign up for an email newsletter. Anything sent to it will show up as a feed in Feedbin, grouped by sender.</p> 38 | 39 | <p>Reading email in an email app feels like work to me. However, there’s a certain class of email that I <em>want</em> to enjoy reading, and Feedbin is where I go when I want to read for pleasure.</p> 40 | 41 | <p>For example, many great websites offer subscription content, usually with an email newsletter component. Not only do I enjoy the premium content from these sites, but I believe this a great way forward for people to support writers. Personally I have paid subscriptions to four of these including:</p> 42 | 43 | <ul> 44 | <li><a href="https://www.macstories.net">MacStories</a></li> 45 | <li><a href="http://www.aboveavalon.com">Above Avalon</a></li> 46 | <li><a href="https://stratechery.com">Stratechery</a></li> 47 | <li><a href="https://sixcolors.com">Six Colors</a></li> 48 | </ul> 49 | 50 | <p>This feature is also great for mailing lists and product announcement emails and since it’s just a regular feed, it will sync with your favorite native app as well.</p> 51 | 52 | 53 | 54 | 55 | Feedbin Notifier vs. Notify by Facebook 56 | 57 | Ben Ubois 58 | 59 | 60 | 61 | 2015-11-11T15:44:00-06:00 62 | 63 | 64 | 2015-11-11T00:00:00-06:00 65 | 66 | /2015/11/11/feedbin-notifier-vs-notify-by-facebook 67 | Today, Notify by Facebook was released. Conceptually, this is a similar app to Feedbin Notifier, which was released two days ago. The timing is coincidental but still interesting. Left: Feedbin Notifier. Right: Notify by Facebook. A similar coincidence happened when Feedbin launched, which is that two days later Google announced... 68 | <p>Today, <a href="https://notify.co/">Notify by Facebook</a> was released. Conceptually, this is a similar app to <a href="https://feedbin.com/notifier">Feedbin Notifier</a>, which was released <a href="https://feedbin.com/blog/2015/11/08/notifier/">two days ago</a>. The timing is coincidental but still interesting.</p> 69 | 70 | <figure> 71 | <a href="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-11-11/comparison_large-c56f942852b4f4aafba371de0aa2774be8f3888b696a2aadf4a122f97516bcca.png"><img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-11-11/comparison-12756a9e8953957f3167c3ee6ea6f33a8ec4bf2a742a6455af6fcb9d2c3e245e.png" /></a> 72 | <figcaption>Left: Feedbin Notifier. Right: Notify by Facebook.</figcaption> 73 | </figure> 74 | 75 | <p>A similar coincidence happened when <a href="https://feedbin.com/blog/2013/03/12/rediscover-rss/">Feedbin launched</a>, which is that two days later <a href="http://googlereader.blogspot.com/2013/03/powering-down-google-reader.html">Google announced they were shutting down Reader</a>. Weird timing.</p> 76 | 77 | <p>In my mind, Feedbin Notifier offers many advantages over Notify. 
The biggest reason to choose Feedbin Notifier is that it works with any source that offers an RSS feed, while Facebook has a limited number of built in sources.</p> 78 | 79 | <table> 80 | <thead> 81 | <tr> 82 | <th style="text-align: left">Feature</th> 83 | <th style="text-align: center">Feedbin Notifier</th> 84 | <th style="text-align: right">Notify by Facebook</th> 85 | </tr> 86 | </thead> 87 | <tbody> 88 | <tr> 89 | <td style="text-align: left">Sources</td> 90 | <td style="text-align: center">Anything with an RSS feed</td> 91 | <td style="text-align: right">72</td> 92 | </tr> 93 | <tr> 94 | <td style="text-align: left">Apple Watch App</td> 95 | <td style="text-align: center">Yes</td> 96 | <td style="text-align: right">No</td> 97 | </tr> 98 | <tr> 99 | <td style="text-align: left">Read offline</td> 100 | <td style="text-align: center">Yes</td> 101 | <td style="text-align: right">No</td> 102 | </tr> 103 | <tr> 104 | <td style="text-align: left">Spotlight Integration</td> 105 | <td style="text-align: center">Yes</td> 106 | <td style="text-align: right">No</td> 107 | </tr> 108 | <tr> 109 | <td style="text-align: left">Sync</td> 110 | <td style="text-align: center">Yes</td> 111 | <td style="text-align: right">No</td> 112 | </tr> 113 | <tr> 114 | <td style="text-align: left">Privacy</td> 115 | <td style="text-align: center">Yes</td> 116 | <td style="text-align: right">LOL</td> 117 | </tr> 118 | <tr> 119 | <td style="text-align: left">Price</td> 120 | <td style="text-align: center">$3/mo</td> 121 | <td style="text-align: right">Free</td> 122 | </tr> 123 | </tbody> 124 | </table> 125 | 126 | <p>This comparison is obviously biased. However, it is worth pointing out that Notify costs nothing, while Notifier only works with a paid Feedbin subscription.</p> 127 | 128 | <p>I think that having a large free competitor like this validates the idea. It also creates a market for a premium, more fully-featured version, which is what Feedbin Notifier is. Competing with Free is nothing new, it’s what Feedbin has been doing since day one.</p> 129 | 130 | 131 | 132 | 133 | Feedbin Notifier for iPhone, iPad and Apple Watch 134 | 135 | Ben Ubois 136 | 137 | 138 | 139 | 2015-11-09T12:14:00-06:00 140 | 141 | 142 | 2015-11-08T00:00:00-06:00 143 | 144 | /2015/11/08/notifier 145 | There are already many great full-featured apps that work with Feedbin. Feedbin Notifier aims to be different. Feedbin Notifier is a notifications based reader. The idea is to select the handful of feeds or keywords you care about most. Then when Feedbin matches an article, it will send a push... 146 | <p>There are already many great full-featured apps that work with Feedbin. <a href="/notifier">Feedbin Notifier</a> aims to be different.</p> 147 | 148 | <p><a href="https://itunes.apple.com/app/feedbin-notifier/id996164128?mt=8"><img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-11-09/notifier-7164c36f94df6fbe9cec47d9e8c4f4d2ee04460e28ea258e41d8f33ef843b83a.png" class="image-notifier" /></a></p> 149 | 150 | <p>Feedbin Notifier is a notifications based reader. The idea is to select the handful of feeds or keywords you care about most. Then when Feedbin matches an article, it will send a push notification to your iPhone, iPad or Apple Watch, keeping you informed throughout the day.</p> 151 | 152 | <p>This way, Notification Center becomes the primary interface for catching up on the stories that are important to you. 
You’ll see articles along side your email and other notifications allowing tell at a glance if there’s anything you want to read right away.</p> 153 | 154 | <p>Feedbin is a free universal app for iPhone, iPad and the Apple Watch. On the Apple Watch you can read full articles right away or for a better reading experience use Handoff to continue reading on your iPhone or iPad.</p> 155 | 156 | <p>Hope you enjoy it! Would love to <a href="https://twitter.com/feedbin">hear your feedback</a>.</p> 157 | 158 | 159 | 160 | 161 | Image Previews 162 | 163 | Ben Ubois 164 | 165 | 166 | 167 | 2015-10-22T00:00:00-05:00 168 | 169 | 170 | 2015-10-22T00:00:00-05:00 171 | 172 | /2015/10/22/image-previews 173 | Feedbin now features image previews in the center column. Feedbin, now with more images. The most important thing to me when building this feature was that only images that are reasonably high quality would show up here. To do this Feedbin uses a set of criteria that an image must... 174 | <p>Feedbin now features image previews in the center column.</p> 175 | 176 | <figure> 177 | <a href="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-10-22/screenshot_one_large-b08bfd8618a62a09fbad6b70ba28c973a70f1815a2e4fe00a78392b76f05720b.jpg"> 178 | <img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-10-22/screenshot_one_small-8e3180a61ba1fb72768ab7e21037dd431d40c07e64d146a24adfed06e6fa6158.jpg" /> 179 | </a> 180 | <figcaption>Feedbin, now with more images.</figcaption> 181 | </figure> 182 | 183 | <p>The most important thing to me when building this feature was that only images that are reasonably high quality would show up here. To do this Feedbin uses a set of criteria that an image must meet in order to be chosen.</p> 184 | 185 | <p>Feedbin can find a variety of images including:</p> 186 | 187 | <ul> 188 | <li>Images in the RSS post</li> 189 | <li>Poster frames from YouTube and Vimeo embeds</li> 190 | <li><a href="http://ogp.me/">Open Graph</a> and <a href="https://dev.twitter.com/cards/overview">Twitter Card</a> meta data</li> 191 | </ul> 192 | 193 | <p>One of my favorite operations that Feedbin does to ensure the quality of these image previews is facial detection using <a href="http://opencv.org/">OpenCV</a>. By getting a rough idea about where faces in an image might be, Feedbin is able to get a better crop. I first saw this idea used in <a href="http://blog.iconfactory.com/2015/06/twitterrifics-new-facial-recognition-keeps-faces-front-center/">Twitterrific</a> and loved the results.</p> 194 | 195 | <figure> 196 | <img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-10-22/crop_smart-cf421c97e5ff5181774b0655490144581705d8cd1fea64e9045a23814c72daa7.jpg" /> 197 | <figcaption>Crop with face detection. Sample image from <a href="https://thegreatdiscontent.com/interview/jim-riswold">The Great Discontent</a>.</figcaption> 198 | </figure> 199 | 200 | <figure> 201 | <img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-10-22/crop_naive-16be8a0d9b8f9edbfcc413049e68cc17354a18678b3adb6efd92787684e7ccac.jpg" /> 202 | <figcaption>Crop without face detection. 
Sample image from <a href="https://thegreatdiscontent.com/interview/jim-riswold">The Great Discontent</a>.</figcaption> 203 | </figure> 204 | 205 | <p>Adding images is a big visual change and if you prefer the old look you can turn off image previews in the <a href="https://feedbin.com/settings/appearance">Appearance settings</a>.</p> 206 | 207 | 208 | 209 | 210 | Link Opener for Chrome 211 | 212 | Ben Ubois 213 | 214 | 215 | 216 | 2015-03-09T00:00:00-05:00 217 | 218 | 219 | 2015-03-09T00:00:00-05:00 220 | 221 | /2015/03/09/link-opener-for-chrome 222 | This official extension restores the ability for Chrome users to open article links in a background tab using a configurable keyboard shortcut. Previously this was possible without an extension, however the Chrome team recently removed this feature. The default shortcut to open a link is option/alt + v. This can... 223 | <p>This <a href="https://chrome.google.com/webstore/detail/feedbin-link-opener/naflkhnfmneiigdcphekaemdmeajiand">official extension</a> restores the ability for Chrome users to open article links in a background tab using a configurable keyboard shortcut.</p> 224 | 225 | <p>Previously this was possible without an extension, however the Chrome team <a href="https://code.google.com/p/chromium/issues/detail?id=456910">recently removed this feature</a>.</p> 226 | 227 | <p>The default shortcut to open a link is <code class="highlighter-rouge">option/alt + v</code>. This can be customized in Chrome’s <a href="chrome://extensions/configureCommands">Keyboard Shortcuts for Extensions and Apps</a> page.</p> 228 | 229 | <p>The <a href="https://github.com/feedbin/feedbin-link-opener-chrome">extension is open source</a> and ideas for improvements are welcome.</p> 230 | 231 | <p>An extension for Safari is not necessary because WebKit still supports the browser API to create background tabs.</p> 232 | 233 | <p><strong>Update:</strong> Martijn van der Ven has created a <a href="https://github.com/Zegnat/feedbin-link-opener-firefox/releases/">FireFox version of this extension</a>. Thanks Martijn!</p> 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /test/support/www/html.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /test/support/www/image.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feedbin/crawler/9135bb3b506e07453db2002ceb8bb6491826990f/test/support/www/image.jpeg -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | require "minitest/autorun" 2 | require "webmock/minitest" 3 | 4 | unless ENV["CI"] 5 | socket = Socket.new(:INET, :STREAM, 0) 6 | socket.bind(Addrinfo.tcp("127.0.0.1", 0)) 7 | port = socket.local_address.ip_port 8 | socket.close 9 | 10 | ENV["REDIS_URL"] = "redis://localhost:%d" % port 11 | redis_test_instance = IO.popen("redis-server --port %d --save '' --appendonly no" % port) 12 | 13 | Minitest.after_run do 14 | Process.kill("INT", redis_test_instance.pid) 15 | end 16 | end 17 | 18 | require "sidekiq/testing" 19 | Sidekiq::Testing.fake! 
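# Note (added): Sidekiq::Testing.fake! routes perform_async calls to in-memory
# queues instead of Redis, so tests can assert on Worker.jobs and Sidekiq::Queues
# without anything actually running. Tests that need jobs executed wrap the call
# in Sidekiq::Testing.inline! { ... }, as the image crawler tests do. A hedged
# sketch with a hypothetical worker:
#
#   SomeWorker.perform_async(1)
#   SomeWorker.jobs.size # => 1, nothing has executed yet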
20 | Sidekiq.logger.level = Logger::WARN 21 | 22 | require_relative "../lib/image" 23 | require_relative "../lib/refresher" 24 | 25 | def flush 26 | Sidekiq::Worker.clear_all 27 | Sidekiq.redis do |redis| 28 | redis.flushdb 29 | end 30 | end 31 | 32 | def support_file(file_name) 33 | path = File.join Dir.tmpdir, SecureRandom.hex 34 | FileUtils.cp File.join("test/support/www", file_name), path 35 | path 36 | end 37 | 38 | def stub_request_file(file, url, options = {}) 39 | defaults = {body: File.new(support_file(file)), status: 200} 40 | stub_request(:get, url) 41 | .to_return(defaults.merge(options)) 42 | end 43 | 44 | def load_xml 45 | File.read("test/support/www/atom.xml") 46 | end 47 | 48 | def random_string 49 | (0...50).map { ("a".."z").to_a[rand(26)] }.join 50 | end 51 | 52 | def aws_copy_body 53 | <<~EOT 54 | <?xml version="1.0" encoding="UTF-8"?> 55 | <CopyObjectResult> 56 | <ETag>string</ETag> 57 | <LastModified>Tue, 02 Mar 2021 12:58:45 GMT</LastModified> 58 | </CopyObjectResult> 59 | EOT 60 | end 61 | 62 | class EntryImage 63 | include Sidekiq::Worker 64 | def perform(*args) 65 | end 66 | end 67 | -------------------------------------------------------------------------------- /test/throttle_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class ThrottleTest < Minitest::Test 6 | 7 | def setup 8 | flush 9 | end 10 | 11 | def test_throttled 12 | ENV["THROTTLED_HOSTS"] = "example.com" 13 | assert Throttle.throttled?("https://www.example.com", Time.now.to_i) 14 | assert_equal(false, Throttle.throttled?("https://www.example.com", Time.now.to_i - (Throttle::TIMEOUT * 2))) 15 | assert_equal(false, Throttle.throttled?("https://www.example.com", nil)) 16 | assert_equal(false, Throttle.throttled?("https://www.not-example.com", Time.now.to_i)) 17 | assert_equal(false, Throttle.throttled?(nil, nil)) 18 | end 19 | end 20 | end 21 | end -------------------------------------------------------------------------------- /test/timer_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class TimerTest < Minitest::Test 5 | def test_should_time_out 6 | time = 0.11 7 | timer = Timer.new(time) 8 | 9 | sleep(time) 10 | 11 | assert timer.expired? 12 | 13 | elapsed = time + 0.01 14 | assert_in_delta(elapsed, timer.elapsed, 0.05, "Around #{elapsed}s should have elapsed.") 15 | end 16 | 17 | end 18 | end 19 | end --------------------------------------------------------------------------------