├── .drone.yml ├── .env.test ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .ruby-version ├── Gemfile ├── Gemfile.lock ├── LICENSE.md ├── Procfile ├── README.md ├── Rakefile ├── bin ├── console ├── pigo_darwin_arm64 ├── pigo_darwin_x86_64 ├── pigo_linux_x86_64 └── rake ├── lib ├── crawler │ ├── image │ │ ├── cache.rb │ │ ├── cascade │ │ │ └── facefinder │ │ ├── download.rb │ │ ├── download │ │ │ ├── default.rb │ │ │ ├── instagram.rb │ │ │ ├── vimeo.rb │ │ │ └── youtube.rb │ │ ├── download_cache.rb │ │ ├── helpers.rb │ │ ├── image_processor.rb │ │ ├── initializers │ │ │ ├── constants.rb │ │ │ ├── down.rb │ │ │ ├── librato.rb │ │ │ ├── sidekiq.rb │ │ │ ├── storage.rb │ │ │ └── worker_stat.rb │ │ ├── jobs │ │ │ ├── find_image.rb │ │ │ ├── process_image.rb │ │ │ └── upload_image.rb │ │ ├── meta_images.rb │ │ ├── meta_images_cache.rb │ │ └── timer.rb │ └── refresher │ │ ├── cache.rb │ │ ├── feed.rb │ │ ├── feed_status.rb │ │ ├── http_cache.rb │ │ ├── initializers │ │ ├── redis.rb │ │ └── sidekiq.rb │ │ ├── jobs │ │ ├── feed_downloader.rb │ │ └── feed_status_update.rb │ │ ├── redirect_cache.rb │ │ └── throttle.rb ├── image.rb └── refresher.rb └── test ├── cache_test.rb ├── download ├── default.rb ├── instagram.rb ├── vimeo.rb └── youtube.rb ├── download_cache_test.rb ├── download_test.rb ├── feed_downloader_test.rb ├── feed_status_test.rb ├── feed_status_update_test.rb ├── feed_test.rb ├── image_test.rb ├── jobs ├── find_image_test.rb ├── process_image_test.rb └── upload_image_test.rb ├── meta_images_cache_test.rb ├── meta_images_test.rb ├── redirect_cache_test.rb ├── redirect_test.rb ├── support └── www │ ├── atom.xml │ ├── feed.json │ ├── html.html │ └── image.jpeg ├── test_helper.rb ├── throttle_test.rb └── timer_test.rb /.drone.yml: -------------------------------------------------------------------------------- 1 | kind: pipeline 2 | name: default 3 | 4 | steps: 5 | - name: test 6 | image: ubuntu:16.04 7 | commands: 8 | - apt-get update 9 | - apt-get install -y software-properties-common 10 | - apt-add-repository ppa:brightbox/ruby-ng 11 | - apt update 12 | - apt-get install -y ruby2.5 ruby2.5-dev build-essential curl git libidn11-dev libpq-dev libreadline-dev libxml2-dev libxslt1-dev libcurl4-openssl-dev libssl-dev zlib1g-dev libffi-dev redis-server 13 | - systemctl restart redis-server.service 14 | - gem install bundler -v "1.16.5" 15 | - bundle install --jobs=8 --retry=2 16 | - rake 17 | -------------------------------------------------------------------------------- /.env.test: -------------------------------------------------------------------------------- 1 | AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID 2 | AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY 3 | AWS_S3_BUCKET=s3-bucket 4 | FACEBOOK_ACCESS_TOKEN=FACEBOOK_ACCESS_TOKEN 5 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | os: [ubuntu-22.04] 11 | ruby: [3.1] 12 | 13 | runs-on: ${{ matrix.os }} 14 | 15 | services: 16 | redis: 17 | image: redis 18 | options: >- 19 | --health-cmd "redis-cli ping" 20 | --health-interval 10s 21 | --health-timeout 5s 22 | --health-retries 5 23 | ports: 24 | - 6379:6379 25 | 26 | steps: 27 | - uses: actions/checkout@v2 28 | 29 | - name: Install dependencies 30 | run: sudo apt-get --yes install libidn11-dev libvips 31 | 32 | - uses: 
ruby/setup-ruby@v1 33 | with: 34 | ruby-version: ${{ matrix.ruby }} 35 | bundler-cache: true 36 | 37 | - name: Run tests 38 | run: bundle exec rake -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | vendor/bundle 5 | tmp 6 | .env 7 | coverage/ 8 | log/dump.rdb -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | 3.1.2 -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | git_source(:github) { |name| "https://github.com/#{name}.git" } 3 | 4 | gem "down", github: "feedbin/down", branch: "normalize" 5 | gem "unf_ext" 6 | 7 | gem "sax-machine", github: "feedbin/sax-machine", branch: "feedbin" 8 | gem "feedjira", github: "feedbin/feedjira", branch: "f2" 9 | gem "http", github: "feedbin/http", branch: "feedbin" 10 | gem "feedkit", github: "feedbin/feedkit", branch: "master" 11 | 12 | gem "bundler" 13 | gem "addressable" 14 | gem "connection_pool" 15 | gem "dotenv" 16 | gem "fog-aws" 17 | gem "image_processing" 18 | gem "librato-metrics", "~> 1.6.2" 19 | gem "librato-rack" 20 | gem "mime-types" 21 | gem "nokogiri" 22 | gem "rake" 23 | gem "redis" 24 | gem "resolv" 25 | gem "ruby-vips" 26 | gem "sidekiq" 27 | 28 | group :development do 29 | gem "foreman" 30 | gem "standard" 31 | end 32 | 33 | group :test do 34 | gem "minitest" 35 | gem "webmock" 36 | end 37 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GIT 2 | remote: https://github.com/feedbin/down.git 3 | revision: df69f03b65453bbbabc99678c3f6a3a1c4d1a0b2 4 | branch: normalize 5 | specs: 6 | down (5.3.1) 7 | addressable (~> 2.8) 8 | 9 | GIT 10 | remote: https://github.com/feedbin/feedjira.git 11 | revision: da48b21c09604c797854e095504626ac25d0892a 12 | branch: f2 13 | specs: 14 | feedjira (2.1.2) 15 | loofah (>= 2.0) 16 | sax-machine (>= 1.0) 17 | 18 | GIT 19 | remote: https://github.com/feedbin/feedkit.git 20 | revision: 68aa2b996f98c9d61cc4bc69388c10a2431efe47 21 | branch: master 22 | specs: 23 | feedkit (0.1.0) 24 | addressable (~> 2.8.0) 25 | feedjira (~> 2.0) 26 | http (~> 4.4) 27 | rchardet (~> 1.8.0) 28 | twitter (~> 7.0) 29 | twitter-text (~> 3.1.0) 30 | 31 | GIT 32 | remote: https://github.com/feedbin/http.git 33 | revision: 334b230d2033177ebf535e8277e21408cd8fdfcf 34 | branch: feedbin 35 | specs: 36 | http (4.4.1) 37 | addressable (~> 2.3) 38 | http-cookie (~> 1.0) 39 | http-form_data (~> 2.2) 40 | http-parser (~> 1.2.0) 41 | 42 | GIT 43 | remote: https://github.com/feedbin/sax-machine.git 44 | revision: 80c6e3b9eb4ab0ac16a0eeaac13d10a713d10423 45 | branch: feedbin 46 | specs: 47 | sax-machine (1.3.2) 48 | 49 | GEM 50 | remote: https://rubygems.org/ 51 | specs: 52 | addressable (2.8.1) 53 | public_suffix (>= 2.0.2, < 6.0) 54 | aggregate (0.2.3) 55 | ast (2.4.2) 56 | buftok (0.2.0) 57 | builder (3.2.4) 58 | connection_pool (2.2.5) 59 | crack (0.4.5) 60 | rexml 61 | crass (1.0.6) 62 | domain_name (0.5.20190701) 63 | unf (>= 0.0.5, < 1.0.0) 64 | dotenv (2.8.1) 65 | equalizer (0.0.11) 66 | excon (0.92.4) 67 | faraday (1.10.2) 68 | faraday-em_http (~> 1.0) 69 | 
faraday-em_synchrony (~> 1.0) 70 | faraday-excon (~> 1.1) 71 | faraday-httpclient (~> 1.0) 72 | faraday-multipart (~> 1.0) 73 | faraday-net_http (~> 1.0) 74 | faraday-net_http_persistent (~> 1.0) 75 | faraday-patron (~> 1.0) 76 | faraday-rack (~> 1.0) 77 | faraday-retry (~> 1.0) 78 | ruby2_keywords (>= 0.0.4) 79 | faraday-em_http (1.0.0) 80 | faraday-em_synchrony (1.0.0) 81 | faraday-excon (1.1.0) 82 | faraday-httpclient (1.0.1) 83 | faraday-multipart (1.0.4) 84 | multipart-post (~> 2) 85 | faraday-net_http (1.0.1) 86 | faraday-net_http_persistent (1.2.0) 87 | faraday-patron (1.0.0) 88 | faraday-rack (1.0.0) 89 | faraday-retry (1.0.3) 90 | ffi (1.15.5) 91 | ffi-compiler (1.0.1) 92 | ffi (>= 1.0.0) 93 | rake 94 | fog-aws (3.15.0) 95 | fog-core (~> 2.1) 96 | fog-json (~> 1.1) 97 | fog-xml (~> 0.1) 98 | fog-core (2.3.0) 99 | builder 100 | excon (~> 0.71) 101 | formatador (>= 0.2, < 2.0) 102 | mime-types 103 | fog-json (1.2.0) 104 | fog-core 105 | multi_json (~> 1.10) 106 | fog-xml (0.1.4) 107 | fog-core 108 | nokogiri (>= 1.5.11, < 2.0.0) 109 | foreman (0.87.2) 110 | formatador (1.1.0) 111 | hashdiff (1.0.1) 112 | hetchy (1.0.0) 113 | http-cookie (1.0.5) 114 | domain_name (~> 0.5) 115 | http-form_data (2.3.0) 116 | http-parser (1.2.3) 117 | ffi-compiler (>= 1.0, < 2.0) 118 | http_parser.rb (0.6.0) 119 | idn-ruby (0.1.4) 120 | image_processing (1.12.2) 121 | mini_magick (>= 4.9.5, < 5) 122 | ruby-vips (>= 2.0.17, < 3) 123 | json (2.6.2) 124 | librato-metrics (1.6.2) 125 | aggregate (~> 0.2.2) 126 | faraday (>= 0.7, < 2.0) 127 | multi_json 128 | librato-rack (1.1.1) 129 | hetchy (~> 1.0) 130 | librato-metrics (~> 1.6) 131 | loofah (2.18.0) 132 | crass (~> 1.0.2) 133 | nokogiri (>= 1.5.9) 134 | memoizable (0.4.2) 135 | thread_safe (~> 0.3, >= 0.3.1) 136 | mime-types (3.4.1) 137 | mime-types-data (~> 3.2015) 138 | mime-types-data (3.2022.0105) 139 | mini_magick (4.11.0) 140 | mini_portile2 (2.8.0) 141 | minitest (5.16.3) 142 | multi_json (1.15.0) 143 | multipart-post (2.2.3) 144 | naught (1.1.0) 145 | nokogiri (1.13.8) 146 | mini_portile2 (~> 2.8.0) 147 | racc (~> 1.4) 148 | parallel (1.22.1) 149 | parser (3.1.2.1) 150 | ast (~> 2.4.1) 151 | public_suffix (5.0.0) 152 | racc (1.6.0) 153 | rack (2.2.4) 154 | rainbow (3.1.1) 155 | rake (13.0.6) 156 | rchardet (1.8.0) 157 | redis (5.0.4) 158 | redis-client (>= 0.7.4) 159 | redis-client (0.8.0) 160 | connection_pool 161 | regexp_parser (2.5.0) 162 | resolv (0.2.1) 163 | rexml (3.2.5) 164 | rubocop (1.35.1) 165 | json (~> 2.3) 166 | parallel (~> 1.10) 167 | parser (>= 3.1.2.1) 168 | rainbow (>= 2.2.2, < 4.0) 169 | regexp_parser (>= 1.8, < 3.0) 170 | rexml (>= 3.2.5, < 4.0) 171 | rubocop-ast (>= 1.20.1, < 2.0) 172 | ruby-progressbar (~> 1.7) 173 | unicode-display_width (>= 1.4.0, < 3.0) 174 | rubocop-ast (1.21.0) 175 | parser (>= 3.1.1.0) 176 | rubocop-performance (1.14.3) 177 | rubocop (>= 1.7.0, < 2.0) 178 | rubocop-ast (>= 0.4.0) 179 | ruby-progressbar (1.11.0) 180 | ruby-vips (2.1.4) 181 | ffi (~> 1.12) 182 | ruby2_keywords (0.0.5) 183 | sidekiq (6.5.5) 184 | connection_pool (>= 2.2.2) 185 | rack (~> 2.0) 186 | redis (>= 4.5.0) 187 | simple_oauth (0.3.1) 188 | standard (1.16.1) 189 | rubocop (= 1.35.1) 190 | rubocop-performance (= 1.14.3) 191 | thread_safe (0.3.6) 192 | twitter (7.0.0) 193 | addressable (~> 2.3) 194 | buftok (~> 0.2.0) 195 | equalizer (~> 0.0.11) 196 | http (~> 4.0) 197 | http-form_data (~> 2.0) 198 | http_parser.rb (~> 0.6.0) 199 | memoizable (~> 0.4.0) 200 | multipart-post (~> 2.0) 201 | naught (~> 1.0) 202 | simple_oauth (~> 
0.3.0) 203 | twitter-text (3.1.0) 204 | idn-ruby 205 | unf (~> 0.1.0) 206 | unf (0.1.4) 207 | unf_ext 208 | unf_ext (0.0.8.2) 209 | unicode-display_width (2.2.0) 210 | webmock (3.18.1) 211 | addressable (>= 2.8.0) 212 | crack (>= 0.3.2) 213 | hashdiff (>= 0.4.0, < 2.0.0) 214 | 215 | PLATFORMS 216 | ruby 217 | 218 | DEPENDENCIES 219 | addressable 220 | bundler 221 | connection_pool 222 | dotenv 223 | down! 224 | feedjira! 225 | feedkit! 226 | fog-aws 227 | foreman 228 | http! 229 | image_processing 230 | librato-metrics (~> 1.6.2) 231 | librato-rack 232 | mime-types 233 | minitest 234 | nokogiri 235 | rake 236 | redis 237 | resolv 238 | ruby-vips 239 | sax-machine! 240 | sidekiq 241 | standard 242 | unf_ext 243 | webmock 244 | 245 | BUNDLED WITH 246 | 2.3.21 247 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License 2 | =============== 3 | 4 | Copyright 2013 [Ben Ubois](mailto:ben@feedbin.com) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 11 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | crawler_images_parallel: bundle exec sidekiq --concurrency 4 --queue image_parallel_critical,2 --queue image_parallel -q image_parallel_$HOSTNAME --require ./lib/image.rb 2 | crawler_images_serial: bundle exec sidekiq --concurrency 1 --queue image_serial_critical_$HOSTNAME,2 --queue image_serial_$HOSTNAME --require ./lib/image.rb 3 | crawler_feeds_parallel: bundle exec sidekiq --concurrency 40 --queue feed_downloader_critical,2 --queue feed_downloader --require ./lib/refresher.rb 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ‼️ Crawler functionality has been merged into the main Feedbin app. 3 | ``` 4 | 5 | Crawler 6 | ========= 7 | 8 | Crawler is a service meant to be run in combination with [Feedbin](https://github.com/feedbin/feedbin). Crawler refreshes feeds and processes image thumbnails. It is a separate service so it can be as lightweight and parallel as possible. 
9 | 10 | ### Requirements 11 | 12 | * libvips 8.6+ 13 | * Ruby 3.1 14 | * An AWS S3 bucket 15 | * Redis shared with the main Feedbin instance 16 | 17 | ### Environment variables 18 | 19 | * `AWS_ACCESS_KEY_ID` - Your AWS access key ID 20 | * `AWS_SECRET_ACCESS_KEY` - Your AWS secret access key 21 | * `AWS_S3_BUCKET_IMAGES` (or `AWS_S3_BUCKET` if not set) - The bucket to upload the thumbnails to 22 | * `REDIS_URL` - The URL of the Redis instance used by the main Feedbin instance 23 | * `FACEBOOK_ACCESS_TOKEN` - Needed to access Instagram images 24 | 25 | Optional variables; you might need these for non-AWS providers: 26 | 27 | * `AWS_S3_REGION` - The AWS region of your bucket 28 | * `AWS_S3_HOST` - The domain of your endpoint 29 | * `AWS_S3_ENDPOINT` - Same as `AWS_S3_HOST`, but including the scheme and port 30 | * `AWS_S3_PATH_STYLE` - Needs to be set to `true` for Minio 31 | 32 | You can also use Minio or another S3-compatible service by editing the parameters in [lib/crawler/image/initializers/storage.rb](lib/crawler/image/initializers/storage.rb). The Minio cookbook has [an example](https://github.com/minio/cookbook/blob/master/docs/fog-aws-for-ruby-with-minio.md) with the necessary parameters. 33 | 34 | ### Setup 35 | Clone the repo and install dependencies: 36 | ``` 37 | git clone https://github.com/feedbin/crawler.git 38 | cd crawler 39 | bundle 40 | ``` 41 | 42 | Start the processes with `bundle exec foreman start`. 43 | 44 | You may need to adjust the `ENTRY_IMAGE_HOST` environment variable of the main Feedbin instance if you want to use a reverse proxy in front of S3 or an alternative file server. The variable replaces the hostname clients use to fetch the images, but the path can't be changed. 45 | 46 | Crawler needs access to the same Redis instance as the main Feedbin instance (`REDIS_URL` environment variable). 47 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "rake/testtask" 2 | 3 | path = __dir__ 4 | 5 | Rake::TestTask.new(:test) do |test| 6 | test.libs = [] 7 | test.ruby_opts = ["-W1"] 8 | test.pattern = "test/**/*_test.rb" 9 | end 10 | 11 | task default: :test 12 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require_relative "../lib/image" 5 | require_relative "../lib/refresher" 6 | 7 | # You can add fixtures and/or initialization code here to make experimenting 8 | # with your gem easier. You can also use a different console, if you like. 9 | 10 | # (If you use this, don't forget to add pry to your Gemfile!) 
11 | # require "pry" 12 | # Pry.start 13 | 14 | require "irb" 15 | IRB.start(__FILE__) 16 | -------------------------------------------------------------------------------- /bin/pigo_darwin_arm64: -------------------------------------------------------------------------------- 1 | pigo_darwin_x86_64 -------------------------------------------------------------------------------- /bin/pigo_darwin_x86_64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feedbin/crawler/9135bb3b506e07453db2002ceb8bb6491826990f/bin/pigo_darwin_x86_64 -------------------------------------------------------------------------------- /bin/pigo_linux_x86_64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feedbin/crawler/9135bb3b506e07453db2002ceb8bb6491826990f/bin/pigo_linux_x86_64 -------------------------------------------------------------------------------- /bin/rake: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | bundle exec rake -------------------------------------------------------------------------------- /lib/crawler/image/cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Cache 6 | def self.read(*args) 7 | new.read(*args) 8 | end 9 | 10 | def self.delete(*args) 11 | new.delete(*args) 12 | end 13 | 14 | def self.increment(key, **args) 15 | new.increment(key, **args) 16 | end 17 | 18 | def self.count(*args) 19 | new.count(*args) 20 | end 21 | 22 | def self.write(key, value, **args) 23 | new.write(key, value, **args) 24 | end 25 | 26 | def read(key) 27 | @read ||= begin 28 | value = Sidekiq.redis do |redis| 29 | redis.get key 30 | end 31 | JSON.load(value)&.transform_keys(&:to_sym) || {} 32 | end 33 | end 34 | 35 | def write(key, values, options: {}) 36 | values = values.compact 37 | unless values.empty? 
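# Skip the Redis write when nothing remains after compacting; the expiry below is still applied when :expires_in is given.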
38 | Sidekiq.redis do |redis| 39 | redis.set(key, JSON.dump(values)) 40 | end 41 | end 42 | write_key_expiry(key, options) 43 | end 44 | 45 | def delete(*keys) 46 | Sidekiq.redis { |redis| redis.unlink(*keys) } 47 | end 48 | 49 | def increment(key, options: {}) 50 | count = Sidekiq.redis { |redis| redis.incr(key) } 51 | write_key_expiry(key, options) 52 | count 53 | end 54 | 55 | def count(key) 56 | Sidekiq.redis { |redis| redis.get(key) }.to_i 57 | end 58 | 59 | def write_key_expiry(key, options) 60 | if options[:expires_in] 61 | Sidekiq.redis do |redis| 62 | redis.expire key, options[:expires_in] 63 | end 64 | end 65 | end 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /lib/crawler/image/cascade/facefinder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feedbin/crawler/9135bb3b506e07453db2002ceb8bb6491826990f/lib/crawler/image/cascade/facefinder -------------------------------------------------------------------------------- /lib/crawler/image/download.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Download 6 | attr_reader :path 7 | 8 | def initialize(url, minimum_size: 20_000) 9 | @url = url 10 | @valid = false 11 | @minimum_size = minimum_size 12 | end 13 | 14 | def self.download!(url, **args) 15 | klass = find_download_provider(url) || Download::Default 16 | instance = klass.new(url, **args) 17 | instance.download 18 | instance 19 | end 20 | 21 | def image_url 22 | @url 23 | end 24 | 25 | def download_file(url) 26 | @file = Down.download(url, max_size: 10 * 1024 * 1024, timeout_options: {read_timeout: 20, write_timeout: 5, connect_timeout: 5}) 27 | @path = @file.path 28 | end 29 | 30 | def persist! 31 | unless @path == persisted_path 32 | FileUtils.mv @path, persisted_path 33 | @path = persisted_path 34 | end 35 | persisted_path 36 | end 37 | 38 | def delete! 39 | @file.respond_to?(:close) && @file.close 40 | @file.respond_to?(:unlink) && @file.unlink 41 | @path && File.unlink(@path) 42 | rescue Errno::ENOENT 43 | end 44 | 45 | def persisted_path 46 | @persisted_path ||= File.join(Dir.tmpdir, ["image_original_", SecureRandom.hex].join) 47 | end 48 | 49 | def valid? 50 | valid = @file && @file.content_type&.start_with?("image") 51 | valid &&= @file.size >= @minimum_size unless @minimum_size.nil? 
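# Valid only when the response has an image content type and, if a minimum size is configured, the file is at least that many bytes.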
52 | valid 53 | end 54 | 55 | def provider_identifier 56 | self.class.recognize_url?(@url) 57 | end 58 | 59 | def self.recognize_url?(src_url) 60 | if supported_urls.find { src_url.to_s =~ _1 } 61 | Regexp.last_match[1] 62 | else 63 | false 64 | end 65 | end 66 | 67 | def self.find_download_provider(url) 68 | download_providers.detect { |klass| klass.recognize_url?(url) } 69 | end 70 | 71 | def self.download_providers 72 | [ 73 | Download::Youtube, 74 | Download::Instagram, 75 | Download::Vimeo 76 | ] 77 | end 78 | 79 | def self.supported_urls 80 | [] 81 | end 82 | end 83 | end 84 | end -------------------------------------------------------------------------------- /lib/crawler/image/download/default.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Download::Default < Download 6 | def self.recognize_url?(*args) 7 | true 8 | end 9 | 10 | def download 11 | download_file(image_url) 12 | rescue Down::Error => exception 13 | end 14 | end 15 | end 16 | end -------------------------------------------------------------------------------- /lib/crawler/image/download/instagram.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Download::Instagram < Download 6 | def self.supported_urls 7 | [ 8 | %r{.*?//www\.instagram\.com/p/(.*?)(/|#|\?|$)}, 9 | %r{.*?//instagram\.com/p/(.*?)(/|#|\?|$)} 10 | ] 11 | end 12 | 13 | def download 14 | download_file(image_url) 15 | rescue Down::Error => exception 16 | end 17 | 18 | def image_url 19 | data.dig("thumbnail_url") 20 | end 21 | 22 | private 23 | 24 | OEMBED_URL = "https://graph.facebook.com/v9.0/instagram_oembed" 25 | 26 | def data 27 | @data ||= begin 28 | options = { 29 | params: { 30 | access_token: ENV["FACEBOOK_ACCESS_TOKEN"], 31 | url: "https://instagram.com/p/#{provider_identifier}", 32 | fields: "thumbnail_url" 33 | } 34 | } 35 | JSON.load(HTTP.get(OEMBED_URL, options).to_s) 36 | end 37 | end 38 | end 39 | end 40 | end -------------------------------------------------------------------------------- /lib/crawler/image/download/vimeo.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Download::Vimeo < Download 6 | def self.supported_urls 7 | [ 8 | %r{.*?//vimeo\.com/video/(.*?)(#|\?|$)}, 9 | %r{.*?//vimeo\.com/(.*?)(#|\?|$)}, 10 | %r{.*?//player\.vimeo\.com/video/(.*?)(#|\?|$)} 11 | ] 12 | end 13 | 14 | def download 15 | download_file(image_url) 16 | rescue Down::Error => exception 17 | end 18 | 19 | def image_url 20 | data.dig("thumbnail_url").gsub(/_\d+.jpg/, ".jpg") 21 | end 22 | 23 | private 24 | 25 | OEMBED_URL = "https://vimeo.com/api/oembed.json" 26 | 27 | def data 28 | @data ||= begin 29 | options = { 30 | params: { 31 | url: "https://vimeo.com/#{provider_identifier}" 32 | } 33 | } 34 | JSON.load(HTTP.get(OEMBED_URL, options).to_s) 35 | end 36 | end 37 | end 38 | end 39 | end -------------------------------------------------------------------------------- /lib/crawler/image/download/youtube.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Download::Youtube < Download 6 | attr_reader :image_url 7 | 8 | def self.supported_urls 9 | [ 10 | 
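# Each pattern captures the video id as its first group; recognize_url? returns that capture as the provider identifier.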
%r{.*?//www\.youtube-nocookie\.com/embed/(.*?)(\?|$)}, 11 | %r{.*?//www\.youtube\.com/embed/(.*?)(\?|$)}, 12 | %r{.*?//www\.youtube\.com/user/.*?#\w/\w/\w/\w/(.+)\b}, 13 | %r{.*?//www\.youtube\.com/v/(.*?)(#|\?|$)}, 14 | %r{.*?//www\.youtube\.com/watch\?v=(.*?)(&|#|$)}, 15 | %r{.*?//youtube-nocookie\.com/embed/(.*?)(\?|$)}, 16 | %r{.*?//youtube\.com/embed/(.*?)(\?|$)}, 17 | %r{.*?//youtu\.be/(.+)} 18 | ] 19 | end 20 | 21 | def download 22 | ["maxresdefault", "hqdefault"].each do |option| 23 | @image_url = "https://i.ytimg.com/vi/#{provider_identifier}/#{option}.jpg" 24 | download_file(@image_url) 25 | break 26 | rescue Down::Error => exception 27 | end 28 | end 29 | end 30 | end 31 | end -------------------------------------------------------------------------------- /lib/crawler/image/download_cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class DownloadCache 6 | include Helpers 7 | 8 | attr_reader :storage_url 9 | 10 | def initialize(url, public_id:, preset_name:) 11 | @url = url 12 | @public_id = public_id 13 | @preset_name = preset_name 14 | @storage_url = nil 15 | end 16 | 17 | def self.copy(url, **args) 18 | instance = new(url, **args) 19 | instance.copy 20 | instance 21 | end 22 | 23 | def copy 24 | @storage_url = copy_image unless storage_url.nil? || storage_url == false 25 | end 26 | 27 | def copied? 28 | !!@storage_url 29 | end 30 | 31 | def storage_url 32 | @storage_url ||= cache[:storage_url] 33 | end 34 | 35 | def image_url 36 | @image_url ||= cache[:image_url] 37 | end 38 | 39 | def download? 40 | !previously_attempted? && storage_url != false 41 | end 42 | 43 | def previously_attempted? 44 | !cache.empty? 45 | end 46 | 47 | def save(storage_url:, image_url:) 48 | @cache = {storage_url: storage_url, image_url: image_url} 49 | Cache.write(cache_key, @cache, options: {expires_in: 7 * 24 * 60 * 60}) 50 | end 51 | 52 | def cache 53 | @cache ||= Cache.read(cache_key) 54 | end 55 | 56 | def cache_key 57 | "image_download_#{@preset_name}_#{Digest::SHA1.hexdigest(@url)}" 58 | end 59 | 60 | def copy_image 61 | url = URI.parse(storage_url) 62 | source_object_name = url.path[1..-1] 63 | Fog::Storage.new(STORAGE_OPTIONS).copy_object(AWS_S3_BUCKET_IMAGES, source_object_name, AWS_S3_BUCKET_IMAGES, image_name, storage_options) 64 | final_url = url.path = "/#{image_name}" 65 | url.to_s 66 | rescue Excon::Error::NotFound 67 | false 68 | end 69 | end 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /lib/crawler/image/helpers.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | module Helpers 6 | AWS_S3_BUCKET_IMAGES = ENV["AWS_S3_BUCKET_IMAGES"] || ENV["AWS_S3_BUCKET"] 7 | 8 | def preset 9 | OpenStruct.new(IMAGE_PRESETS[@preset_name.to_sym]) 10 | end 11 | 12 | def send_to_feedbin(original_url:, storage_url:) 13 | Sidekiq::Client.push( 14 | "args" => [@public_id, { 15 | "original_url" => original_url, 16 | "processed_url" => storage_url, 17 | "width" => preset.width, 18 | "height" => preset.height 19 | }], 20 | "class" => preset.job_class, 21 | "queue" => "default" 22 | ) 23 | end 24 | 25 | def image_name 26 | File.join(@public_id[0..6], "#{@public_id}.jpg") 27 | end 28 | 29 | def storage_options 30 | { 31 | "Cache-Control" => "max-age=315360000, public", 32 | "Expires" => "Sun, 29 Jun 2036 17:48:34 GMT", 33 | 
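# The storage class falls back to REDUCED_REDUNDANCY unless AWS_S3_STORAGE_CLASS is set.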
"x-amz-storage-class" => ENV["AWS_S3_STORAGE_CLASS"] || "REDUCED_REDUNDANCY", 34 | "x-amz-acl" => "public-read" 35 | } 36 | end 37 | end 38 | end 39 | end -------------------------------------------------------------------------------- /lib/crawler/image/image_processor.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class ImageProcessor 6 | attr_reader :path 7 | 8 | def initialize(file, target_width:, target_height:) 9 | @file = file 10 | @target_width = target_width 11 | @target_height = target_height 12 | end 13 | 14 | def valid? 15 | source.avg && width >= @target_width && height >= @target_height 16 | rescue ::Vips::Error 17 | false 18 | end 19 | 20 | def height 21 | source.height 22 | end 23 | 24 | def width 25 | source.width 26 | end 27 | 28 | def color 29 | hex = nil 30 | file = ImageProcessing::Vips 31 | .source(source) 32 | .resize_to_fill(1, 1, sharpen: false) 33 | .custom { |image| 34 | image.tap do |data| 35 | hex = data.getpoint(0, 0).map { |value| "%02x" % value }.join 36 | end 37 | }.call 38 | file.unlink 39 | hex 40 | end 41 | 42 | def source 43 | @source ||= Vips::Image.new_from_file(@file) 44 | end 45 | 46 | def pipeline(resized_width, resized_height) 47 | ImageProcessing::Vips 48 | .source(source) 49 | .resize_to_fill(resized_width, resized_height) 50 | .convert("jpg") 51 | .saver(strip: true, quality: 90) 52 | end 53 | 54 | def fill_crop 55 | pipeline(@target_width, @target_height).call(destination: persisted_path) 56 | persisted_path 57 | end 58 | 59 | def smart_crop 60 | return fill_crop if resize_too_small? || resize_just_right? 61 | 62 | image = pipeline(resized.width, resized.height) 63 | 64 | if resized.width > @target_width 65 | axis = "x" 66 | contraint = @target_width 67 | max = resized.width - @target_width 68 | else 69 | axis = "y" 70 | contraint = @target_height 71 | max = resized.height - @target_height 72 | end 73 | 74 | if center = average_face_position(axis, image.call) 75 | point = {"x" => 0, "y" => 0} 76 | point[axis] = (center.to_f - contraint.to_f / 2.0).floor 77 | 78 | if point[axis] < 0 79 | point[axis] = 0 80 | elsif point[axis] > max 81 | point[axis] = max 82 | end 83 | 84 | image = image.crop(point["x"], point["y"], @target_width, @target_height) 85 | else 86 | image = image.resize_to_fill(@target_width, @target_height, crop: :attention) 87 | end 88 | 89 | image.call(destination: persisted_path) 90 | persisted_path 91 | end 92 | 93 | def resized 94 | @resized ||= begin 95 | resized_width = @target_width.to_f 96 | 97 | width_proportion = width.to_f / height.to_f 98 | height_proportion = height.to_f / width.to_f 99 | 100 | resized_height = resized_width * height_proportion 101 | 102 | if resized_height < @target_height 103 | resized_height = @target_height.to_f 104 | resized_width = resized_height * width_proportion 105 | end 106 | OpenStruct.new({width: resized_width.to_i, height: resized_height.to_i}) 107 | end 108 | end 109 | 110 | def average_face_position(axis, file) 111 | params = { 112 | pigo: Shellwords.escape(PIGO), 113 | image: Shellwords.escape(file.path), 114 | cascade: Shellwords.escape(CASCADE) 115 | } 116 | command = "%s -in %s -out empty -cf %s -scale 1.2 -json -" 117 | out, _, status = Open3.capture3(command % params) 118 | begin 119 | File.unlink(file) 120 | rescue 121 | Errno::ENOENT 122 | end 123 | 124 | faces = if status.success? 125 | JSON.load(out) 126 | end 127 | 128 | return nil if faces.nil? 
129 | 130 | result = faces.flat_map { |face| face.dig("face") }.map do |face| 131 | face[axis] + face["size"] / 2 132 | end 133 | 134 | (result.sum(0.0) / result.size).to_i 135 | end 136 | 137 | def persisted_path 138 | @persisted_path ||= File.join(Dir.tmpdir, ["image_processed_", SecureRandom.hex, ".jpg"].join) 139 | end 140 | 141 | def resize_too_small? 142 | resized.width < @target_width || resized.height < @target_height 143 | end 144 | 145 | def resize_just_right? 146 | resized.width == @target_width && resized.height == @target_height 147 | end 148 | end 149 | end 150 | end 151 | -------------------------------------------------------------------------------- /lib/crawler/image/initializers/constants.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | module Crawler 3 | module Image 4 | pigo_name = "pigo_#{Etc.uname[:sysname].downcase}_#{Etc.uname[:machine]}" 5 | CASCADE = File.expand_path("../cascade/facefinder", __dir__) 6 | PIGO = File.expand_path("../../../../bin/#{pigo_name}", __dir__) 7 | raise "Architecture not supported. Add #{pigo_name} to ./bin from https://github.com/esimov/pigo" unless File.executable?(PIGO) 8 | 9 | IMAGE_PRESETS = { 10 | primary: { 11 | width: 542, 12 | height: 304, 13 | minimum_size: 20_000, 14 | crop: :smart_crop, 15 | job_class: "EntryImage" 16 | }, 17 | twitter: { 18 | width: 542, 19 | height: 304, 20 | minimum_size: 10_000, 21 | crop: :smart_crop, 22 | job_class: "TwitterLinkImage" 23 | }, 24 | youtube: { 25 | width: 542, 26 | height: 304, 27 | minimum_size: nil, 28 | crop: :fill_crop, 29 | job_class: "EntryImage" 30 | }, 31 | podcast: { 32 | width: 200, 33 | height: 200, 34 | minimum_size: nil, 35 | crop: :fill_crop, 36 | job_class: "ItunesImage" 37 | }, 38 | podcast_feed: { 39 | width: 200, 40 | height: 200, 41 | minimum_size: nil, 42 | crop: :fill_crop, 43 | job_class: "ItunesFeedImage" 44 | } 45 | } 46 | end 47 | end -------------------------------------------------------------------------------- /lib/crawler/image/initializers/down.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | Down.backend :http 4 | -------------------------------------------------------------------------------- /lib/crawler/image/initializers/librato.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | Librato.tracker.start! 
4 | -------------------------------------------------------------------------------- /lib/crawler/image/initializers/sidekiq.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | Sidekiq.configure_server do |config| 4 | config.server_middleware do |chain| 5 | chain.add WorkerStat 6 | end 7 | config.redis = {id: "image-server-#{::Process.pid}"} 8 | end 9 | 10 | Sidekiq.configure_client do |config| 11 | config.redis = {id: "image-client-#{::Process.pid}"} 12 | end 13 | 14 | Sidekiq.strict_args!(false) 15 | -------------------------------------------------------------------------------- /lib/crawler/image/initializers/storage.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | STORAGE_OPTIONS = { 6 | provider: "AWS", 7 | aws_access_key_id: ENV["AWS_ACCESS_KEY_ID"], 8 | aws_secret_access_key: ENV["AWS_SECRET_ACCESS_KEY"] 9 | } 10 | STORAGE_OPTIONS[:region] = ENV["AWS_S3_REGION"] if ENV["AWS_S3_REGION"] 11 | STORAGE_OPTIONS[:host] = ENV["AWS_S3_HOST"] if ENV["AWS_S3_HOST"] 12 | STORAGE_OPTIONS[:endpoint] = ENV["AWS_S3_ENDPOINT"] if ENV["AWS_S3_ENDPOINT"] 13 | STORAGE_OPTIONS[:path_style] = ENV["AWS_S3_PATH_STYLE"] if ENV["AWS_S3_PATH_STYLE"] 14 | end 15 | end -------------------------------------------------------------------------------- /lib/crawler/image/initializers/worker_stat.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | class WorkerStat 4 | def call(worker, item, queue) 5 | title = "worker.#{worker.class}" 6 | Librato.increment "#{title}.count" 7 | Librato.timing title, percentile: [95] do 8 | yield 9 | end 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /lib/crawler/image/jobs/find_image.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class FindImage 6 | include Sidekiq::Worker 7 | include Helpers 8 | sidekiq_options queue: "image_parallel", retry: false 9 | 10 | def perform(public_id, preset_name, candidate_urls, entry_url = nil) 11 | @public_id = public_id 12 | @preset_name = preset_name 13 | @entry_url = entry_url 14 | @candidate_urls = combine_urls(candidate_urls) 15 | timer = Timer.new(45) 16 | count = 0 17 | 18 | while original_url = @candidate_urls.shift 19 | count += 1 20 | 21 | if count > 10 22 | Sidekiq.logger.info "Exceeded count limit: public_id=#{@public_id} count=#{count}" 23 | break 24 | end 25 | 26 | if timer.expired? 27 | Sidekiq.logger.info "Exceeded total time limit: public_id=#{@public_id} elapsed_time=#{timer.elapsed}" 28 | break 29 | end 30 | 31 | Sidekiq.logger.info "Candidate: public_id=#{@public_id} original_url=#{original_url} count=#{count}" 32 | 33 | download_cache = DownloadCache.copy(original_url, public_id: @public_id, preset_name: @preset_name) 34 | if download_cache.copied? 35 | send_to_feedbin(original_url: download_cache.image_url, storage_url: download_cache.storage_url) 36 | Sidekiq.logger.info "Copied image: public_id=#{@public_id} image_url=#{download_cache.image_url} storage_url=#{download_cache.storage_url}" 37 | break 38 | elsif download_cache.download? 
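# No cached result yet, so attempt the download; a valid image ends the candidate loop.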
39 | break if download_image(original_url, download_cache) 40 | else 41 | Sidekiq.logger.info "Skipping download: public_id=#{@public_id} original_url=#{original_url}" 42 | end 43 | 44 | end 45 | end 46 | 47 | def download_image(original_url, download_cache) 48 | found = false 49 | download = Download.download!(original_url, minimum_size: preset.minimum_size) 50 | if download.valid? 51 | found = true 52 | ProcessImage.perform_async(@public_id, @preset_name, download.persist!, original_url, download.image_url, @candidate_urls) 53 | Sidekiq.logger.info "Download valid: public_id=#{@public_id} image_url=#{download.image_url}" 54 | else 55 | download.delete! 56 | download_cache.save(storage_url: false, image_url: false) 57 | Sidekiq.logger.info "Download invalid: public_id=#{@public_id} original_url=#{original_url}" 58 | end 59 | found 60 | rescue => exception 61 | download.delete! 62 | Sidekiq.logger.info "Download failed: exception=#{exception.inspect} original_url=#{original_url}" 63 | false 64 | end 65 | 66 | def combine_urls(candidate_urls) 67 | return candidate_urls unless @entry_url 68 | 69 | if Download.find_download_provider(@entry_url) 70 | page_urls = [@entry_url] 71 | Sidekiq.logger.info "Recognized URL: public_id=#{@public_id} entry_url=#{@entry_url}" 72 | else 73 | page_urls = MetaImages.find_urls(@entry_url) 74 | Sidekiq.logger.info "MetaImages: public_id=#{@public_id} count=#{page_urls&.length || 0} entry_url=#{@entry_url}" 75 | end 76 | page_urls ||= [] 77 | page_urls.concat(candidate_urls) 78 | end 79 | end 80 | 81 | class FindImageCritical 82 | include Sidekiq::Worker 83 | sidekiq_options queue: "image_parallel_critical", retry: false 84 | def perform(*args) 85 | FindImage.new.perform(*args) 86 | end 87 | end 88 | end 89 | end -------------------------------------------------------------------------------- /lib/crawler/image/jobs/process_image.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class ProcessImage 6 | include Sidekiq::Worker 7 | include Helpers 8 | sidekiq_options queue: "image_serial_#{Socket.gethostname}", retry: false 9 | 10 | def perform(public_id, preset_name, original_path, original_url, image_url, candidate_urls) 11 | @preset_name = preset_name 12 | Sidekiq.logger.info "ProcessImage: public_id=#{public_id} original_url=#{original_url}" 13 | image = ImageProcessor.new(original_path, target_width: preset.width, target_height: preset.height) 14 | if image.valid? 15 | processed_path = image.send(preset.crop) 16 | UploadImage.perform_async(public_id, @preset_name, processed_path, original_url, image_url) 17 | else 18 | FindImageCritical.perform_async(public_id, @preset_name, candidate_urls) unless candidate_urls.empty? 
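# An image that fails validation falls back to the remaining candidate URLs via the critical queue.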
19 | end 20 | begin 21 | File.unlink(original_path) 22 | rescue Errno::ENOENT 23 | end 24 | end 25 | end 26 | 27 | class ProcessImageCritical 28 | include Sidekiq::Worker 29 | sidekiq_options queue: "image_serial_critical_#{Socket.gethostname}", retry: false 30 | def perform(*args) 31 | ProcessImage.new.perform(*args) 32 | end 33 | end 34 | end 35 | end -------------------------------------------------------------------------------- /lib/crawler/image/jobs/upload_image.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class UploadImage 6 | include Sidekiq::Worker 7 | include Helpers 8 | sidekiq_options queue: "image_parallel_#{Socket.gethostname}", retry: false 9 | 10 | def perform(public_id, preset_name, image_path, original_url, image_url) 11 | @public_id = public_id 12 | @preset_name = preset_name 13 | @original_url = original_url 14 | @image_path = image_path 15 | 16 | storage_url = upload 17 | send_to_feedbin(original_url: image_url, storage_url: storage_url) 18 | begin 19 | File.unlink(image_path) 20 | rescue Errno::ENOENT 21 | end 22 | 23 | DownloadCache.new(@original_url, public_id: @public_id, preset_name: @preset_name).save(storage_url: storage_url, image_url: image_url) 24 | Sidekiq.logger.info "UploadImage: public_id=#{@public_id} original_url=#{@original_url} storage_url=#{storage_url}" 25 | end 26 | 27 | def upload 28 | File.open(@image_path) do |file| 29 | response = Fog::Storage.new(STORAGE_OPTIONS).put_object(AWS_S3_BUCKET_IMAGES, image_name, file, storage_options) 30 | URI::HTTPS.build( 31 | host: response.data[:host], 32 | path: response.data[:path] 33 | ).to_s 34 | end 35 | end 36 | end 37 | end 38 | end -------------------------------------------------------------------------------- /lib/crawler/image/meta_images.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class MetaImages 6 | def initialize(url) 7 | @url = url 8 | end 9 | 10 | def self.find_urls(url) 11 | new(url).find_urls 12 | rescue Addressable::URI::InvalidURIError 13 | [] 14 | end 15 | 16 | def find_urls 17 | if cache.urls 18 | cache.urls 19 | elsif needs_download? 20 | download 21 | end 22 | end 23 | 24 | def download 25 | urls = [] 26 | file = Down.download(parsed_url, max_size: 5 * 1024 * 1024) 27 | urls = parse(file) 28 | rescue Down::Error => exception 29 | Sidekiq.logger.info "PageImages: exception=#{exception.inspect} url=#{@url}" 30 | urls 31 | ensure 32 | cache.save({checked: true, urls: urls}) 33 | cache.has_meta!(!urls.empty?) 34 | end 35 | 36 | def parse(file) 37 | Nokogiri.HTML5(file.read).search("meta[property='twitter:image'], meta[property='og:image']").map do |element| 38 | url = element["content"]&.strip 39 | next if url.nil? 40 | next if url == "" 41 | Addressable::URI.join(parsed_url, url) 42 | end.compact 43 | end 44 | 45 | def needs_download? 46 | !cache.checked? && cache.has_meta? 47 | end 48 | 49 | def cache 50 | @cache ||= MetaImagesCache.new(parsed_url) 51 | end 52 | 53 | def parsed_url 54 | @parsed_url ||= begin 55 | parsed = Addressable::URI.parse(@url) 56 | raise Addressable::URI::InvalidURIError if parsed.host.nil? 
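# URLs without a host are rejected here; find_urls rescues the error and returns an empty list.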
57 | parsed 58 | end 59 | end 60 | end 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /lib/crawler/image/meta_images_cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class MetaImagesCache 6 | def initialize(url) 7 | @url = url 8 | end 9 | 10 | def urls 11 | url_cache[:urls] 12 | end 13 | 14 | def checked? 15 | !!url_cache[:checked] 16 | end 17 | 18 | def has_meta!(result) 19 | @host_cache = {has_meta: result} 20 | Cache.write(host_cache_key, @host_cache, options: {expires_in: 24 * 60 * 60}) 21 | end 22 | 23 | def has_meta? 24 | host_cache[:has_meta].nil? ? true : host_cache[:has_meta] 25 | end 26 | 27 | def save(data) 28 | @url_cache = data 29 | Cache.write(url_cache_key, data, options: {expires_in: 24 * 60 * 60}) 30 | end 31 | 32 | def url_cache 33 | @url_cache ||= Cache.read(url_cache_key) 34 | end 35 | 36 | def host_cache 37 | @host_cache ||= Cache.read(host_cache_key) 38 | end 39 | 40 | def host_cache_key 41 | "image_host_#{Digest::SHA1.hexdigest(@url.host)}" 42 | end 43 | 44 | def url_cache_key 45 | "image_url_#{Digest::SHA1.hexdigest(@url)}" 46 | end 47 | end 48 | end 49 | end -------------------------------------------------------------------------------- /lib/crawler/image/timer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Image 5 | class Timer 6 | def initialize(timeout = 0) 7 | start 8 | @deadline = now + timeout.to_f 9 | end 10 | 11 | def expired? 12 | now > @deadline 13 | end 14 | 15 | def now 16 | ::Process.clock_gettime(::Process::CLOCK_MONOTONIC) 17 | end 18 | 19 | def elapsed 20 | (now - start).ceil(2) 21 | end 22 | 23 | def start 24 | @start ||= now 25 | end 26 | end 27 | end 28 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class Cache 6 | def self.read(*args) 7 | new.read(*args) 8 | end 9 | 10 | def self.delete(*args) 11 | new.delete(*args) 12 | end 13 | 14 | def self.increment(key, **args) 15 | new.increment(key, **args) 16 | end 17 | 18 | def self.count(*args) 19 | new.count(*args) 20 | end 21 | 22 | def self.write(key, value, **args) 23 | new.write(key, value, **args) 24 | end 25 | 26 | def read(key) 27 | @read ||= begin 28 | hash = Sidekiq.redis do |redis| 29 | redis.hgetall key 30 | end 31 | hash.transform_keys(&:to_sym) 32 | end 33 | end 34 | 35 | def write(key, values, options: {}) 36 | values = values.compact 37 | unless values.empty? 
38 | Sidekiq.redis do |redis| 39 | redis.mapped_hmset(key, values) 40 | end 41 | end 42 | write_key_expiry(key, options) 43 | end 44 | 45 | def delete(*keys) 46 | Sidekiq.redis {|redis| redis.unlink(*keys) } 47 | end 48 | 49 | def increment(key, options: {}) 50 | count = Sidekiq.redis {|redis| redis.incr(key) } 51 | write_key_expiry(key, options) 52 | count 53 | end 54 | 55 | def count(key) 56 | Sidekiq.redis {|redis| redis.get(key) }.to_i 57 | end 58 | 59 | def write_key_expiry(key, options) 60 | if options[:expires_in] 61 | Sidekiq.redis do |redis| 62 | redis.expire key, options[:expires_in] 63 | end 64 | end 65 | end 66 | end 67 | end 68 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/feed.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class Feed 6 | extend Forwardable 7 | 8 | def_delegators :http_cache, :etag, :last_modified, :checksum, :save 9 | def_delegators :feed_status, :ok?, :log_download!, :downloaded_at 10 | 11 | def_delegator :feed_status, :count, :attempt_count 12 | def_delegator :feed_status, :error!, :download_error 13 | def_delegator :redirect_cache, :read, :redirect 14 | 15 | attr_accessor :redirects 16 | 17 | def initialize(feed_id) 18 | @feed_id = feed_id 19 | @redirects = [] 20 | end 21 | 22 | def next_attempt 23 | Time.at(feed_status.next_retry).utc.iso8601 24 | end 25 | 26 | def download_success 27 | feed_status.clear! unless last_error && last_error["class"] == "Feedkit::NotFeed" 28 | redirect_cache.save(redirects) 29 | end 30 | 31 | def last_error 32 | feed_status.attempt_log.first 33 | end 34 | 35 | def redirect_cache 36 | @redirect_cache ||= RedirectCache.new(@feed_id) 37 | end 38 | 39 | def feed_status 40 | @feed_status ||= FeedStatus.new(@feed_id) 41 | end 42 | 43 | def http_cache 44 | @http_cache ||= HTTPCache.new(@feed_id) 45 | end 46 | 47 | def inspect 48 | "#<#{self.class}:#{object_id.to_s(16)} @feed_id=#{@feed_id} next_attempt=#{next_attempt} redirect=#{redirect.inspect} http_cache=#{http_cache.cached} last_error=#{last_error.inspect}>" 49 | end 50 | end 51 | end 52 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/feed_status.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | 4 | module Crawler 5 | module Refresher 6 | class FeedStatus 7 | def initialize(feed_id) 8 | @feed_id = feed_id 9 | end 10 | 11 | def self.clear!(*args) 12 | new(*args).clear! 13 | end 14 | 15 | def clear! 16 | Cache.delete(cache_key, errors_cache_key, log_cache_key) 17 | end 18 | 19 | def error!(exception, formatted: false) 20 | @count = count + 1 21 | Cache.write(cache_key, { 22 | count: @count, 23 | failed_at: Time.now.to_i 24 | }) 25 | exception = formatted ? exception : error_json(exception) 26 | Sidekiq.redis do |redis| 27 | redis.pipelined do |pipeline| 28 | pipeline.lpush(errors_cache_key, exception) 29 | pipeline.ltrim(errors_cache_key, 0, 25) 30 | end 31 | end 32 | end 33 | 34 | def log_download! 35 | @downloaded_at = Time.now.to_i 36 | Cache.write(log_cache_key, { 37 | downloaded_at: @downloaded_at 38 | }) 39 | @downloaded_at 40 | end 41 | 42 | def downloaded_at 43 | @downloaded_at ||= log_cache[:downloaded_at] && log_cache[:downloaded_at].to_i 44 | end 45 | 46 | def ok? 
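# Ready for another download attempt once the backoff deadline (failed_at + backoff) has passed.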
47 | Time.now.to_i > next_retry 48 | end 49 | 50 | def next_retry 51 | failed_at + backoff 52 | end 53 | 54 | def backoff 55 | multiplier = [count, 8].max 56 | multiplier = [multiplier, 23].min 57 | multiplier ** 4 58 | end 59 | 60 | def count 61 | @count ||= cached[:count].to_i 62 | end 63 | 64 | def failed_at 65 | cached[:failed_at].to_i 66 | end 67 | 68 | def attempt_log 69 | @attempt_log ||= begin 70 | Sidekiq.redis do |redis| 71 | redis.lrange(errors_cache_key, 0, -1) 72 | end.map do |json| 73 | data = JSON.load(json) 74 | data["date"] = Time.at(data["date"]) 75 | data 76 | end 77 | end 78 | end 79 | 80 | def error_json(exception) 81 | status = exception.respond_to?(:response) ? exception.response.status.code : nil 82 | JSON.dump({date: Time.now.to_i, class: exception.class, message: exception.message, status: status}) 83 | end 84 | 85 | def log_cache 86 | @log_cache ||= Cache.read(log_cache_key) 87 | end 88 | 89 | def cached 90 | @cached ||= Cache.read(cache_key) 91 | end 92 | 93 | def cache_key 94 | "refresher_status_#{@feed_id}" 95 | end 96 | 97 | def errors_cache_key 98 | "refresher_errors_#{@feed_id}" 99 | end 100 | 101 | def log_cache_key 102 | "refresher_log_#{@feed_id}" 103 | end 104 | end 105 | end 106 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/http_cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class HTTPCache 6 | def initialize(feed_id) 7 | @feed_id = feed_id 8 | end 9 | 10 | def save(response) 11 | data = { 12 | etag: response.etag, 13 | last_modified: response.last_modified, 14 | checksum: response.checksum 15 | } 16 | Cache.write(cache_key, data, options: {expires_in: 8 * 60 * 60}) 17 | end 18 | 19 | def etag 20 | cached[:etag] 21 | end 22 | 23 | def last_modified 24 | cached[:last_modified] 25 | end 26 | 27 | def checksum 28 | cached[:checksum] 29 | end 30 | 31 | def cached 32 | @cached ||= begin 33 | Cache.read(cache_key) 34 | end 35 | end 36 | 37 | def cache_key 38 | "refresher_http_#{@feed_id}" 39 | end 40 | end 41 | end 42 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/initializers/redis.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | $redis = ConnectionPool.new(size: 3, timeout: 5) { 4 | if ENV["REDIS_ID_URL"] 5 | Redis.new(url: ENV["REDIS_ID_URL"]) 6 | else 7 | Redis.new 8 | end 9 | } 10 | -------------------------------------------------------------------------------- /lib/crawler/refresher/initializers/sidekiq.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | Sidekiq.configure_server do |config| 4 | config.redis = {id: "refresher-server-#{::Process.pid}"} 5 | end 6 | 7 | Sidekiq.configure_client do |config| 8 | config.redis = {id: "refresher-client-#{::Process.pid}"} 9 | end 10 | 11 | Sidekiq.strict_args!(false) -------------------------------------------------------------------------------- /lib/crawler/refresher/jobs/feed_downloader.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedDownloader 6 | include Sidekiq::Worker 7 | 8 | sidekiq_options queue: :feed_downloader, retry: false, backtrace: false 9 | 10 | def perform(feed_id, 
feed_url, subscribers, critical = false) 11 | @feed_id = feed_id 12 | @feed_url = feed_url 13 | @subscribers = subscribers 14 | @critical = critical 15 | @feed = Feed.new(feed_id) 16 | 17 | throttle = Throttle.new(@feed_url, @feed.downloaded_at) 18 | if @critical 19 | download 20 | elsif throttle.throttled? 21 | Sidekiq.logger.info "Throttled downloaded_at=#{Time.at(@feed.downloaded_at)} url=#{@feed_url}" 22 | elsif @feed.ok? 23 | download 24 | end 25 | end 26 | 27 | def download 28 | @feed.log_download! 29 | @response = begin 30 | request 31 | rescue Feedkit::ZlibError 32 | request(auto_inflate: false) 33 | end 34 | 35 | not_modified = @response.not_modified?(@feed.checksum) 36 | Sidekiq.logger.info "Downloaded modified=#{!not_modified} http_status=\"#{@response.status}\" url=#{@feed_url}" 37 | parse unless not_modified 38 | @feed.download_success 39 | rescue Feedkit::Error => exception 40 | @feed.download_error(exception) 41 | Sidekiq.logger.info "Feedkit::Error: attempts=#{@feed.attempt_count} exception=#{exception.inspect} id=#{@feed_id} url=#{@feed_url}" 42 | end 43 | 44 | def request(auto_inflate: true) 45 | parsed_url = Feedkit::BasicAuth.parse(@feed_url) 46 | url = @feed.redirect ? @feed.redirect : parsed_url.url 47 | Sidekiq.logger.info "Redirect: from=#{@feed_url} to=#{@feed.redirect} id=#{@feed_id}" if @feed.redirect 48 | Feedkit::Request.download(url, 49 | on_redirect: on_redirect, 50 | username: parsed_url.username, 51 | password: parsed_url.password, 52 | last_modified: @feed.last_modified, 53 | etag: @feed.etag, 54 | auto_inflate: auto_inflate, 55 | user_agent: "Feedbin feed-id:#{@feed_id} - #{@subscribers} subscribers" 56 | ) 57 | end 58 | 59 | def on_redirect 60 | proc do |from, to| 61 | @feed.redirects.push Redirect.new(@feed_id, status: from.status.code, from: from.uri.to_s, to: to.uri.to_s) 62 | end 63 | end 64 | 65 | def parse 66 | @response.persist! 67 | job_id = Sidekiq::Client.push( 68 | "args" => [@feed_id, @feed_url, @response.path, @response.encoding.to_s], 69 | "class" => @critical ? "FeedParserCritical" : "FeedParser", 70 | "queue" => @critical ? 
"feed_parser_critical_#{Socket.gethostname}" : "feed_parser_#{Socket.gethostname}", 71 | "retry" => false 72 | ) 73 | Sidekiq.logger.info "Parse enqueued job_id: #{job_id} path=#{@response.path}" 74 | @feed.save(@response) 75 | end 76 | end 77 | 78 | class FeedDownloaderCritical 79 | include Sidekiq::Worker 80 | sidekiq_options queue: :feed_downloader_critical, retry: false 81 | def perform(*args) 82 | FeedDownloader.new.perform(*args, true) 83 | end 84 | end 85 | end 86 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/jobs/feed_status_update.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedStatusUpdate 6 | include Sidekiq::Worker 7 | sidekiq_options queue: :feed_downloader_critical 8 | 9 | def perform(feed_id, exception = nil) 10 | if exception 11 | FeedStatus.new(feed_id).error!(exception, formatted: true) 12 | else 13 | FeedStatus.clear!(feed_id) 14 | end 15 | end 16 | 17 | end 18 | end 19 | end -------------------------------------------------------------------------------- /lib/crawler/refresher/redirect_cache.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class RedirectCache 6 | 7 | # 4 redirect/hr 24hrs a day for 6 days 8 | PERSIST_AFTER = 4 * 24 * 6 9 | 10 | attr_reader :redirects 11 | 12 | def initialize(feed_id) 13 | @feed_id = feed_id 14 | end 15 | 16 | def save(redirects) 17 | @redirects = redirects 18 | Cache.write(stable_key, {to: @redirects.last.to}) if redirect_stable? 19 | end 20 | 21 | def redirect_stable? 22 | return false if @redirects.empty? 23 | return false unless @redirects.all?(&:permanent?) 24 | Cache.increment(counter_key, options: {expires_in: 72 * 60 * 60}) > PERSIST_AFTER 25 | end 26 | 27 | def read 28 | @read ||= Cache.read(stable_key)[:to] 29 | end 30 | 31 | def delete 32 | Cache.delete(stable_key) 33 | end 34 | 35 | def counter_key 36 | @counter_key ||= begin 37 | "refresher_redirect_tmp_" + Digest::SHA1.hexdigest(@redirects.map(&:cache_key).join) 38 | end 39 | end 40 | 41 | def stable_key 42 | @stable_key ||= begin 43 | "refresher_redirect_stable_#{@feed_id}" 44 | end 45 | end 46 | end 47 | 48 | class Redirect 49 | PERMANENT_REDIRECTS = [301, 308].to_set.freeze 50 | 51 | attr_reader :from, :to 52 | 53 | def initialize(feed_id, status:, from:, to:) 54 | @feed_id = feed_id 55 | @status = status 56 | @from = from 57 | @to = to 58 | end 59 | 60 | def permanent? 61 | PERMANENT_REDIRECTS.include?(@status) 62 | end 63 | 64 | def cache_key 65 | @cache_key ||= Digest::SHA1.hexdigest([@feed_id, @status, @from, @to].join) 66 | end 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /lib/crawler/refresher/throttle.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Crawler 4 | module Refresher 5 | class Throttle 6 | 7 | TIMEOUT = 60 * 30 8 | 9 | def initialize(feed_url, last_download) 10 | @feed_url = feed_url 11 | @last_download = last_download 12 | end 13 | 14 | def self.throttled?(*args) 15 | new(*args).throttled? 16 | end 17 | 18 | def throttled? 19 | throttled_hosts.include?(host) && downloaded_recently? 20 | end 21 | 22 | def downloaded_recently? 23 | return false if @last_download.nil? 
24 | (Time.now.to_i - @last_download) < random_timeout 25 | end 26 | 27 | def random_timeout 28 | rand(TIMEOUT..(TIMEOUT * 2)) 29 | end 30 | 31 | def throttled_hosts 32 | ENV["THROTTLED_HOSTS"]&.split(",") || [] 33 | end 34 | 35 | def host 36 | Addressable::URI.heuristic_parse(@feed_url).host.split(".").last(2).join(".") 37 | rescue 38 | nil 39 | end 40 | end 41 | end 42 | end -------------------------------------------------------------------------------- /lib/image.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | $LOAD_PATH.unshift File.expand_path(File.dirname(File.dirname(__FILE__))) 4 | 5 | $stdout.sync = true 6 | 7 | require "bundler/setup" 8 | require "dotenv" 9 | Dotenv.load(".env", ".env.test") 10 | 11 | require "socket" 12 | require "etc" 13 | require "net/http" 14 | require "securerandom" 15 | require "time" 16 | require "uri" 17 | require "etc" 18 | require "digest" 19 | 20 | require "addressable" 21 | require "dotenv" 22 | require "down" 23 | require "fog/aws" 24 | require "http" 25 | require "image_processing/vips" 26 | require "json" 27 | require "librato-rack" 28 | require "mime-types" 29 | require "open3" 30 | require "redis" 31 | require "shellwords" 32 | require "sidekiq" 33 | 34 | require "lib/crawler/image/initializers/constants" 35 | require "lib/crawler/image/initializers/down" 36 | require "lib/crawler/image/initializers/librato" 37 | require "lib/crawler/image/initializers/worker_stat" 38 | require "lib/crawler/image/initializers/sidekiq" 39 | require "lib/crawler/image/initializers/storage" 40 | 41 | require "lib/crawler/image/helpers" 42 | require "lib/crawler/image/timer" 43 | require "lib/crawler/image/cache" 44 | require "lib/crawler/image/meta_images" 45 | require "lib/crawler/image/meta_images_cache" 46 | require "lib/crawler/image/download_cache" 47 | require "lib/crawler/image/download" 48 | require "lib/crawler/image/download/default" 49 | require "lib/crawler/image/download/instagram" 50 | require "lib/crawler/image/download/vimeo" 51 | require "lib/crawler/image/download/youtube" 52 | require "lib/crawler/image/image_processor" 53 | require "lib/crawler/image/jobs/find_image" 54 | require "lib/crawler/image/jobs/process_image" 55 | require "lib/crawler/image/jobs/upload_image" 56 | -------------------------------------------------------------------------------- /lib/refresher.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path(File.dirname(File.dirname(__FILE__))) 2 | 3 | $stdout.sync = true 4 | 5 | require "bundler/setup" 6 | require "dotenv" 7 | 8 | if ENV["ENV_PATH"] 9 | Dotenv.load ENV["ENV_PATH"] 10 | else 11 | Dotenv.load 12 | end 13 | 14 | require "digest/sha1" 15 | require "date" 16 | require "socket" 17 | require "time" 18 | require "forwardable" 19 | require "json" 20 | 21 | require "sidekiq" 22 | require "connection_pool" 23 | require "redis" 24 | require "feedkit" 25 | 26 | require "lib/crawler/refresher/initializers/sidekiq" 27 | require "lib/crawler/refresher/initializers/redis" 28 | 29 | require "lib/crawler/refresher/cache" 30 | require "lib/crawler/refresher/feed_status" 31 | require "lib/crawler/refresher/redirect_cache" 32 | require "lib/crawler/refresher/http_cache" 33 | require "lib/crawler/refresher/feed" 34 | require "lib/crawler/refresher/throttle" 35 | require "lib/crawler/refresher/jobs/feed_downloader" 36 | require "lib/crawler/refresher/jobs/feed_status_update" 37 | 
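# Usage sketch (added commentary, not part of the original file). Assuming
# FeedDownloader is a Sidekiq worker like FeedDownloaderCritical, a refresh is
# enqueued with the argument order used throughout the tests
# (feed_id, feed_url, subscribers); the values below are placeholders:
#
#   Crawler::Refresher::FeedDownloader.perform_async(1, "https://example.com/atom.xml", 10)
#   Crawler::Refresher::FeedDownloaderCritical.perform_async(1, "https://example.com/atom.xml", 10)
#
#   # Passing no exception argument clears a feed's stored error state:
#   Crawler::Refresher::FeedStatusUpdate.perform_async(1)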
-------------------------------------------------------------------------------- /test/cache_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class CacheTest < Minitest::Test 6 | def setup 7 | flush 8 | end 9 | 10 | def test_should_delete 11 | cache_key = "cache_key" 12 | Cache.increment(cache_key) 13 | assert_equal(1, Cache.count(cache_key)) 14 | Cache.delete(cache_key) 15 | assert_equal(0, Cache.count(cache_key)) 16 | end 17 | 18 | def test_should_increment 19 | assert_equal(1, Cache.increment("cache_key")) 20 | end 21 | 22 | def test_should_get_count 23 | cache_key = "cache_key" 24 | assert_equal(0, Cache.count(cache_key)) 25 | Cache.increment(cache_key) 26 | assert_equal(1, Cache.count(cache_key)) 27 | end 28 | 29 | def test_should_cache_values 30 | cache_key = "cache_key" 31 | Cache.write(cache_key, { 32 | etag: nil, 33 | last_modified: "last_modified", 34 | }) 35 | 36 | values = Cache.read(cache_key) 37 | 38 | assert_equal("last_modified", values[:last_modified]) 39 | assert_nil(values[:etag]) 40 | end 41 | 42 | def test_should_cache_values_with_exiration 43 | cache_key = "cache_key" 44 | 45 | Cache.write(cache_key, { 46 | key: "value", 47 | }, 48 | options: {expires_in: 1} 49 | ) 50 | 51 | result = Sidekiq.redis do |redis| 52 | redis.ttl(cache_key) 53 | end 54 | 55 | assert_equal(1, result) 56 | end 57 | end 58 | end 59 | end -------------------------------------------------------------------------------- /test/download/default.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | 5 | class Download::DefaultTest < Minitest::Test 6 | def test_should_download_valid_image 7 | url = "http://example.com/image.jpg" 8 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 9 | download = Download.download!(url, minimum_size: 8) 10 | assert_instance_of Download::Default, download 11 | end 12 | end 13 | end 14 | end -------------------------------------------------------------------------------- /test/download/instagram.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | 5 | class Download::InstagramTest < Minitest::Test 6 | def test_should_download_valid_image 7 | url = "http://example.com/image.jpg" 8 | stub_request(:get, /graph\.facebook\.com/).to_return(body: {thumbnail_url: url}.to_json) 9 | 10 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 11 | download = Download.download!("https://www.instagram.com/p/CMGfYFaJoF7/", minimum_size: 8) 12 | assert download.valid? 
13 | assert_instance_of Download::Instagram, download 14 | end 15 | end 16 | end 17 | end -------------------------------------------------------------------------------- /test/download/vimeo.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | class Download::VimeoTest < Minitest::Test 5 | def test_should_download_valid_image 6 | url = "http://example.com/image.jpg" 7 | stub_request(:get, /vimeo\.com\/api/).to_return(body: {thumbnail_url: url}.to_json) 8 | 9 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 10 | download = Download.download!("https://player.vimeo.com/video/CMGfYFaJoF7/", minimum_size: 8) 11 | assert download.valid? 12 | assert_instance_of Download::Vimeo, download 13 | end 14 | end 15 | end 16 | end -------------------------------------------------------------------------------- /test/download/youtube.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | class Download::YoutubeTest < Minitest::Test 5 | def test_should_download_valid_image 6 | id = SecureRandom.hex 7 | 8 | max_url = "https://i.ytimg.com/vi/#{id}/maxresdefault.jpg" 9 | hq_url = "https://i.ytimg.com/vi/#{id}/hqdefault.jpg" 10 | 11 | stub_request(:get, max_url).to_return(status: 404) 12 | stub_request(:get, hq_url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 13 | 14 | download = Download.download!("https://www.youtube.com/watch?v=#{id}", minimum_size: 8) 15 | assert download.valid? 16 | 17 | assert_instance_of Download::Youtube, download 18 | assert_requested :get, max_url 19 | assert_requested :get, hq_url 20 | end 21 | 22 | def test_should_stop_at_first_image 23 | id = SecureRandom.hex 24 | 25 | max_url = "https://i.ytimg.com/vi/#{id}/maxresdefault.jpg" 26 | hq_url = "https://i.ytimg.com/vi/#{id}/hqdefault.jpg" 27 | 28 | stub_request(:get, max_url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 29 | stub_request(:get, hq_url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 30 | 31 | download = Download.download!("https://www.youtube.com/watch?v=#{id}", minimum_size: 8) 32 | assert download.valid? 
33 | 34 | assert_instance_of Download::Youtube, download 35 | assert_requested :get, max_url 36 | refute_requested :get, hq_url 37 | end 38 | end 39 | end 40 | end -------------------------------------------------------------------------------- /test/download_cache_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class DownloadCacheTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_save_url 10 | image_url = "http://example.com/example/example.jpg" 11 | storage_url = "http://s3.com/example/example.jpg" 12 | public_id = SecureRandom.hex 13 | 14 | cache = DownloadCache.new(image_url, public_id: public_id, preset_name: "primary") 15 | cache.save(storage_url: storage_url, image_url: image_url) 16 | 17 | cache = DownloadCache.new(image_url, public_id: public_id, preset_name: "primary") 18 | assert_equal(storage_url, cache.storage_url) 19 | end 20 | 21 | def test_should_copy_existing_image 22 | image_url = "http://example.com/example/example.jpg" 23 | storage_url = "http://s3.com/example/example.jpg" 24 | public_id = SecureRandom.hex 25 | 26 | stub_request(:put, /.*\.s3\.amazonaws\.com/).to_return(status: 200, body: aws_copy_body) 27 | 28 | cache = DownloadCache.new(image_url, public_id: public_id, preset_name: "primary") 29 | refute cache.copied? 30 | 31 | cache.save(storage_url: storage_url, image_url: image_url) 32 | cache.copy 33 | 34 | assert cache.copied? 35 | assert cache.storage_url.include?(public_id) 36 | end 37 | 38 | def test_should_fail_to_copy_missing_image 39 | image_url = "http://example.com/example/example.jpg" 40 | storage_url = "http://s3.com/example/example.jpg" 41 | public_id = SecureRandom.hex 42 | s3_host = /.*\.s3\.amazonaws\.com/ 43 | 44 | stub_request(:put, s3_host).to_return(status: 404) 45 | 46 | cache = DownloadCache.new(image_url, public_id: public_id, preset_name: "primary") 47 | cache.save(storage_url: storage_url, image_url: image_url) 48 | cache.copy 49 | refute cache.copied? 50 | assert_requested :put, s3_host 51 | end 52 | end 53 | end 54 | end -------------------------------------------------------------------------------- /test/download_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class DownloadTest < Minitest::Test 5 | def test_should_download_valid_image 6 | url = "http://example.com/image.jpg" 7 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "12345678") 8 | download = Download.download!(url, minimum_size: 8) 9 | assert download.valid? 10 | end 11 | 12 | def test_should_be_too_small 13 | url = "http://example.com/image.jpg" 14 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "1234567") 15 | download = Download.download!(url, minimum_size: 8) 16 | refute download.valid? 17 | end 18 | 19 | def test_should_ignore_size 20 | url = "http://example.com/image.jpg" 21 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: "1") 22 | download = Download.download!(url, minimum_size: nil) 23 | assert download.valid? 24 | end 25 | 26 | def test_should_be_wrong_content_type 27 | url = "http://example.com/image.jpg" 28 | stub_request(:get, url).to_return(headers: {content_type: "text/html"}) 29 | download = Download.download!(url, minimum_size: nil) 30 | refute download.valid? 
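# Note (added): taken together, the tests above cover the two checks that make a
# Download valid here: the response must carry an image Content-Type and, when
# minimum_size is given, the body must be at least that many bytes.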
31 | end 32 | 33 | def test_should_persist_file 34 | url = "http://example.com/image.jpg" 35 | body = "body" 36 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: body) 37 | download = Download.download!(url) 38 | path = download.path 39 | download.persist! 40 | refute path == download.path 41 | FileUtils.rm download.path 42 | end 43 | end 44 | end 45 | end -------------------------------------------------------------------------------- /test/feed_downloader_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedDownloaderTest < Minitest::Test 6 | def setup 7 | flush 8 | end 9 | 10 | def test_should_schedule_feed_parser 11 | url = "http://example.com/atom.xml" 12 | stub_request_file("atom.xml", url) 13 | 14 | assert_equal 0, Sidekiq::Queues["feed_parser_#{Socket.gethostname}"].size 15 | FeedDownloader.new.perform(1, url, 10) 16 | assert_equal 1, Sidekiq::Queues["feed_parser_#{Socket.gethostname}"].size 17 | 18 | FeedDownloader.new.perform(1, url, 10) 19 | assert_equal 1, Sidekiq::Queues["feed_parser_#{Socket.gethostname}"].size 20 | end 21 | 22 | def test_should_schedule_critical_feed_parser 23 | url = "http://example.com/atom.xml" 24 | stub_request_file("atom.xml", url) 25 | 26 | assert_equal 0, Sidekiq::Queues["feed_parser_critical_#{Socket.gethostname}"].size 27 | FeedDownloaderCritical.new.perform(1, url, 10) 28 | assert_equal 1, Sidekiq::Queues["feed_parser_critical_#{Socket.gethostname}"].size 29 | end 30 | 31 | def test_should_send_user_agent 32 | url = "http://example.com/atom.xml" 33 | stub_request_file("atom.xml", url).with(headers: {"User-Agent" => "Feedbin feed-id:1 - 10 subscribers"}) 34 | FeedDownloader.new.perform(1, url, 10) 35 | end 36 | 37 | def test_should_send_authorization 38 | username = "username" 39 | password = "password" 40 | url = "http://#{username}:#{password}@example.com/atom.xml" 41 | 42 | stub_request(:get, "http://example.com/atom.xml").with(headers: {"Authorization" => "Basic #{Base64.strict_encode64("#{username}:#{password}")}"}) 43 | FeedDownloader.new.perform(1, url, 10) 44 | end 45 | 46 | def test_should_use_saved_redirect 47 | feed_id = 1 48 | url_one = "http://example.com/one" 49 | url_two = "http://example.com/two" 50 | 51 | redirect_cache = RedirectCache.new(feed_id) 52 | Cache.write(redirect_cache.stable_key, {to: url_two}) 53 | 54 | stub_request(:get, url_two) 55 | FeedDownloader.new.perform(feed_id, url_one, 10) 56 | end 57 | 58 | def test_should_use_saved_redirect_with_basic_auth 59 | feed_id = 1 60 | username = "username" 61 | password = "password" 62 | url_one = "http://#{username}:#{password}@example.com/one" 63 | url_two = "http://example.com/two" 64 | 65 | redirect_cache = RedirectCache.new(feed_id) 66 | Cache.write(redirect_cache.stable_key, {to: url_two}) 67 | 68 | stub_request(:get, url_two).with(headers: {"Authorization" => "Basic #{Base64.strict_encode64("#{username}:#{password}")}"}) 69 | FeedDownloader.new.perform(feed_id, url_one, 10) 70 | end 71 | 72 | def test_should_do_nothing_if_not_modified 73 | feed_id = 1 74 | etag = "etag" 75 | last_modified = "last_modified" 76 | Cache.write("refresher_http_#{feed_id}", { 77 | etag: etag, 78 | last_modified: last_modified, 79 | checksum: nil 80 | }) 81 | 82 | url = "http://example.com/atom.xml" 83 | stub_request(:get, url).with(headers: {"If-None-Match" => etag, "If-Modified-Since" => last_modified}).to_return(status: 304) 84 | 
FeedDownloader.new.perform(feed_id, url, 10) 85 | assert_equal 0, Sidekiq::Queues["feed_parser_critical_#{Socket.gethostname}"].size 86 | end 87 | 88 | def test_should_not_be_ok_after_error 89 | feed_id = 1 90 | 91 | url = "http://example.com/atom.xml" 92 | stub_request(:get, url).to_return(status: 429) 93 | 94 | FeedDownloader.new.perform(feed_id, url, 10) 95 | 96 | refute FeedStatus.new(feed_id).ok?, "Should not be ok?" 97 | end 98 | 99 | def test_should_follow_redirects 100 | first_url = "http://www.example.com" 101 | last_url = "#{first_url}/final" 102 | 103 | response = { 104 | status: 301, 105 | headers: { 106 | "Location" => "/final" 107 | } 108 | } 109 | stub_request(:get, first_url).to_return(response) 110 | stub_request(:get, last_url) 111 | 112 | FeedDownloader.new.perform(1, first_url, 10) 113 | end 114 | end 115 | end 116 | end -------------------------------------------------------------------------------- /test/feed_status_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedStatusTest < Minitest::Test 6 | 7 | def setup 8 | flush 9 | end 10 | 11 | def test_should_not_be_ok 12 | feed_id = 1 13 | FeedStatus.new(feed_id).error!(Exception.new) 14 | refute FeedStatus.new(feed_id).ok?, "ok? should be false." 15 | end 16 | 17 | def test_should_be_ok 18 | feed_id = 1 19 | FeedStatus.new(feed_id).error!(Exception.new) 20 | FeedStatus.clear!(feed_id) 21 | assert FeedStatus.new(feed_id).ok?, "ok? should be true." 22 | end 23 | 24 | def test_should_get_count 25 | feed_id = 1 26 | FeedStatus.new(feed_id).error!(Exception.new) 27 | FeedStatus.new(feed_id).error!(Exception.new) 28 | assert_equal(2, FeedStatus.new(feed_id).count) 29 | end 30 | 31 | def test_should_be_ok_after_timeout 32 | feed_id = 1 33 | 34 | FeedStatus.new(feed_id).error!(Exception.new) 35 | 36 | one_hour = 60 * 60 37 | one_hour_from_now = Time.now.to_i + one_hour 38 | two_hours_ago = Time.now.to_i - one_hour - one_hour 39 | 40 | feed_status = FeedStatus.new(feed_id) 41 | 42 | assert feed_status.next_retry > one_hour_from_now 43 | 44 | Cache.write(feed_status.cache_key, { failed_at: two_hours_ago }) 45 | 46 | assert FeedStatus.new(feed_id).ok?, "Status should be ok after rewinding failed_at" 47 | end 48 | 49 | def test_should_save_last_download 50 | feed_id = 1 51 | now = Time.now.to_i 52 | FeedStatus.new(1).log_download! 53 | difference = FeedStatus.new(1).downloaded_at - now 54 | assert difference <= 1 55 | end 56 | end 57 | end 58 | end -------------------------------------------------------------------------------- /test/feed_status_update_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedStatusUpdateTest < Minitest::Test 6 | 7 | def setup 8 | flush 9 | end 10 | 11 | def test_should_clear_status 12 | feed_id = 1 13 | FeedStatus.new(feed_id).error!(Exception.new) 14 | refute FeedStatus.new(feed_id).ok?, "ok? should be false." 15 | FeedStatusUpdate.new.perform(feed_id) 16 | assert FeedStatus.new(feed_id).ok?, "ok? should be true." 
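# Note (added): calling FeedStatusUpdate#perform without an exception argument
# clears the stored error via FeedStatus.clear!; passing a formatted exception
# records a new failure instead, which the next test exercises.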
17 | end 18 | 19 | def test_should_record_error 20 | feed_id = 1 21 | exception = Exception.new 22 | formatted_exception = JSON.dump({date: Time.now.to_i, class: exception.class, message: exception.message, status: nil}) 23 | FeedStatusUpdate.new.perform(feed_id, formatted_exception) 24 | status = FeedStatus.new(feed_id) 25 | refute status.ok?, "ok? should be false." 26 | assert_equal exception.class.name, status.attempt_log.first["class"] 27 | end 28 | 29 | end 30 | end 31 | end -------------------------------------------------------------------------------- /test/feed_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class FeedTest < Minitest::Test 6 | 7 | def setup 8 | flush 9 | end 10 | 11 | def test_should_be_ok 12 | feed_id = 1 13 | feed = Feed.new(feed_id) 14 | feed.download_error(Exception.new) 15 | 16 | feed = Feed.new(feed_id) 17 | feed.download_success 18 | 19 | feed = Feed.new(feed_id) 20 | assert feed.ok? 21 | end 22 | 23 | def test_should_not_be_ok 24 | feed_id = 1 25 | feed = Feed.new(feed_id) 26 | feed.download_error(Feedkit::NotFeed.new) 27 | 28 | feed = Feed.new(feed_id) 29 | feed.download_success 30 | 31 | feed = Feed.new(feed_id) 32 | assert_equal("Feedkit::NotFeed", feed.last_error["class"]) 33 | refute feed.ok? 34 | end 35 | end 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /test/image_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class ImageTest < Minitest::Test 5 | def test_should_get_image_size 6 | file = File.expand_path("support/www/image.jpeg", __dir__) 7 | image = ImageProcessor.new(file, target_width: 542, target_height: 304) 8 | assert_equal(image.width, 640) 9 | assert_equal(image.height, 828) 10 | assert_equal(542, image.resized.width) 11 | assert_equal(701, image.resized.height) 12 | end 13 | 14 | def test_should_get_face_location 15 | file = support_file("image.jpeg") 16 | image = ImageProcessor.new(file, target_width: 542, target_height: 304) 17 | 18 | assert_equal(462, image.average_face_position("y", File.new(file))) 19 | end 20 | 21 | def test_should_crop 22 | file = File.expand_path("support/www/image.jpeg", __dir__) 23 | image = ImageProcessor.new(file, target_width: 542, target_height: 304) 24 | cropped_path = image.smart_crop 25 | assert cropped_path.include?(".jpg") 26 | FileUtils.rm cropped_path 27 | end 28 | 29 | def test_should_return_same_size_image 30 | file = File.expand_path("support/www/image.jpeg", __dir__) 31 | image = ImageProcessor.new(file, target_width: 640, target_height: 828) 32 | cropped_path = image.smart_crop 33 | assert cropped_path.include?(".jpg") 34 | end 35 | end 36 | end 37 | end -------------------------------------------------------------------------------- /test/jobs/find_image_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | class FindImageTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_copy_image 10 | image_url = "https://i.ytimg.com/vi/id/maxresdefault.jpg" 11 | original_url = "https://www.youtube.com/watch?v=id" 12 | 13 | stub_request_file("image.jpeg", image_url, headers: {content_type: "image/jpeg"}) 14 | stub_request(:put, /.*\.s3\.amazonaws\.com/).to_return(status: 200, 
body: aws_copy_body) 15 | 16 | Sidekiq::Testing.inline! do 17 | FindImage.perform_async(SecureRandom.hex, "primary", [original_url]) 18 | end 19 | 20 | FindImage.new.perform(SecureRandom.hex, "primary", [original_url]) 21 | assert_equal(image_url, EntryImage.jobs.first["args"][1]["original_url"]) 22 | end 23 | 24 | def test_should_process_an_image 25 | image_url = "http://example.com/image.jpg" 26 | page_url = "http://example.com/article" 27 | urls = [image_url] 28 | 29 | stub_request_file("html.html", page_url) 30 | stub_request_file("image.jpeg", image_url, headers: {content_type: "image/jpeg"}) 31 | 32 | stub_request(:get, "http://example.com/image/og_image.jpg").to_return(status: 404) 33 | stub_request(:get, "http://example.com/image/twitter_image.jpg").to_return(status: 404) 34 | 35 | stub_request(:put, /.*\.s3\.amazonaws\.com/).to_return(status: 200, body: aws_copy_body) 36 | 37 | Sidekiq::Testing.inline! do 38 | FindImage.perform_async(SecureRandom.hex, "primary", urls, page_url) 39 | end 40 | 41 | assert_requested :get, "http://example.com/image/og_image.jpg" 42 | assert_requested :get, "http://example.com/image/twitter_image.jpg" 43 | 44 | assert_equal 0, EntryImage.jobs.size 45 | FindImage.new.perform(SecureRandom.hex, "primary", urls, nil) 46 | assert_equal 1, EntryImage.jobs.size 47 | end 48 | 49 | def test_should_enqueue_recognized_image 50 | url = "https://i.ytimg.com/vi/id/maxresdefault.jpg" 51 | image_url = "http://example.com/image.jpg" 52 | 53 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: ("lorem " * 3_500)) 54 | 55 | assert_equal 0, ProcessImage.jobs.size 56 | FindImage.new.perform(SecureRandom.hex, "primary", [image_url], "https://www.youtube.com/watch?v=id") 57 | assert_equal 1, ProcessImage.jobs.size 58 | 59 | effective_image_url = ProcessImage.jobs.first["args"][4] 60 | 61 | assert_equal(url, effective_image_url) 62 | 63 | assert_requested :get, url 64 | refute_requested :get, image_url 65 | end 66 | 67 | def test_should_try_all_urls 68 | urls = [ 69 | "http://example.com/image_1.jpg", 70 | "http://example.com/image_2.jpg", 71 | "http://example.com/image_3.jpg" 72 | ] 73 | 74 | urls.each do |url| 75 | stub_request(:get, url).to_return(headers: {content_type: "image/jpg"}, body: ("lorem " * 3_500)) 76 | end 77 | 78 | Sidekiq::Testing.inline! 
do 79 | FindImage.perform_async(SecureRandom.hex, "primary", urls, nil) 80 | end 81 | 82 | assert_requested :get, urls[0] 83 | assert_requested :get, urls[1] 84 | assert_requested :get, urls[2] 85 | end 86 | end 87 | end 88 | end -------------------------------------------------------------------------------- /test/jobs/process_image_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | class ProcessImageTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_enqueue_upload 10 | public_id = SecureRandom.hex 11 | path = support_file("image.jpeg") 12 | url = "http://example.com/image.jpg" 13 | 14 | assert_equal 0, UploadImage.jobs.size 15 | ProcessImage.new.perform(public_id, "primary", path, url, url, []) 16 | assert_equal 1, UploadImage.jobs.size 17 | 18 | assert_equal(public_id, UploadImage.jobs.first["args"].first) 19 | assert_equal(url, UploadImage.jobs.first["args"].last) 20 | end 21 | 22 | def test_should_enqueue_find 23 | public_id = SecureRandom.hex 24 | path = Tempfile.new.path 25 | url = "http://example.com/image.jpg" 26 | all_urls = ["http://example.com/image_2.jpg", "http://example.com/image_3.jpg"] 27 | 28 | assert_equal 0, FindImageCritical.jobs.size 29 | ProcessImage.new.perform(public_id, "primary", path, url, url, all_urls) 30 | assert_equal 1, FindImageCritical.jobs.size 31 | 32 | assert_equal([public_id, "primary", all_urls], FindImageCritical.jobs.first["args"]) 33 | end 34 | end 35 | end 36 | end -------------------------------------------------------------------------------- /test/jobs/upload_image_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "../test_helper" 2 | module Crawler 3 | module Image 4 | class UploadImageTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_upload 10 | public_id = SecureRandom.hex 11 | path = support_file("image.jpeg") 12 | url = "http://example.com/image.jpg" 13 | 14 | stub_request(:put, /.*\.s3\.amazonaws\.com/) 15 | 16 | assert_equal 0, EntryImage.jobs.size 17 | UploadImage.new.perform(public_id, "primary", path, url, url) 18 | assert_equal 1, EntryImage.jobs.size 19 | 20 | download_cache = DownloadCache.new(url, public_id: public_id, preset_name: "primary") 21 | assert_equal("https:", download_cache.storage_url) 22 | end 23 | end 24 | end 25 | end -------------------------------------------------------------------------------- /test/meta_images_cache_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class MetaImagesCacheTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_save_urls 10 | urls = ["one", "two"] 11 | cache = MetaImagesCache.new(Addressable::URI.parse("http://example.com/article")) 12 | cache.save({checked: true, urls: urls}) 13 | 14 | assert_equal(urls, cache.urls) 15 | end 16 | 17 | def test_should_save_checked_status 18 | cache = MetaImagesCache.new(Addressable::URI.parse("http://example.com/article")) 19 | refute cache.checked? 20 | 21 | cache.save({checked: true, urls: []}) 22 | assert cache.checked? 23 | end 24 | 25 | def test_should_save_meta_presence 26 | cache = MetaImagesCache.new(Addressable::URI.parse("http://example.com/article")) 27 | assert cache.has_meta? 28 | 29 | cache.has_meta!(false) 30 | refute cache.has_meta? 
31 | 32 | cache.has_meta!(true) 33 | assert cache.has_meta? 34 | end 35 | end 36 | end 37 | end -------------------------------------------------------------------------------- /test/meta_images_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class MetaImagesTest < Minitest::Test 5 | def setup 6 | flush 7 | end 8 | 9 | def test_should_find_urls 10 | url = "http://example.com/" 11 | stub_request_file("html.html", url) 12 | urls = MetaImages.find_urls(url) 13 | assert urls.length == 2 14 | assert urls.map(&:to_s).include?("http://example.com/image/og_image.jpg") 15 | assert urls.map(&:to_s).include?("http://example.com/image/twitter_image.jpg") 16 | end 17 | 18 | def test_should_not_download_file 19 | url = "http://example.com/" 20 | stub_request(:get, url).to_return(status: 404) 21 | urls = MetaImages.find_urls(url) 22 | assert urls.empty? 23 | end 24 | 25 | def test_should_be_invalid_url 26 | url = "http://invalid\\.com" 27 | assert_raises(Addressable::URI::InvalidURIError) do 28 | MetaImages.new(url).find_urls 29 | end 30 | end 31 | 32 | def test_should_be_invalid_no_host 33 | url = "invalid" 34 | assert_raises(Addressable::URI::InvalidURIError) do 35 | MetaImages.new(url).find_urls 36 | end 37 | end 38 | 39 | def test_should_determine_download_status 40 | url = "http://example.com/" 41 | stub_request_file("html.html", url) 42 | urls = MetaImages.new(url) 43 | assert urls.needs_download? 44 | urls.find_urls 45 | 46 | urls = MetaImages.new(url) 47 | assert !urls.needs_download? 48 | end 49 | 50 | def test_should_not_download_from_site_with_no_meta 51 | url = Addressable::URI.parse("http://example.com/article") 52 | cache = MetaImagesCache.new(url) 53 | cache.has_meta!(false) 54 | 55 | urls = MetaImages.new(url) 56 | assert !urls.needs_download? 
57 | end 58 | end 59 | end 60 | end -------------------------------------------------------------------------------- /test/redirect_cache_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class RedirectCacheTest < Minitest::Test 6 | 7 | def setup 8 | flush 9 | end 10 | 11 | def test_should_collapse_stable_redirects 12 | feed_id = 2 13 | 14 | redirect1 = Redirect.new(feed_id, status: 301, from: "http://example.com", to: "http://example.com/second") 15 | redirect2 = Redirect.new(feed_id, status: 301, from: "http://example.com/second", to: "http://example.com/third") 16 | redirect3 = Redirect.new(feed_id, status: 301, from: "http://example.com/third", to: "http://example.com/final") 17 | 18 | (RedirectCache::PERSIST_AFTER).times do 19 | RedirectCache.new(feed_id).save([redirect1, redirect2]) 20 | end 21 | 22 | assert_nil RedirectCache.new(feed_id).read 23 | 24 | RedirectCache.new(feed_id).save([redirect1, redirect2]) 25 | 26 | assert_equal(redirect2.to, RedirectCache.new(feed_id).read) 27 | 28 | (RedirectCache::PERSIST_AFTER + 1).times do 29 | RedirectCache.new(feed_id).save([redirect2, redirect3]) 30 | end 31 | 32 | assert_equal(redirect3.to, RedirectCache.new(feed_id).read) 33 | end 34 | 35 | def test_should_not_temporary_redirects 36 | redirect1 = Redirect.new(1, status: 302, from: "http://example.com", to: "http://example.com/second") 37 | assert_nil RedirectCache.new(1).save([redirect1]) 38 | end 39 | 40 | def test_should_not_save_empty_redirects 41 | assert_nil RedirectCache.new(1).save([]) 42 | end 43 | end 44 | end 45 | end -------------------------------------------------------------------------------- /test/redirect_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class RedirectTest < Minitest::Test 6 | def test_should_have_cache_key 7 | redirect = Redirect.new(1, status: 301, from: "http://example.com", to: "http://example.com/final") 8 | assert_equal("3981c0f11e525f3f0f4498a238f448957ff1929c", redirect.cache_key) 9 | end 10 | end 11 | end 12 | end -------------------------------------------------------------------------------- /test/support/www/atom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Feedbin 4 | A fast, simple RSS feed reader that delivers a great reading experience. 5 | 6 | 7 | 8 | 2016-06-03T13:25:02-05:00 9 | https://feedbin.com 10 | 11 | Feedbin 12 | support@feedbin.com 13 | 14 | 15 | 16 | 17 | Subscribe to Email Newsletters in Feedbin 18 | 19 | Ben Ubois 20 | 21 | 22 | 23 | 2016-02-03T15:37:25-06:00 24 | 25 | 26 | 2016-02-03T00:00:00-06:00 27 | 28 | /2016/02/03/subscribe-to-email-newsletters-in-feedbin 29 | You can now receive email newsletters in Feedbin. Newsletter Subscriptions To use this feature, go to the settings page and find your secret Feedbin email address. Use this email address whenever you sign up for an email newsletter. Anything sent to it will show up as a feed in Feedbin,... 
30 | <p>You can now receive email newsletters in Feedbin.</p> 31 | 32 | <figure> 33 | <a href="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2016-02-03/email-large-ce9a0b27d895b4cb89429bb1d5773e0d1a394ef225dab5c0ced8c4419ed05bae.png"><img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2016-02-03/email-large-ce9a0b27d895b4cb89429bb1d5773e0d1a394ef225dab5c0ced8c4419ed05bae.png" /></a> 34 | <figcaption>Newsletter Subscriptions</figcaption> 35 | </figure> 36 | 37 | <p>To use this feature, go to the <a href="https://feedbin.com/settings">settings</a> page and find your secret Feedbin email address. Use this email address whenever you sign up for an email newsletter. Anything sent to it will show up as a feed in Feedbin, grouped by sender.</p> 38 | 39 | <p>Reading email in an email app feels like work to me. However, there’s a certain class of email that I <em>want</em> to enjoy reading, and Feedbin is where I go when I want to read for pleasure.</p> 40 | 41 | <p>For example, many great websites offer subscription content, usually with an email newsletter component. Not only do I enjoy the premium content from these sites, but I believe this a great way forward for people to support writers. Personally I have paid subscriptions to four of these including:</p> 42 | 43 | <ul> 44 | <li><a href="https://www.macstories.net">MacStories</a></li> 45 | <li><a href="http://www.aboveavalon.com">Above Avalon</a></li> 46 | <li><a href="https://stratechery.com">Stratechery</a></li> 47 | <li><a href="https://sixcolors.com">Six Colors</a></li> 48 | </ul> 49 | 50 | <p>This feature is also great for mailing lists and product announcement emails and since it’s just a regular feed, it will sync with your favorite native app as well.</p> 51 | 52 | 53 | 54 | 55 | Feedbin Notifier vs. Notify by Facebook 56 | 57 | Ben Ubois 58 | 59 | 60 | 61 | 2015-11-11T15:44:00-06:00 62 | 63 | 64 | 2015-11-11T00:00:00-06:00 65 | 66 | /2015/11/11/feedbin-notifier-vs-notify-by-facebook 67 | Today, Notify by Facebook was released. Conceptually, this is a similar app to Feedbin Notifier, which was released two days ago. The timing is coincidental but still interesting. Left: Feedbin Notifier. Right: Notify by Facebook. A similar coincidence happened when Feedbin launched, which is that two days later Google announced... 68 | <p>Today, <a href="https://notify.co/">Notify by Facebook</a> was released. Conceptually, this is a similar app to <a href="https://feedbin.com/notifier">Feedbin Notifier</a>, which was released <a href="https://feedbin.com/blog/2015/11/08/notifier/">two days ago</a>. The timing is coincidental but still interesting.</p> 69 | 70 | <figure> 71 | <a href="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-11-11/comparison_large-c56f942852b4f4aafba371de0aa2774be8f3888b696a2aadf4a122f97516bcca.png"><img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-11-11/comparison-12756a9e8953957f3167c3ee6ea6f33a8ec4bf2a742a6455af6fcb9d2c3e245e.png" /></a> 72 | <figcaption>Left: Feedbin Notifier. Right: Notify by Facebook.</figcaption> 73 | </figure> 74 | 75 | <p>A similar coincidence happened when <a href="https://feedbin.com/blog/2013/03/12/rediscover-rss/">Feedbin launched</a>, which is that two days later <a href="http://googlereader.blogspot.com/2013/03/powering-down-google-reader.html">Google announced they were shutting down Reader</a>. Weird timing.</p> 76 | 77 | <p>In my mind, Feedbin Notifier offers many advantages over Notify. 
The biggest reason to choose Feedbin Notifier is that it works with any source that offers an RSS feed, while Facebook has a limited number of built in sources.</p> 78 | 79 | <table> 80 | <thead> 81 | <tr> 82 | <th style="text-align: left">Feature</th> 83 | <th style="text-align: center">Feedbin Notifier</th> 84 | <th style="text-align: right">Notify by Facebook</th> 85 | </tr> 86 | </thead> 87 | <tbody> 88 | <tr> 89 | <td style="text-align: left">Sources</td> 90 | <td style="text-align: center">Anything with an RSS feed</td> 91 | <td style="text-align: right">72</td> 92 | </tr> 93 | <tr> 94 | <td style="text-align: left">Apple Watch App</td> 95 | <td style="text-align: center">Yes</td> 96 | <td style="text-align: right">No</td> 97 | </tr> 98 | <tr> 99 | <td style="text-align: left">Read offline</td> 100 | <td style="text-align: center">Yes</td> 101 | <td style="text-align: right">No</td> 102 | </tr> 103 | <tr> 104 | <td style="text-align: left">Spotlight Integration</td> 105 | <td style="text-align: center">Yes</td> 106 | <td style="text-align: right">No</td> 107 | </tr> 108 | <tr> 109 | <td style="text-align: left">Sync</td> 110 | <td style="text-align: center">Yes</td> 111 | <td style="text-align: right">No</td> 112 | </tr> 113 | <tr> 114 | <td style="text-align: left">Privacy</td> 115 | <td style="text-align: center">Yes</td> 116 | <td style="text-align: right">LOL</td> 117 | </tr> 118 | <tr> 119 | <td style="text-align: left">Price</td> 120 | <td style="text-align: center">$3/mo</td> 121 | <td style="text-align: right">Free</td> 122 | </tr> 123 | </tbody> 124 | </table> 125 | 126 | <p>This comparison is obviously biased. However, it is worth pointing out that Notify costs nothing, while Notifier only works with a paid Feedbin subscription.</p> 127 | 128 | <p>I think that having a large free competitor like this validates the idea. It also creates a market for a premium, more fully-featured version, which is what Feedbin Notifier is. Competing with Free is nothing new, it’s what Feedbin has been doing since day one.</p> 129 | 130 | 131 | 132 | 133 | Feedbin Notifier for iPhone, iPad and Apple Watch 134 | 135 | Ben Ubois 136 | 137 | 138 | 139 | 2015-11-09T12:14:00-06:00 140 | 141 | 142 | 2015-11-08T00:00:00-06:00 143 | 144 | /2015/11/08/notifier 145 | There are already many great full-featured apps that work with Feedbin. Feedbin Notifier aims to be different. Feedbin Notifier is a notifications based reader. The idea is to select the handful of feeds or keywords you care about most. Then when Feedbin matches an article, it will send a push... 146 | <p>There are already many great full-featured apps that work with Feedbin. <a href="/notifier">Feedbin Notifier</a> aims to be different.</p> 147 | 148 | <p><a href="https://itunes.apple.com/app/feedbin-notifier/id996164128?mt=8"><img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-11-09/notifier-7164c36f94df6fbe9cec47d9e8c4f4d2ee04460e28ea258e41d8f33ef843b83a.png" class="image-notifier" /></a></p> 149 | 150 | <p>Feedbin Notifier is a notifications based reader. The idea is to select the handful of feeds or keywords you care about most. Then when Feedbin matches an article, it will send a push notification to your iPhone, iPad or Apple Watch, keeping you informed throughout the day.</p> 151 | 152 | <p>This way, Notification Center becomes the primary interface for catching up on the stories that are important to you. 
You’ll see articles along side your email and other notifications allowing tell at a glance if there’s anything you want to read right away.</p> 153 | 154 | <p>Feedbin is a free universal app for iPhone, iPad and the Apple Watch. On the Apple Watch you can read full articles right away or for a better reading experience use Handoff to continue reading on your iPhone or iPad.</p> 155 | 156 | <p>Hope you enjoy it! Would love to <a href="https://twitter.com/feedbin">hear your feedback</a>.</p> 157 | 158 | 159 | 160 | 161 | Image Previews 162 | 163 | Ben Ubois 164 | 165 | 166 | 167 | 2015-10-22T00:00:00-05:00 168 | 169 | 170 | 2015-10-22T00:00:00-05:00 171 | 172 | /2015/10/22/image-previews 173 | Feedbin now features image previews in the center column. Feedbin, now with more images. The most important thing to me when building this feature was that only images that are reasonably high quality would show up here. To do this Feedbin uses a set of criteria that an image must... 174 | <p>Feedbin now features image previews in the center column.</p> 175 | 176 | <figure> 177 | <a href="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-10-22/screenshot_one_large-b08bfd8618a62a09fbad6b70ba28c973a70f1815a2e4fe00a78392b76f05720b.jpg"> 178 | <img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-10-22/screenshot_one_small-8e3180a61ba1fb72768ab7e21037dd431d40c07e64d146a24adfed06e6fa6158.jpg" /> 179 | </a> 180 | <figcaption>Feedbin, now with more images.</figcaption> 181 | </figure> 182 | 183 | <p>The most important thing to me when building this feature was that only images that are reasonably high quality would show up here. To do this Feedbin uses a set of criteria that an image must meet in order to be chosen.</p> 184 | 185 | <p>Feedbin can find a variety of images including:</p> 186 | 187 | <ul> 188 | <li>Images in the RSS post</li> 189 | <li>Poster frames from YouTube and Vimeo embeds</li> 190 | <li><a href="http://ogp.me/">Open Graph</a> and <a href="https://dev.twitter.com/cards/overview">Twitter Card</a> meta data</li> 191 | </ul> 192 | 193 | <p>One of my favorite operations that Feedbin does to ensure the quality of these image previews is facial detection using <a href="http://opencv.org/">OpenCV</a>. By getting a rough idea about where faces in an image might be, Feedbin is able to get a better crop. I first saw this idea used in <a href="http://blog.iconfactory.com/2015/06/twitterrifics-new-facial-recognition-keeps-faces-front-center/">Twitterrific</a> and loved the results.</p> 194 | 195 | <figure> 196 | <img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-10-22/crop_smart-cf421c97e5ff5181774b0655490144581705d8cd1fea64e9045a23814c72daa7.jpg" /> 197 | <figcaption>Crop with face detection. Sample image from <a href="https://thegreatdiscontent.com/interview/jim-riswold">The Great Discontent</a>.</figcaption> 198 | </figure> 199 | 200 | <figure> 201 | <img src="https://dhy5vgj5baket.cloudfront.net/assets-site/blog/2015-10-22/crop_naive-16be8a0d9b8f9edbfcc413049e68cc17354a18678b3adb6efd92787684e7ccac.jpg" /> 202 | <figcaption>Crop without face detection. 
Sample image from <a href="https://thegreatdiscontent.com/interview/jim-riswold">The Great Discontent</a>.</figcaption> 203 | </figure> 204 | 205 | <p>Adding images is a big visual change and if you prefer the old look you can turn off image previews in the <a href="https://feedbin.com/settings/appearance">Appearance settings</a>.</p> 206 | 207 | 208 | 209 | 210 | Link Opener for Chrome 211 | 212 | Ben Ubois 213 | 214 | 215 | 216 | 2015-03-09T00:00:00-05:00 217 | 218 | 219 | 2015-03-09T00:00:00-05:00 220 | 221 | /2015/03/09/link-opener-for-chrome 222 | This official extension restores the ability for Chrome users to open article links in a background tab using a configurable keyboard shortcut. Previously this was possible without an extension, however the Chrome team recently removed this feature. The default shortcut to open a link is option/alt + v. This can... 223 | <p>This <a href="https://chrome.google.com/webstore/detail/feedbin-link-opener/naflkhnfmneiigdcphekaemdmeajiand">official extension</a> restores the ability for Chrome users to open article links in a background tab using a configurable keyboard shortcut.</p> 224 | 225 | <p>Previously this was possible without an extension, however the Chrome team <a href="https://code.google.com/p/chromium/issues/detail?id=456910">recently removed this feature</a>.</p> 226 | 227 | <p>The default shortcut to open a link is <code class="highlighter-rouge">option/alt + v</code>. This can be customized in Chrome’s <a href="chrome://extensions/configureCommands">Keyboard Shortcuts for Extensions and Apps</a> page.</p> 228 | 229 | <p>The <a href="https://github.com/feedbin/feedbin-link-opener-chrome">extension is open source</a> and ideas for improvements are welcome.</p> 230 | 231 | <p>An extension for Safari is not necessary because WebKit still supports the browser API to create background tabs.</p> 232 | 233 | <p><strong>Update:</strong> Martijn van der Ven has created a <a href="https://github.com/Zegnat/feedbin-link-opener-firefox/releases/">FireFox version of this extension</a>. Thanks Martijn!</p> 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /test/support/www/html.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /test/support/www/image.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feedbin/crawler/9135bb3b506e07453db2002ceb8bb6491826990f/test/support/www/image.jpeg -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | require "minitest/autorun" 2 | require "webmock/minitest" 3 | 4 | unless ENV["CI"] 5 | socket = Socket.new(:INET, :STREAM, 0) 6 | socket.bind(Addrinfo.tcp("127.0.0.1", 0)) 7 | port = socket.local_address.ip_port 8 | socket.close 9 | 10 | ENV["REDIS_URL"] = "redis://localhost:%d" % port 11 | redis_test_instance = IO.popen("redis-server --port %d --save '' --appendonly no" % port) 12 | 13 | Minitest.after_run do 14 | Process.kill("INT", redis_test_instance.pid) 15 | end 16 | end 17 | 18 | require "sidekiq/testing" 19 | Sidekiq::Testing.fake! 
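# Note (added): Sidekiq::Testing.fake! routes perform_async calls to in-memory
# queues instead of Redis, so tests can assert on Worker.jobs and Sidekiq::Queues
# without anything actually running. Tests that need jobs executed wrap the call
# in Sidekiq::Testing.inline! { ... }, as the image crawler tests do. A hedged
# sketch with a hypothetical worker:
#
#   SomeWorker.perform_async(1)
#   SomeWorker.jobs.size # => 1, nothing has executed yet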
20 | Sidekiq.logger.level = Logger::WARN 21 | 22 | require_relative "../lib/image" 23 | require_relative "../lib/refresher" 24 | 25 | def flush 26 | Sidekiq::Worker.clear_all 27 | Sidekiq.redis do |redis| 28 | redis.flushdb 29 | end 30 | end 31 | 32 | def support_file(file_name) 33 | path = File.join Dir.tmpdir, SecureRandom.hex 34 | FileUtils.cp File.join("test/support/www", file_name), path 35 | path 36 | end 37 | 38 | def stub_request_file(file, url, options = {}) 39 | defaults = {body: File.new(support_file(file)), status: 200} 40 | stub_request(:get, url) 41 | .to_return(defaults.merge(options)) 42 | end 43 | 44 | def load_xml 45 | File.read("test/support/www/atom.xml") 46 | end 47 | 48 | def random_string 49 | (0...50).map { ("a".."z").to_a[rand(26)] }.join 50 | end 51 | 52 | def aws_copy_body 53 | <<~EOT 54 | <?xml version="1.0" encoding="UTF-8"?> 55 | <CopyObjectResult> 56 | <ETag>string</ETag> 57 | <LastModified>Tue, 02 Mar 2021 12:58:45 GMT</LastModified> 58 | </CopyObjectResult> 59 | EOT 60 | end 61 | 62 | class EntryImage 63 | include Sidekiq::Worker 64 | def perform(*args) 65 | end 66 | end 67 | -------------------------------------------------------------------------------- /test/throttle_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | module Crawler 4 | module Refresher 5 | class ThrottleTest < Minitest::Test 6 | 7 | def setup 8 | flush 9 | end 10 | 11 | def test_throttled 12 | ENV["THROTTLED_HOSTS"] = "example.com" 13 | assert Throttle.throttled?("https://www.example.com", Time.now.to_i) 14 | assert_equal(false, Throttle.throttled?("https://www.example.com", Time.now.to_i - (Throttle::TIMEOUT * 2))) 15 | assert_equal(false, Throttle.throttled?("https://www.example.com", nil)) 16 | assert_equal(false, Throttle.throttled?("https://www.not-example.com", Time.now.to_i)) 17 | assert_equal(false, Throttle.throttled?(nil, nil)) 18 | end 19 | end 20 | end 21 | end -------------------------------------------------------------------------------- /test/timer_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | module Crawler 3 | module Image 4 | class TimerTest < Minitest::Test 5 | def test_should_time_out 6 | time = 0.11 7 | timer = Timer.new(time) 8 | 9 | sleep(time) 10 | 11 | assert timer.expired? 12 | 13 | elapsed = time + 0.01 14 | assert_in_delta(elapsed, timer.elapsed, 0.05, "Around #{elapsed}s should have elapsed.") 15 | end 16 | 17 | end 18 | end 19 | end --------------------------------------------------------------------------------