├── .document ├── .github ├── dependabot.yml └── workflows │ ├── release.yml │ └── ruby.yml ├── .gitignore ├── .rspec ├── .rubocop.yml ├── .rubocop_todo.yml ├── .ruby-version ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── Gemfile ├── Guardfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── bin ├── console └── setup ├── docker-compose.yml ├── exe └── image_scraper ├── image_scraper.gemspec ├── lib ├── image_scraper.rb └── image_scraper │ ├── client.rb │ ├── railtie.rb │ ├── util.rb │ └── version.rb ├── sig └── image_scraper.rbs └── spec ├── cassettes └── ImageScraper_Client │ ├── _image_urls │ ├── handles_url_with_unescaped_spaces.yml │ ├── scrapes_absolute_paths.yml │ └── scrapes_relative_paths.yml │ ├── _page_images │ ├── handldes_image_urls_that_include_square_brackets.yml │ └── handles_unescaped_urls.yml │ ├── _stylesheet_images │ ├── handles_404s.yml │ ├── handles_stylesheet_image_with_a_relative_url.yml │ └── scrapes_stylesheet_images.yml │ ├── _stylesheets │ └── lists_relative_path_stylesheets.yml │ └── foo │ └── something.yml ├── image_scraper ├── client_spec.rb └── util_spec.rb ├── image_scraper_spec.rb ├── spec_helper.rb └── support ├── extra_whitespace.html ├── relative_image_url.css ├── relative_image_url.html ├── space in url.html ├── stylesheet_test.html ├── stylesheet_unescaped_image.html └── unescaped_image.css /.document: -------------------------------------------------------------------------------- 1 | lib/**/*.rb 2 | bin/* 3 | - 4 | features/**/*.feature 5 | LICENSE.txt 6 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "bundler" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | on: 3 | push: 4 | branches: [master] 5 | jobs: 6 | release: 7 | runs-on: ubuntu-latest 8 | env: 9 | GEM_NAME: image_scraper 10 | steps: 11 | - uses: google-github-actions/release-please-action@v3 12 | id: release 13 | with: 14 | bump-minor-pre-major: true 15 | package-name: image_scraper 16 | release-type: ruby 17 | version-file: "lib/image_scraper/version.rb" 18 | - uses: actions/checkout@v3 19 | - name: install ruby 20 | if: "${{ steps.release.outputs.release_created }}" 21 | uses: ruby/setup-ruby@v1 22 | with: 23 | bundler-cache: true 24 | - name: bundle 25 | if: "${{ steps.release.outputs.release_created }}" 26 | run: | 27 | bundle config unset --local deployment 28 | bundle 29 | - name: publish gem 30 | if: "${{ steps.release.outputs.release_created }}" 31 | uses: dawidd6/action-publish-gem@v1 32 | with: 33 | api_key: "${{secrets.RUBYGEMS_API_KEY}}" 34 | github_token: "${{secrets.GITHUB_TOKEN}}" 35 | -------------------------------------------------------------------------------- /.github/workflows/ruby.yml: -------------------------------------------------------------------------------- 1 | name: ruby 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | jobs: 10 | build-and-run-tests: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Ruby 16 | uses: ruby/setup-ruby@v1 17 | with: 18 | bundler-cache: true 19 | 
- name: Run tests 20 | run: bundle exec rspec spec 21 | - name: Run rubocop 22 | run: bundle exec rubocop 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # rspec failure tracking 2 | .rspec_status 3 | 4 | /.bundle/ 5 | /.yardoc 6 | /Gemfile*lock 7 | /_yardoc/ 8 | /coverage/ 9 | /doc/ 10 | /pkg/ 11 | /spec/reports/ 12 | /tmp/ 13 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --color 2 | --require spec_helper 3 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | inherit_from: .rubocop_todo.yml 2 | 3 | AllCops: 4 | SuggestExtensions: false 5 | NewCops: enable 6 | 7 | require: 8 | - rubocop-rspec 9 | 10 | Layout/SpaceAroundMethodCallOperator: 11 | Enabled: true 12 | Lint/RaiseException: 13 | Enabled: True 14 | Lint/StructNewOverride: 15 | Enabled: True 16 | Style/ExponentialNotation: 17 | Enabled: True 18 | Style/HashEachMethods: 19 | Enabled: True 20 | Style/HashTransformKeys: 21 | Enabled: True 22 | Style/HashTransformValues: 23 | Enabled: True 24 | Layout/EmptyLinesAroundAttributeAccessor: 25 | Enabled: True 26 | Lint/DeprecatedOpenSSLConstant: 27 | Enabled: True 28 | Lint/MixedRegexpCaptureTypes: 29 | Enabled: True 30 | Style/RedundantRegexpCharacterClass: 31 | Enabled: False 32 | Style/RedundantRegexpEscape: 33 | Enabled: False 34 | Style/SlicingWithRange: 35 | Enabled: True 36 | Style/AccessorGrouping: 37 | Enabled: True 38 | Style/BisectedAttrAccessor: 39 | Enabled: True 40 | Style/RedundantAssignment: 41 | Enabled: True 42 | Style/RedundantFetchBlock: 43 | Enabled: True 44 | -------------------------------------------------------------------------------- 
/.rubocop_todo.yml: -------------------------------------------------------------------------------- 1 | # This configuration was generated by 2 | # `rubocop --auto-gen-config` 3 | # on 2021-12-21 21:23:53 UTC using RuboCop version 1.23.0. 4 | # The point is for the user to remove these configuration records 5 | # one by one as the offenses are removed from the code base. 6 | # Note that changes in the inspected code, or installation of new 7 | # versions of RuboCop, may require this file to be generated again. 8 | 9 | # Offense count: 1 10 | # Configuration parameters: Include. 11 | # Include: **/*.gemspec 12 | Gemspec/RequiredRubyVersion: 13 | Exclude: 14 | - 'image_scraper.gemspec' 15 | 16 | # Offense count: 1 17 | # Cop supports --auto-correct. 18 | # Configuration parameters: EnforcedStyle. 19 | # SupportedStyles: empty_lines, no_empty_lines 20 | Layout/EmptyLinesAroundBlockBody: 21 | Exclude: 22 | - 'spec/image_scraper/util_spec.rb' 23 | 24 | # Offense count: 1 25 | Lint/DuplicateRescueException: 26 | Exclude: 27 | - 'lib/image_scraper/client.rb' 28 | 29 | # Offense count: 1 30 | # Cop supports --auto-correct. 31 | Lint/OrderedMagicComments: 32 | Exclude: 33 | - 'image_scraper.gemspec' 34 | 35 | # Offense count: 2 36 | # Configuration parameters: IgnoredMethods, CountRepeatedAttributes. 37 | Metrics/AbcSize: 38 | Max: 19 39 | 40 | # Offense count: 5 41 | # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods. 42 | # IgnoredMethods: refine 43 | Metrics/BlockLength: 44 | Max: 120 45 | 46 | # Offense count: 5 47 | # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods. 48 | Metrics/MethodLength: 49 | Max: 18 50 | 51 | # Offense count: 2 52 | RSpec/AnyInstance: 53 | Exclude: 54 | - 'spec/image_scraper/client_spec.rb' 55 | 56 | # Offense count: 6 57 | # Configuration parameters: Max. 
58 | RSpec/ExampleLength: 59 | Exclude: 60 | - 'spec/image_scraper/client_spec.rb' 61 | - 'spec/image_scraper/util_spec.rb' 62 | 63 | # Offense count: 4 64 | RSpec/MultipleExpectations: 65 | Max: 5 66 | 67 | # Offense count: 1 68 | Security/Open: 69 | Exclude: 70 | - 'lib/image_scraper/client.rb' 71 | 72 | # Offense count: 1 73 | # Cop supports --auto-correct. 74 | Style/CaseLikeIf: 75 | Exclude: 76 | - 'lib/image_scraper/client.rb' 77 | 78 | # Offense count: 2 79 | # Configuration parameters: AllowedConstants. 80 | Style/Documentation: 81 | Exclude: 82 | - 'spec/**/*' 83 | - 'test/**/*' 84 | - 'lib/image_scraper/client.rb' 85 | - 'lib/image_scraper/util.rb' 86 | 87 | # Offense count: 1 88 | # Cop supports --auto-correct. 89 | # Configuration parameters: EnforcedStyle. 90 | # SupportedStyles: always, always_true, never 91 | Style/FrozenStringLiteralComment: 92 | Exclude: 93 | - 'Guardfile' 94 | 95 | # Offense count: 1 96 | # Cop supports --auto-correct. 97 | Style/IfUnlessModifier: 98 | Exclude: 99 | - 'image_scraper.gemspec' 100 | 101 | # Offense count: 1 102 | # Cop supports --auto-correct. 103 | # Configuration parameters: PreferredDelimiters. 104 | Style/PercentLiteralDelimiters: 105 | Exclude: 106 | - 'Guardfile' 107 | 108 | # Offense count: 8 109 | # Cop supports --auto-correct. 110 | # Configuration parameters: EnforcedStyle, ConsistentQuotesInMultiline. 111 | # SupportedStyles: single_quotes, double_quotes 112 | Style/StringLiterals: 113 | Exclude: 114 | - 'Guardfile' 115 | - 'spec/image_scraper/client_spec.rb' 116 | 117 | # Offense count: 2 118 | # Cop supports --auto-correct. 119 | # Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns. 
120 | # URISchemes: http, https 121 | Layout/LineLength: 122 | Max: 142 123 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | 3.2.0 2 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [Unreleased] 2 | 3 | ## [0.1.14](https://github.com/charlotte-ruby/image_scraper/compare/v0.1.13...v0.1.14) (2022-12-09) 4 | 5 | 6 | ### Bug Fixes 7 | 8 | * testing release please ([c722746](https://github.com/charlotte-ruby/image_scraper/commit/c72274671065e8329f879b9640a39fc5652bbbd7)) 9 | 10 | ## [0.1.13](https://github.com/charlotte-ruby/image_scraper/compare/v0.1.12...v0.1.13) (2022-11-25) 11 | 12 | 13 | ### Bug Fixes 14 | 15 | * bump to gems. bump ruby to 3.1.3 ([7ee3877](https://github.com/charlotte-ruby/image_scraper/commit/7ee38775bbc0e9684fe512c698a9caa1ecb3c07b)) 16 | 17 | ## [0.1.12](https://github.com/charlotte-ruby/image_scraper/compare/v0.1.11...v0.1.12) (2022-11-25) 18 | 19 | 20 | ### Bug Fixes 21 | 22 | * trying again lol ([9f55af5](https://github.com/charlotte-ruby/image_scraper/commit/9f55af55bb34d61a4a6dbf141998212229bf6ac9)) 23 | 24 | ## [0.1.11](https://github.com/charlotte-ruby/image_scraper/compare/v0.1.10...v0.1.11) (2022-11-25) 25 | 26 | 27 | ### Bug Fixes 28 | 29 | * bump version ([936f850](https://github.com/charlotte-ruby/image_scraper/commit/936f850b0f8f3d87607d28a5f3a4a088975b2ada)) 30 | 31 | ### [0.1.10](https://www.github.com/charlotte-ruby/image_scraper/compare/v0.1.7...v0.1.10) (2022-04-17) 32 | 33 | ### Bug Fixes 34 | 35 | * bump ruby ([3519010](https://github.com/charlotte-ruby/image_scraper/commit/351901036ed4b4b9432814ce05bcd5c67ae0c332)) 36 | 37 | ## [0.1.9] - 2022-02-21 38 | 39 | - deprecate jeweler in favor of bundler and rake tasks 40 | 
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 8 | 9 | ## Our Standards 10 | 11 | Examples of behavior that contributes to a positive environment for our community include: 12 | 13 | * Demonstrating empathy and kindness toward other people 14 | * Being respectful of differing opinions, viewpoints, and experiences 15 | * Giving and gracefully accepting constructive feedback 16 | * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience 17 | * Focusing on what is best not just for us as individuals, but for the overall community 18 | 19 | Examples of unacceptable behavior include: 20 | 21 | * The use of sexualized language or imagery, and sexual attention or 22 | advances of any kind 23 | * Trolling, insulting or derogatory comments, and personal or political attacks 24 | * Public or private harassment 25 | * Publishing others' private information, such as a physical or email 26 | address, without their explicit permission 27 | * Other conduct which could reasonably be considered inappropriate in a 28 | professional setting 29 | 30 | ## Enforcement Responsibilities 31 | 32 | Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take 
appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. 33 | 34 | Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. 35 | 36 | ## Scope 37 | 38 | This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 39 | 40 | ## Enforcement 41 | 42 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at matt@invalid8.com. All complaints will be reviewed and investigated promptly and fairly. 43 | 44 | All community leaders are obligated to respect the privacy and security of the reporter of any incident. 45 | 46 | ## Enforcement Guidelines 47 | 48 | Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: 49 | 50 | ### 1. Correction 51 | 52 | **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. 53 | 54 | **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. 55 | 56 | ### 2. Warning 57 | 58 | **Community Impact**: A violation through a single incident or series of actions. 59 | 60 | **Consequence**: A warning with consequences for continued behavior. 
No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. 61 | 62 | ### 3. Temporary Ban 63 | 64 | **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. 65 | 66 | **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. 67 | 68 | ### 4. Permanent Ban 69 | 70 | **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. 71 | 72 | **Consequence**: A permanent ban from any sort of public interaction within the community. 73 | 74 | ## Attribution 75 | 76 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, 77 | available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 78 | 79 | Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). 80 | 81 | [homepage]: https://www.contributor-covenant.org 82 | 83 | For answers to common questions about this code of conduct, see the FAQ at 84 | https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 
85 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ruby:3.2-alpine 2 | 3 | WORKDIR /usr/src/app 4 | 5 | RUN apk update && \ 6 | apk add gcc gcompat git \ 7 | libxml2-dev libxslt-dev \ 8 | make musl-dev 9 | 10 | RUN mkdir -p lib/image_scraper 11 | COPY lib/image_scraper/version.rb ./lib/image_scraper/version.rb 12 | COPY .ruby-version image_scraper.gemspec Gemfile Gemfile.lock ./ 13 | 14 | RUN bundle install 15 | COPY . . 16 | 17 | CMD ["bundle", "exec", "rspec"] 18 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source 'http://rubygems.org' 4 | 5 | ruby File.read('.ruby-version').chomp 6 | 7 | gemspec 8 | 9 | gem 'rake', '~> 13.0' 10 | gem 'rspec', '~> 3.4' 11 | gem 'rubocop', '~> 1.21' 12 | 13 | group :development do 14 | gem 'bundler', '~> 2.3' 15 | gem 'guard-rspec', require: false 16 | gem 'pry' 17 | gem 'rubocop-rspec', require: false 18 | gem 'test-unit' 19 | gem 'vcr', '~> 6.0' 20 | gem 'webmock' 21 | end 22 | -------------------------------------------------------------------------------- /Guardfile: -------------------------------------------------------------------------------- 1 | # A sample Guardfile 2 | # More info at https://github.com/guard/guard#readme 3 | 4 | ## Uncomment and set this to only include directories you want to watch 5 | # directories %w(app lib config test spec features) \ 6 | # .select{|d| Dir.exist?(d) ? d : UI.warning("Directory #{d} does not exist")} 7 | 8 | ## Note: if you are using the `directories` clause above and you are not 9 | ## watching the project directory ('.'), then you will want to move 10 | ## the Guardfile to a watched dir and symlink it back, e.g. 
11 | # 12 | # $ mkdir config 13 | # $ mv Guardfile config/ 14 | # $ ln -s config/Guardfile . 15 | # 16 | # and, you'll have to watch "config/Guardfile" instead of "Guardfile" 17 | 18 | # NOTE: The cmd option is now required due to the increasing number of ways 19 | # rspec may be run, below are examples of the most common uses. 20 | # * bundler: 'bundle exec rspec' 21 | # * bundler binstubs: 'bin/rspec' 22 | # * spring: 'bin/rspec' (This will use spring if running and you have 23 | # installed the spring binstubs per the docs) 24 | # * zeus: 'zeus rspec' (requires the server to be started separately) 25 | # * 'just' rspec: 'rspec' 26 | 27 | guard :rspec, cmd: "bundle exec rspec" do 28 | require "guard/rspec/dsl" 29 | dsl = Guard::RSpec::Dsl.new(self) 30 | 31 | # Feel free to open issues for suggestions and improvements 32 | 33 | # RSpec files 34 | rspec = dsl.rspec 35 | watch(rspec.spec_helper) { rspec.spec_dir } 36 | watch(rspec.spec_support) { rspec.spec_dir } 37 | watch(rspec.spec_files) 38 | 39 | # Ruby files 40 | ruby = dsl.ruby 41 | dsl.watch_spec_files_for(ruby.lib_files) 42 | 43 | # Rails files 44 | rails = dsl.rails(view_extensions: %w(erb haml slim)) 45 | dsl.watch_spec_files_for(rails.app_files) 46 | dsl.watch_spec_files_for(rails.views) 47 | 48 | watch(rails.controllers) do |m| 49 | [ 50 | rspec.spec.call("routing/#{m[1]}_routing"), 51 | rspec.spec.call("controllers/#{m[1]}_controller"), 52 | rspec.spec.call("acceptance/#{m[1]}") 53 | ] 54 | end 55 | 56 | # Rails config changes 57 | watch(rails.spec_helper) { rspec.spec_dir } 58 | watch(rails.routes) { "#{rspec.spec_dir}/routing" } 59 | watch(rails.app_controller) { "#{rspec.spec_dir}/controllers" } 60 | 61 | # Capybara features specs 62 | watch(rails.view_dirs) { |m| rspec.spec.call("features/#{m[1]}") } 63 | watch(rails.layouts) { |m| rspec.spec.call("features/#{m[1]}") } 64 | 65 | # Turnip features and steps 66 | watch(%r{^spec/acceptance/(.+)\.feature$}) 67 | 
watch(%r{^spec/acceptance/steps/(.+)_steps\.rb$}) do |m| 68 | Dir[File.join("**/#{m[1]}.feature")][0] || "spec/acceptance" 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 John McAliley 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ImageScraper 2 | [![ruby](https://github.com/charlotte-ruby/image_scraper/actions/workflows/ruby.yml/badge.svg)](https://github.com/charlotte-ruby/image_scraper/actions/workflows/ruby.yml) 3 | 4 | Simple utility that pulls image URLs from a web page. 5 | ## Installation 6 | 7 | Install in your application's Gemfile or as a standalone gem: 8 | 9 | ```ruby 10 | gem 'image_scraper' 11 | ``` 12 | 13 | And then execute: 14 | 15 | ``` 16 | $ bundle install 17 | ``` 18 | 19 | Standalone install: 20 | 21 | ``` 22 | $ gem install image_scraper 23 | ``` 24 | 25 | ## Usage 26 | 27 | ```ruby 28 | options = { 29 | convert_to_absolute_url: true, # convert any relative image paths to absolute urls 30 | include_css_images: true, # include images referenced by url(...) in linked stylesheets 31 | include_css_data_images: true # include data images (data:image/gif;base64....) 32 | } 33 | 34 | image_scraper = ImageScraper::Client.new("http://www.rubygems.org", options) 35 | image_scraper.image_urls 36 | 37 | # => ["https://rubygems.org/assets/github_icon.png", "https://rubygems.org/sponsors.png"] 38 | ``` 39 | 40 | ### CLI 41 | 42 | ``` 43 | $ image_scraper https://unsplash.com | head -n 2 44 | https://images.unsplash.com/photo-1471897488648 45 | https://images.unsplash.com/photo-1590073242678 46 | ``` 47 | 48 | ## Development 49 | 50 | After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. 51 | 52 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`. Releases are done via GitHub Actions. 
53 | 54 | If you prefer to use Docker: 55 | 56 | ``` 57 | docker-compose build 58 | docker-compose run app 59 | ``` 60 | 61 | Once inside the container, run the tests and you'll see output similar to this: 62 | 63 | ``` 64 | /usr/src/app # bundle exec rspec 65 | ........................ 66 | 67 | Finished in 0.54303 seconds (files took 0.95976 seconds to load) 68 | 24 examples, 0 failures 69 | ``` 70 | 71 | ## Contributing 72 | 73 | Bug reports and pull requests are welcome on GitHub at https://github.com/charlotte-ruby/image_scraper. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/charlotte-ruby/image_scraper/blob/master/CODE_OF_CONDUCT.md). 74 | 75 | - Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet 76 | - Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it 77 | - Fork the project 78 | - Start a feature/bugfix branch 79 | - Commit and push until you are happy with your contribution 80 | - Make sure to add tests for it. This is important so I don't break it in a future version unintentionally. 81 | - Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it. 82 | 83 | ## Copyright 84 | 85 | Copyright (c) 2011 John McAliley. See LICENSE.txt for 86 | further details. 87 | 88 | ## Code of Conduct 89 | 90 | Everyone interacting in the ImageScraper project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/charlotte-ruby/image_scraper/blob/master/CODE_OF_CONDUCT.md). 
91 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'bundler/gem_tasks' 4 | require 'rspec/core/rake_task' 5 | 6 | RSpec::Core::RakeTask.new(:spec) 7 | 8 | require 'rubocop/rake_task' 9 | 10 | RuboCop::RakeTask.new 11 | 12 | task default: %i[spec rubocop] 13 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'bundler/setup' 5 | require 'image_scraper' 6 | 7 | # You can add fixtures and/or initialization code here to make experimenting 8 | # with your gem easier. You can also use a different console, if you like. 9 | 10 | # (If you use this, don't forget to add pry to your Gemfile!) 11 | # require "pry" 12 | # Pry.start 13 | 14 | require 'irb' 15 | IRB.start(__FILE__) 16 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | app: 4 | build: . 
5 | command: /bin/sh 6 | volumes: 7 | - .:/usr/src/app 8 | -------------------------------------------------------------------------------- /exe/image_scraper: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'image_scraper' 5 | 6 | url = ARGV[0].to_s 7 | 8 | unless url.length.positive? 9 | puts 'usage: image_scraper ' 10 | exit 1 11 | end 12 | 13 | image_scraper = ImageScraper::Client.new(url) 14 | scraped_urls = image_scraper.image_urls 15 | 16 | puts scraped_urls * "\n" if scraped_urls.count.positive? 17 | -------------------------------------------------------------------------------- /image_scraper.gemspec: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative 'lib/image_scraper/version' 4 | 5 | Gem::Specification.new do |spec| 6 | spec.name = 'image_scraper' 7 | spec.version = ImageScraper::VERSION 8 | spec.authors = ['John McAliley', 'Matt McMahand'] 9 | spec.email = ['john.mcaliley@gmail.com', 'matt@invalid8.com'] 10 | spec.summary = 'Simple utility to pull image urls from web page' 11 | spec.description = spec.summary 12 | spec.homepage = 'http://github.com/charlotte-ruby/image_scraper' 13 | spec.license = 'MIT' 14 | spec.required_ruby_version = '>= 2.6.0' 15 | 16 | spec.metadata['homepage_uri'] = spec.homepage 17 | spec.metadata['source_code_uri'] = spec.homepage 18 | spec.metadata['changelog_uri'] = File.join(spec.homepage, 'blob/master/CHANGELOG.md') 19 | 20 | spec.metadata['rubygems_mfa_required'] = 'true' 21 | 22 | begin 23 | files = (result = `git ls-files -z`.split "\0").empty? ? Dir['**/*'] : result 24 | rescue StandardError 25 | files = Dir['**/*'] 26 | end 27 | 28 | # Specify which files should be added to the gem when it is released. 
29 | spec.files = files.grep_v(/^\A(?:(?:test|spec|features|.git|sig))/) 30 | 31 | spec.bindir = 'exe' 32 | spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) } 33 | spec.require_paths = ['lib'] 34 | 35 | spec.add_dependency 'css_parser', '~> 1.11' 36 | spec.add_dependency 'nokogiri', '~> 1.13' 37 | end 38 | -------------------------------------------------------------------------------- /lib/image_scraper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'open-uri' 4 | require 'nokogiri' 5 | 6 | require_relative 'image_scraper/client' 7 | require_relative 'image_scraper/railtie' if defined?(Rails::Railtie) 8 | require_relative 'image_scraper/util' 9 | require_relative 'image_scraper/version' 10 | 11 | module ImageScraper 12 | class Error < StandardError; end 13 | end 14 | -------------------------------------------------------------------------------- /lib/image_scraper/client.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'net/http' 4 | require 'uri' 5 | require 'cgi' 6 | 7 | module ImageScraper 8 | class Client 9 | USER_AGENT = 'Mozilla/5.0 (Macintosh)' 10 | 11 | attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc 12 | attr_reader :uri, :error 13 | 14 | def initialize(url, options = {}) 15 | defaults = { convert_to_absolute_url: true, include_css_images: true, include_css_data_images: false } 16 | options.merge!(defaults) 17 | 18 | @url = url 19 | @uri = Util.convert_to_uri(url) 20 | 21 | @convert_to_absolute_url = options[:convert_to_absolute_url] 22 | @include_css_images = options[:include_css_images] 23 | @include_css_data_images = options[:include_css_data_images] 24 | 25 | begin 26 | html = fetch(@uri) 27 | rescue StandardError => e 28 | @error = e 29 | html = nil 30 | end 31 | 32 | @doc = html ? 
Nokogiri::HTML(html, nil, 'UTF-8') : nil 33 | end 34 | 35 | def fetch(url, limit = 10) 36 | raise ArgumentError, 'HTTP redirect too deep' if limit.zero? 37 | 38 | uri = Util.convert_to_uri(url) 39 | 40 | return false unless uri 41 | 42 | result = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.port == 443) do |http| 43 | request = Net::HTTP::Get.new(uri, { 'User-Agent' => USER_AGENT }) 44 | response = http.request request 45 | 46 | case response 47 | when Net::HTTPSuccess then response 48 | when Net::HTTPRedirection then fetch(response['location'], limit - 1) 49 | else 50 | response.error! 51 | end 52 | end 53 | 54 | if result.is_a? Net::HTTPOK 55 | result.body 56 | elsif result.is_a? String 57 | result 58 | end 59 | end 60 | 61 | def image_urls 62 | images = page_images 63 | images += stylesheet_images if include_css_images 64 | images.sort.uniq 65 | end 66 | 67 | def cleanup_src_value(text) 68 | text.to_s.strip! 69 | text.gsub!(' ', '%20') 70 | 71 | # escape characters that CGI::escape doesn't get 72 | text.gsub(/([{}|\^\[\]\@`])/) { |s| s } 73 | end 74 | 75 | def page_images 76 | return [] if doc.to_s.empty? 77 | 78 | doc.xpath('//img').collect do |e| 79 | src = cleanup_src_value(e['src']) 80 | next if src.empty? 81 | 82 | if convert_to_absolute_url 83 | Util.absolute_url(@uri.to_s, src) 84 | else 85 | src 86 | end 87 | end.compact 88 | end 89 | 90 | def fetch_css(url) 91 | begin 92 | file = URI.open(url) 93 | rescue StandardError 94 | return '' 95 | end 96 | 97 | begin 98 | css = file.string 99 | rescue StandardError 100 | css = File.read(file) 101 | rescue StandardError 102 | return '' 103 | end 104 | 105 | css.unpack('C*').pack('U*') 106 | end 107 | 108 | def stylesheet_images 109 | images = [] 110 | 111 | stylesheets.each do |stylesheet| 112 | css = fetch_css(stylesheet) 113 | 114 | next unless css.to_s.length.positive? 
115 | 116 | images += css.scan(/url\((.*?)\)/).collect do |image_url| 117 | image_url = Util.cleanup_url(image_url[0]) 118 | image_url = image_url.gsub(/([{}|\^\[\]\@`])/) { |s| CGI.escape(s) } # escape characters that URI.escape doesn't get 119 | if image_url.include?('data:image') && @include_css_data_images 120 | image_url 121 | else 122 | @convert_to_absolute_url ? Util.absolute_url(stylesheet, image_url) : image_url 123 | end 124 | end 125 | end 126 | images.compact 127 | end 128 | 129 | def stylesheets 130 | return [] if doc.to_s.empty? 131 | 132 | doc.xpath('//link[@rel="stylesheet"]').collect do |stylesheet| 133 | Util.absolute_url(@uri.to_s, Util.cleanup_url(stylesheet['href'])) 134 | end.compact 135 | end 136 | end 137 | end 138 | -------------------------------------------------------------------------------- /lib/image_scraper/railtie.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ImageScraper 4 | class Railtie < Rails::Railtie 5 | end 6 | end 7 | -------------------------------------------------------------------------------- /lib/image_scraper/util.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ImageScraper 4 | module Util 5 | def self.absolute_url(url, asset = nil) 6 | # TODO: - what happens when an index redirect occurs? 
7 | # Example: 'http://example.com/about' specified as url 8 | # 'style.css' specified as asset 9 | # url redirects to 'http://example.com/about/' 10 | # and serves http://example.com/about/index.html 11 | # which then links to the relative asset path 'style.css' 12 | # based on original url (http://example.com/about), 13 | # self.absolute_url gives 14 | # 'http://example.com/style.css 15 | # but should get: 16 | # 'http://example.com/about/style.css 17 | 18 | URI.parse(url).merge(URI.parse(asset.to_s)).to_s 19 | rescue StandardError 20 | nil 21 | end 22 | 23 | def self.convert_to_uri(url) 24 | if url.is_a?(URI::HTTP) 25 | url 26 | else 27 | url = url.strip 28 | url = "http://#{url}" unless url.include?('://') 29 | url = url.gsub(' ', '%20') if url.include?(' ') 30 | 31 | begin 32 | URI.parse(url) 33 | rescue URI::InvalidURIError 34 | nil 35 | end 36 | end 37 | end 38 | 39 | def self.domain(url) 40 | uri = URI.parse(url) 41 | "#{uri.scheme}://#{uri.host}" 42 | rescue StandardError 43 | print('domain error') 44 | nil 45 | end 46 | 47 | def self.path(url) 48 | URI.parse(url).path 49 | rescue StandardError 50 | nil 51 | end 52 | 53 | def self.strip_backslashes(image_url) 54 | image_url.gsub('\\', '') 55 | end 56 | 57 | def self.strip_quotes(image_url) 58 | image_url.gsub("'", '').gsub('"', '') 59 | end 60 | 61 | def self.chomp(image_url) 62 | image_url.gsub(/\s/, '') 63 | end 64 | 65 | def self.cleanup_url(image_url) 66 | ImageScraper::Util.chomp( 67 | ImageScraper::Util.strip_quotes( 68 | ImageScraper::Util.strip_backslashes(image_url || '') 69 | ) 70 | ) 71 | end 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /lib/image_scraper/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ImageScraper 4 | VERSION = '0.1.14' 5 | end 6 | -------------------------------------------------------------------------------- 
/sig/image_scraper.rbs: -------------------------------------------------------------------------------- 1 | module ImageScraper 2 | VERSION: String 3 | # See the writing guide of rbs: https://github.com/ruby/rbs#guides 4 | end 5 | -------------------------------------------------------------------------------- /spec/cassettes/ImageScraper_Client/_image_urls/handles_url_with_unescaped_spaces.yml: -------------------------------------------------------------------------------- 1 | --- 2 | http_interactions: 3 | - request: 4 | method: get 5 | uri: https://raw.github.com/syoder/image_scraper/stylesheet_fix/test/resources/space%20in%20url.html 6 | body: 7 | encoding: US-ASCII 8 | string: '' 9 | headers: 10 | User-Agent: 11 | - Mozilla/5.0 (Macintosh) 12 | Accept-Encoding: 13 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 14 | Accept: 15 | - "*/*" 16 | Host: 17 | - raw.github.com 18 | response: 19 | status: 20 | code: 301 21 | message: Moved Permanently 22 | headers: 23 | Connection: 24 | - keep-alive 25 | Content-Length: 26 | - '0' 27 | Location: 28 | - https://raw.githubusercontent.com/syoder/image_scraper/stylesheet_fix/test/resources/space%20in%20url.html 29 | Accept-Ranges: 30 | - bytes 31 | Date: 32 | - Sat, 11 Jul 2020 02:48:32 GMT 33 | Via: 34 | - 1.1 varnish 35 | Age: 36 | - '0' 37 | X-Served-By: 38 | - cache-fty21351-FTY 39 | X-Cache: 40 | - MISS 41 | X-Cache-Hits: 42 | - '0' 43 | Vary: 44 | - Accept-Encoding 45 | X-Fastly-Request-Id: 46 | - 1938ad6b24faae4219882b22499a0884f3fcfac5 47 | body: 48 | encoding: UTF-8 49 | string: '' 50 | recorded_at: Sat, 11 Jul 2020 02:48:32 GMT 51 | - request: 52 | method: get 53 | uri: https://raw.githubusercontent.com/syoder/image_scraper/stylesheet_fix/test/resources/space%20in%20url.html 54 | body: 55 | encoding: US-ASCII 56 | string: '' 57 | headers: 58 | User-Agent: 59 | - Mozilla/5.0 (Macintosh) 60 | Accept-Encoding: 61 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 62 | Accept: 63 | - "*/*" 64 | Host: 65 | - 
raw.githubusercontent.com 66 | response: 67 | status: 68 | code: 200 69 | message: OK 70 | headers: 71 | Connection: 72 | - keep-alive 73 | Content-Length: 74 | - '66' 75 | Cache-Control: 76 | - max-age=300 77 | Content-Security-Policy: 78 | - default-src 'none'; style-src 'unsafe-inline'; sandbox 79 | Content-Type: 80 | - text/plain; charset=utf-8 81 | Etag: 82 | - W/"97a67c7d07a8aa3a39a54347e00f2eb4a7b9a18080d07f38fa16538b9187a65a" 83 | Strict-Transport-Security: 84 | - max-age=31536000 85 | X-Content-Type-Options: 86 | - nosniff 87 | X-Frame-Options: 88 | - deny 89 | X-Xss-Protection: 90 | - 1; mode=block 91 | Via: 92 | - 1.1 varnish 93 | - 1.1 varnish (Varnish/6.0) 94 | X-Github-Request-Id: 95 | - B330:6FFF:8A3AC:A38C4:5F09287F 96 | Accept-Ranges: 97 | - bytes 98 | Date: 99 | - Sat, 11 Jul 2020 02:48:32 GMT 100 | X-Served-By: 101 | - cache-fty21323-FTY 102 | X-Cache: 103 | - MISS, MISS 104 | X-Cache-Hits: 105 | - 0, 0 106 | X-Timer: 107 | - S1594435712.380265,VS0,VE118 108 | Vary: 109 | - Authorization,Accept-Encoding 110 | Access-Control-Allow-Origin: 111 | - "*" 112 | X-Fastly-Request-Id: 113 | - 3ce7f01ccb996f373840a16c0c3b1756eba30c9a 114 | Expires: 115 | - Sat, 11 Jul 2020 02:53:32 GMT 116 | Source-Age: 117 | - '0' 118 | body: 119 | encoding: ASCII-8BIT 120 | string: | 121 | 122 | 123 | 124 | 125 | 126 | recorded_at: Sat, 11 Jul 2020 02:48:32 GMT 127 | recorded_with: VCR 6.0.0 128 | -------------------------------------------------------------------------------- /spec/cassettes/ImageScraper_Client/_page_images/handldes_image_urls_that_include_square_brackets.yml: -------------------------------------------------------------------------------- 1 | --- 2 | http_interactions: 3 | - request: 4 | method: get 5 | uri: http://google.com/ 6 | body: 7 | encoding: US-ASCII 8 | string: '' 9 | headers: 10 | User-Agent: 11 | - Mozilla/5.0 (Macintosh) 12 | Accept-Encoding: 13 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 14 | Accept: 15 | - "*/*" 16 | Host: 17 | - 
google.com 18 | response: 19 | status: 20 | code: 301 21 | message: Moved Permanently 22 | headers: 23 | Location: 24 | - http://www.google.com/ 25 | Content-Type: 26 | - text/html; charset=UTF-8 27 | Date: 28 | - Sat, 11 Jul 2020 03:19:08 GMT 29 | Expires: 30 | - Mon, 10 Aug 2020 03:19:08 GMT 31 | Cache-Control: 32 | - public, max-age=2592000 33 | Server: 34 | - gws 35 | Content-Length: 36 | - '219' 37 | X-Xss-Protection: 38 | - '0' 39 | X-Frame-Options: 40 | - SAMEORIGIN 41 | body: 42 | encoding: UTF-8 43 | string: "\n301 44 | Moved\n

301 Moved

\nThe document has moved\nhere.\r\n\r\n" 46 | recorded_at: Sat, 11 Jul 2020 03:19:08 GMT 47 | - request: 48 | method: get 49 | uri: http://www.google.com/ 50 | body: 51 | encoding: US-ASCII 52 | string: '' 53 | headers: 54 | User-Agent: 55 | - Mozilla/5.0 (Macintosh) 56 | Accept-Encoding: 57 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 58 | Accept: 59 | - "*/*" 60 | Host: 61 | - www.google.com 62 | response: 63 | status: 64 | code: 200 65 | message: OK 66 | headers: 67 | Date: 68 | - Sat, 11 Jul 2020 03:19:08 GMT 69 | Expires: 70 | - "-1" 71 | Cache-Control: 72 | - private, max-age=0 73 | Content-Type: 74 | - text/html; charset=UTF-8 75 | P3p: 76 | - CP="This is not a P3P policy! See g.co/p3phelp for more info." 77 | Server: 78 | - gws 79 | Content-Length: 80 | - '15676' 81 | X-Xss-Protection: 82 | - '0' 83 | X-Frame-Options: 84 | - SAMEORIGIN 85 | Set-Cookie: 86 | - 1P_JAR=2020-07-11-03; expires=Mon, 10-Aug-2020 03:19:08 GMT; path=/; domain=.google.com; 87 | Secure 88 | - NID=204=NKuzVtzV2dKPsOc7_z9bjNoIRhrN5kM9LQVZji5cqnUzPnDf6oMK-ritO8eMSwcgEJfqzbjCFUynwrJg1XMrJTjn_8t3hvFXqZhqvCCVAgBSQiN1wFE7ZP2JGvXkvahwYVo6QkfpnbwfRK3uMGL2Wk5tD4KbnopvxHAQ9qCHrGA; 89 | expires=Sun, 10-Jan-2021 03:19:08 GMT; path=/; domain=.google.com; HttpOnly 90 | body: 91 | encoding: ASCII-8BIT 92 | string: !binary |- 93 |  94 | recorded_at: Sat, 11 Jul 2020 03:19:08 GMT 95 | recorded_with: VCR 6.0.0 96 | -------------------------------------------------------------------------------- /spec/cassettes/ImageScraper_Client/_page_images/handles_unescaped_urls.yml: -------------------------------------------------------------------------------- 1 | --- 2 | http_interactions: 3 | - request: 4 | method: get 5 | uri: http://test.com/ 6 | body: 7 | encoding: US-ASCII 8 | string: '' 9 | headers: 10 | User-Agent: 11 | - Mozilla/5.0 (Macintosh) 12 | Accept-Encoding: 13 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 14 | Accept: 15 | - "*/*" 16 | Host: 17 | - test.com 18 | response: 19 | status: 20 | code: 301 
21 | message: Moved Permanently 22 | headers: 23 | Server: 24 | - nginx/1.16.1 25 | Date: 26 | - Sat, 11 Jul 2020 03:19:00 GMT 27 | Content-Type: 28 | - text/html; charset=UTF-8 29 | Transfer-Encoding: 30 | - chunked 31 | Connection: 32 | - keep-alive 33 | Keep-Alive: 34 | - timeout=20 35 | X-Dis-Request-Id: 36 | - a2838aa187355b40c449c1d4341cc5d8 37 | Location: 38 | - http://www.test.com/ 39 | body: 40 | encoding: UTF-8 41 | string: "301 Moved Permanently

301 42 | Moved Permanently

Object moved to here.


DOSarrest 43 | Internet Security
\n" 44 | recorded_at: Sat, 11 Jul 2020 03:19:00 GMT 45 | - request: 46 | method: get 47 | uri: http://www.test.com/ 48 | body: 49 | encoding: US-ASCII 50 | string: '' 51 | headers: 52 | User-Agent: 53 | - Mozilla/5.0 (Macintosh) 54 | Accept-Encoding: 55 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 56 | Accept: 57 | - "*/*" 58 | Host: 59 | - www.test.com 60 | response: 61 | status: 62 | code: 301 63 | message: Moved Permanently 64 | headers: 65 | Server: 66 | - nginx/1.16.1 67 | Date: 68 | - Sat, 11 Jul 2020 03:19:00 GMT 69 | Content-Type: 70 | - text/html; charset=UTF-8 71 | Transfer-Encoding: 72 | - chunked 73 | Connection: 74 | - keep-alive 75 | Keep-Alive: 76 | - timeout=20 77 | X-Dis-Request-Id: 78 | - 8b09747e816f16714b25596e80293886 79 | Location: 80 | - https://www.test.com/ 81 | body: 82 | encoding: UTF-8 83 | string: "301 Moved Permanently

301 84 | Moved Permanently

Object moved to here.


DOSarrest 85 | Internet Security
\n" 86 | recorded_at: Sat, 11 Jul 2020 03:19:01 GMT 87 | - request: 88 | method: get 89 | uri: https://www.test.com/ 90 | body: 91 | encoding: US-ASCII 92 | string: '' 93 | headers: 94 | User-Agent: 95 | - Mozilla/5.0 (Macintosh) 96 | Accept-Encoding: 97 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 98 | Accept: 99 | - "*/*" 100 | Host: 101 | - www.test.com 102 | response: 103 | status: 104 | code: 200 105 | message: OK 106 | headers: 107 | Server: 108 | - nginx/1.16.1 109 | Date: 110 | - Sat, 11 Jul 2020 03:19:01 GMT 111 | Content-Type: 112 | - text/html 113 | Transfer-Encoding: 114 | - chunked 115 | Connection: 116 | - keep-alive 117 | Keep-Alive: 118 | - timeout=20 119 | X-Dis-Request-Id: 120 | - ab80d220c4e964a267b7edb970c710d0 121 | P3p: 122 | - CP="NON DSP COR ADMa OUR IND UNI COM NAV INT" 123 | Cache-Control: 124 | - no-cache 125 | body: 126 | encoding: ASCII-8BIT 127 | string: "\n\n\n\n\n\n\n\n\n\n\n" 146 | recorded_at: Sat, 11 Jul 2020 03:19:01 GMT 147 | recorded_with: VCR 6.0.0 148 | -------------------------------------------------------------------------------- /spec/cassettes/ImageScraper_Client/_stylesheet_images/handles_404s.yml: -------------------------------------------------------------------------------- 1 | --- 2 | http_interactions: 3 | - request: 4 | method: get 5 | uri: http://google.com/does_not_exist.css 6 | body: 7 | encoding: US-ASCII 8 | string: '' 9 | headers: 10 | Accept-Encoding: 11 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 12 | Accept: 13 | - "*/*" 14 | User-Agent: 15 | - Ruby 16 | response: 17 | status: 18 | code: 404 19 | message: Not Found 20 | headers: 21 | Content-Type: 22 | - text/html; charset=UTF-8 23 | Referrer-Policy: 24 | - no-referrer 25 | Content-Length: 26 | - '1579' 27 | Date: 28 | - Sat, 11 Jul 2020 02:47:46 GMT 29 | body: 30 | encoding: ASCII-8BIT 31 | string: !binary |- 32 | 
PCFET0NUWVBFIGh0bWw+CjxodG1sIGxhbmc9ZW4+CiAgPG1ldGEgY2hhcnNldD11dGYtOD4KICA8bWV0YSBuYW1lPXZpZXdwb3J0IGNvbnRlbnQ9ImluaXRpYWwtc2NhbGU9MSwgbWluaW11bS1zY2FsZT0xLCB3aWR0aD1kZXZpY2Utd2lkdGgiPgogIDx0aXRsZT5FcnJvciA0MDQgKE5vdCBGb3VuZCkhITE8L3RpdGxlPgogIDxzdHlsZT4KICAgICp7bWFyZ2luOjA7cGFkZGluZzowfWh0bWwsY29kZXtmb250OjE1cHgvMjJweCBhcmlhbCxzYW5zLXNlcmlmfWh0bWx7YmFja2dyb3VuZDojZmZmO2NvbG9yOiMyMjI7cGFkZGluZzoxNXB4fWJvZHl7bWFyZ2luOjclIGF1dG8gMDttYXgtd2lkdGg6MzkwcHg7bWluLWhlaWdodDoxODBweDtwYWRkaW5nOjMwcHggMCAxNXB4fSogPiBib2R5e2JhY2tncm91bmQ6dXJsKC8vd3d3Lmdvb2dsZS5jb20vaW1hZ2VzL2Vycm9ycy9yb2JvdC5wbmcpIDEwMCUgNXB4IG5vLXJlcGVhdDtwYWRkaW5nLXJpZ2h0OjIwNXB4fXB7bWFyZ2luOjExcHggMCAyMnB4O292ZXJmbG93OmhpZGRlbn1pbnN7Y29sb3I6Izc3Nzt0ZXh0LWRlY29yYXRpb246bm9uZX1hIGltZ3tib3JkZXI6MH1AbWVkaWEgc2NyZWVuIGFuZCAobWF4LXdpZHRoOjc3MnB4KXtib2R5e2JhY2tncm91bmQ6bm9uZTttYXJnaW4tdG9wOjA7bWF4LXdpZHRoOm5vbmU7cGFkZGluZy1yaWdodDowfX0jbG9nb3tiYWNrZ3JvdW5kOnVybCgvL3d3dy5nb29nbGUuY29tL2ltYWdlcy9icmFuZGluZy9nb29nbGVsb2dvLzF4L2dvb2dsZWxvZ29fY29sb3JfMTUweDU0ZHAucG5nKSBuby1yZXBlYXQ7bWFyZ2luLWxlZnQ6LTVweH1AbWVkaWEgb25seSBzY3JlZW4gYW5kIChtaW4tcmVzb2x1dGlvbjoxOTJkcGkpeyNsb2dve2JhY2tncm91bmQ6dXJsKC8vd3d3Lmdvb2dsZS5jb20vaW1hZ2VzL2JyYW5kaW5nL2dvb2dsZWxvZ28vMngvZ29vZ2xlbG9nb19jb2xvcl8xNTB4NTRkcC5wbmcpIG5vLXJlcGVhdCAwJSAwJS8xMDAlIDEwMCU7LW1vei1ib3JkZXItaW1hZ2U6dXJsKC8vd3d3Lmdvb2dsZS5jb20vaW1hZ2VzL2JyYW5kaW5nL2dvb2dsZWxvZ28vMngvZ29vZ2xlbG9nb19jb2xvcl8xNTB4NTRkcC5wbmcpIDB9fUBtZWRpYSBvbmx5IHNjcmVlbiBhbmQgKC13ZWJraXQtbWluLWRldmljZS1waXhlbC1yYXRpbzoyKXsjbG9nb3tiYWNrZ3JvdW5kOnVybCgvL3d3dy5nb29nbGUuY29tL2ltYWdlcy9icmFuZGluZy9nb29nbGVsb2dvLzJ4L2dvb2dsZWxvZ29fY29sb3JfMTUweDU0ZHAucG5nKSBuby1yZXBlYXQ7LXdlYmtpdC1iYWNrZ3JvdW5kLXNpemU6MTAwJSAxMDAlfX0jbG9nb3tkaXNwbGF5OmlubGluZS1ibG9jaztoZWlnaHQ6NTRweDt3aWR0aDoxNTBweH0KICA8L3N0eWxlPgogIDxhIGhyZWY9Ly93d3cuZ29vZ2xlLmNvbS8+PHNwYW4gaWQ9bG9nbyBhcmlhLWxhYmVsPUdvb2dsZT48L3NwYW4+PC9hPgogIDxwPjxiPjQwNC48L2I+IDxpbnM+VGhhdOKAmXMgYW4gZXJyb3IuPC9pbnM+CiAgPHA+VGhlIHJlcXVlc3RlZCBVUkwgPGNvZGU+L2RvZXNfbm90X2V4
aXN0LmNzczwvY29kZT4gd2FzIG5vdCBmb3VuZCBvbiB0aGlzIHNlcnZlci4gIDxpbnM+VGhhdOKAmXMgYWxsIHdlIGtub3cuPC9pbnM+Cg== 33 | recorded_at: Sat, 11 Jul 2020 02:47:46 GMT 34 | recorded_with: VCR 6.0.0 35 | -------------------------------------------------------------------------------- /spec/cassettes/ImageScraper_Client/_stylesheet_images/handles_stylesheet_image_with_a_relative_url.yml: -------------------------------------------------------------------------------- 1 | --- 2 | http_interactions: 3 | - request: 4 | method: get 5 | uri: https://raw.github.com/charlotte-ruby/image_scraper/master/spec/support/relative_image_url.html 6 | body: 7 | encoding: US-ASCII 8 | string: '' 9 | headers: 10 | User-Agent: 11 | - Mozilla/5.0 (Macintosh) 12 | Accept-Encoding: 13 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 14 | Accept: 15 | - "*/*" 16 | Host: 17 | - raw.github.com 18 | response: 19 | status: 20 | code: 301 21 | message: Moved Permanently 22 | headers: 23 | Connection: 24 | - keep-alive 25 | Content-Length: 26 | - '0' 27 | Location: 28 | - https://raw.githubusercontent.com/charlotte-ruby/image_scraper/master/spec/support/relative_image_url.html 29 | Accept-Ranges: 30 | - bytes 31 | Date: 32 | - Sat, 11 Jul 2020 02:47:46 GMT 33 | Via: 34 | - 1.1 varnish 35 | Age: 36 | - '0' 37 | X-Served-By: 38 | - cache-fty21334-FTY 39 | X-Cache: 40 | - MISS 41 | X-Cache-Hits: 42 | - '0' 43 | Vary: 44 | - Accept-Encoding 45 | X-Fastly-Request-Id: 46 | - 42defefcb229163cf23f39d4a0e7bb09dc2346c7 47 | body: 48 | encoding: UTF-8 49 | string: '' 50 | recorded_at: Sat, 11 Jul 2020 02:47:46 GMT 51 | - request: 52 | method: get 53 | uri: https://raw.githubusercontent.com/charlotte-ruby/image_scraper/master/spec/support/relative_image_url.html 54 | body: 55 | encoding: US-ASCII 56 | string: '' 57 | headers: 58 | User-Agent: 59 | - Mozilla/5.0 (Macintosh) 60 | Accept-Encoding: 61 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 62 | Accept: 63 | - "*/*" 64 | Host: 65 | - raw.githubusercontent.com 66 | response: 
67 | status: 68 | code: 200 69 | message: OK 70 | headers: 71 | Connection: 72 | - keep-alive 73 | Content-Length: 74 | - '361' 75 | Cache-Control: 76 | - max-age=300 77 | Content-Security-Policy: 78 | - default-src 'none'; style-src 'unsafe-inline'; sandbox 79 | Content-Type: 80 | - text/plain; charset=utf-8 81 | Etag: 82 | - W/"5d863c870c1499165ae27fc405308b04c53be69e32f5d1a80e1e1265c1165454" 83 | Strict-Transport-Security: 84 | - max-age=31536000 85 | X-Content-Type-Options: 86 | - nosniff 87 | X-Frame-Options: 88 | - deny 89 | X-Xss-Protection: 90 | - 1; mode=block 91 | Via: 92 | - 1.1 varnish 93 | - 1.1 varnish (Varnish/6.0) 94 | X-Github-Request-Id: 95 | - 8CE0:4149:2B2888:32CDBC:5F092851 96 | Accept-Ranges: 97 | - bytes 98 | Date: 99 | - Sat, 11 Jul 2020 02:47:47 GMT 100 | X-Served-By: 101 | - cache-fty21353-FTY 102 | X-Cache: 103 | - MISS, MISS 104 | X-Cache-Hits: 105 | - 0, 0 106 | X-Timer: 107 | - S1594435667.903284,VS0,VE105 108 | Vary: 109 | - Authorization,Accept-Encoding 110 | Access-Control-Allow-Origin: 111 | - "*" 112 | X-Fastly-Request-Id: 113 | - 53c0ddf688d229bb9907820b623a2f696ff20874 114 | Expires: 115 | - Sat, 11 Jul 2020 02:52:47 GMT 116 | Source-Age: 117 | - '0' 118 | body: 119 | encoding: ASCII-8BIT 120 | string: | 121 | 122 | 123 | 124 | 125 | 126 | stylesheet_test 127 | 128 | 129 | 137 | 138 | 139 | 140 | 141 | recorded_at: Sat, 11 Jul 2020 02:47:47 GMT 142 | - request: 143 | method: get 144 | uri: https://raw.github.com/charlotte-ruby/image_scraper/master/spec/support/relative_image_url.css 145 | body: 146 | encoding: US-ASCII 147 | string: '' 148 | headers: 149 | Accept-Encoding: 150 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 151 | Accept: 152 | - "*/*" 153 | User-Agent: 154 | - Ruby 155 | response: 156 | status: 157 | code: 301 158 | message: Moved Permanently 159 | headers: 160 | Connection: 161 | - keep-alive 162 | Content-Length: 163 | - '0' 164 | Location: 165 | - 
https://raw.githubusercontent.com/charlotte-ruby/image_scraper/master/spec/support/relative_image_url.css 166 | Accept-Ranges: 167 | - bytes 168 | Date: 169 | - Sat, 11 Jul 2020 02:47:47 GMT 170 | Via: 171 | - 1.1 varnish 172 | Age: 173 | - '0' 174 | X-Served-By: 175 | - cache-fty21382-FTY 176 | X-Cache: 177 | - MISS 178 | X-Cache-Hits: 179 | - '0' 180 | Vary: 181 | - Accept-Encoding 182 | X-Fastly-Request-Id: 183 | - c4d0c44089a9cd9f259ba5dfba01b82886c5bdd3 184 | body: 185 | encoding: UTF-8 186 | string: '' 187 | recorded_at: Sat, 11 Jul 2020 02:47:47 GMT 188 | - request: 189 | method: get 190 | uri: https://raw.githubusercontent.com/charlotte-ruby/image_scraper/master/spec/support/relative_image_url.css 191 | body: 192 | encoding: US-ASCII 193 | string: '' 194 | headers: 195 | Accept-Encoding: 196 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 197 | Accept: 198 | - "*/*" 199 | User-Agent: 200 | - Ruby 201 | response: 202 | status: 203 | code: 200 204 | message: OK 205 | headers: 206 | Connection: 207 | - keep-alive 208 | Content-Length: 209 | - '75' 210 | Cache-Control: 211 | - max-age=300 212 | Content-Security-Policy: 213 | - default-src 'none'; style-src 'unsafe-inline'; sandbox 214 | Content-Type: 215 | - text/plain; charset=utf-8 216 | Etag: 217 | - W/"2c5ae7c2bb86ae5983019ef6e4fdd989bb35f94f20f3fd73e86ed5d9a953c9d1" 218 | Strict-Transport-Security: 219 | - max-age=31536000 220 | X-Content-Type-Options: 221 | - nosniff 222 | X-Frame-Options: 223 | - deny 224 | X-Xss-Protection: 225 | - 1; mode=block 226 | Via: 227 | - 1.1 varnish 228 | - 1.1 varnish (Varnish/6.0) 229 | X-Github-Request-Id: 230 | - A436:5FF4:137A64:1747FA:5F092852 231 | Accept-Ranges: 232 | - bytes 233 | Date: 234 | - Sat, 11 Jul 2020 02:47:47 GMT 235 | X-Served-By: 236 | - cache-fty21326-FTY 237 | X-Cache: 238 | - MISS, MISS 239 | X-Cache-Hits: 240 | - 0, 0 241 | X-Timer: 242 | - S1594435667.266851,VS0,VE115 243 | Vary: 244 | - Authorization,Accept-Encoding 245 | Access-Control-Allow-Origin: 
246 | - "*" 247 | X-Fastly-Request-Id: 248 | - d8d1068470ba9ef99c8926302b7cfd66c55e3673 249 | Expires: 250 | - Sat, 11 Jul 2020 02:52:47 GMT 251 | Source-Age: 252 | - '0' 253 | body: 254 | encoding: ASCII-8BIT 255 | string: | 256 | .test { 257 | background-image: url('../images/some_image.png') 258 | } 259 | recorded_at: Sat, 11 Jul 2020 02:47:47 GMT 260 | recorded_with: VCR 6.0.0 261 | -------------------------------------------------------------------------------- /spec/cassettes/ImageScraper_Client/_stylesheet_images/scrapes_stylesheet_images.yml: -------------------------------------------------------------------------------- 1 | --- 2 | http_interactions: 3 | - request: 4 | method: get 5 | uri: https://raw.github.com/charlotte-ruby/image_scraper/master/spec/support/stylesheet_unescaped_image.html 6 | body: 7 | encoding: US-ASCII 8 | string: '' 9 | headers: 10 | User-Agent: 11 | - Mozilla/5.0 (Macintosh) 12 | Accept-Encoding: 13 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 14 | Accept: 15 | - "*/*" 16 | Host: 17 | - raw.github.com 18 | response: 19 | status: 20 | code: 301 21 | message: Moved Permanently 22 | headers: 23 | Connection: 24 | - keep-alive 25 | Content-Length: 26 | - '0' 27 | Location: 28 | - https://raw.githubusercontent.com/charlotte-ruby/image_scraper/master/spec/support/stylesheet_unescaped_image.html 29 | Accept-Ranges: 30 | - bytes 31 | Date: 32 | - Sat, 11 Jul 2020 02:47:46 GMT 33 | Via: 34 | - 1.1 varnish 35 | Age: 36 | - '0' 37 | X-Served-By: 38 | - cache-fty21330-FTY 39 | X-Cache: 40 | - MISS 41 | X-Cache-Hits: 42 | - '0' 43 | Vary: 44 | - Accept-Encoding 45 | X-Fastly-Request-Id: 46 | - 67cf4e16a67c4ff9dc5bc784d87f623f506e97b6 47 | body: 48 | encoding: UTF-8 49 | string: '' 50 | recorded_at: Sat, 11 Jul 2020 02:47:46 GMT 51 | - request: 52 | method: get 53 | uri: https://raw.githubusercontent.com/charlotte-ruby/image_scraper/master/spec/support/stylesheet_unescaped_image.html 54 | body: 55 | encoding: US-ASCII 56 | string: '' 57 | 
headers: 58 | User-Agent: 59 | - Mozilla/5.0 (Macintosh) 60 | Accept-Encoding: 61 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 62 | Accept: 63 | - "*/*" 64 | Host: 65 | - raw.githubusercontent.com 66 | response: 67 | status: 68 | code: 200 69 | message: OK 70 | headers: 71 | Connection: 72 | - keep-alive 73 | Content-Length: 74 | - '393' 75 | Cache-Control: 76 | - max-age=300 77 | Content-Security-Policy: 78 | - default-src 'none'; style-src 'unsafe-inline'; sandbox 79 | Content-Type: 80 | - text/plain; charset=utf-8 81 | Etag: 82 | - W/"92596301aa24cd3a793a9a17d1d618bcc5b1e7072ce62d94abbf449f3f44e523" 83 | Strict-Transport-Security: 84 | - max-age=31536000 85 | X-Content-Type-Options: 86 | - nosniff 87 | X-Frame-Options: 88 | - deny 89 | X-Xss-Protection: 90 | - 1; mode=block 91 | Via: 92 | - 1.1 varnish 93 | - 1.1 varnish (Varnish/6.0) 94 | X-Github-Request-Id: 95 | - B256:5FF4:137A5F:1747CE:5F09284B 96 | Accept-Ranges: 97 | - bytes 98 | Date: 99 | - Sat, 11 Jul 2020 02:47:46 GMT 100 | X-Served-By: 101 | - cache-fty21365-FTY 102 | X-Cache: 103 | - MISS, MISS 104 | X-Cache-Hits: 105 | - 0, 0 106 | X-Timer: 107 | - S1594435666.161183,VS0,VE114 108 | Vary: 109 | - Authorization,Accept-Encoding 110 | Access-Control-Allow-Origin: 111 | - "*" 112 | X-Fastly-Request-Id: 113 | - b43101a4b597ca7c347a93cff934973b3d17990c 114 | Expires: 115 | - Sat, 11 Jul 2020 02:52:46 GMT 116 | Source-Age: 117 | - '0' 118 | body: 119 | encoding: ASCII-8BIT 120 | string: | 121 | z 122 | 123 | 124 | 125 | 126 | stylesheet_test 127 | 128 | 129 | 137 | 138 | 139 | 140 | 141 | recorded_at: Sat, 11 Jul 2020 02:47:46 GMT 142 | - request: 143 | method: get 144 | uri: https://raw.github.com/charlotte-ruby/image_scraper/master/spec/support/unescaped_image.css 145 | body: 146 | encoding: US-ASCII 147 | string: '' 148 | headers: 149 | Accept-Encoding: 150 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 151 | Accept: 152 | - "*/*" 153 | User-Agent: 154 | - Ruby 155 | response: 156 | status: 157 | code: 301 
158 | message: Moved Permanently 159 | headers: 160 | Connection: 161 | - keep-alive 162 | Content-Length: 163 | - '0' 164 | Location: 165 | - https://raw.githubusercontent.com/charlotte-ruby/image_scraper/master/spec/support/unescaped_image.css 166 | Accept-Ranges: 167 | - bytes 168 | Date: 169 | - Sat, 11 Jul 2020 02:47:46 GMT 170 | Via: 171 | - 1.1 varnish 172 | Age: 173 | - '0' 174 | X-Served-By: 175 | - cache-fty21378-FTY 176 | X-Cache: 177 | - MISS 178 | X-Cache-Hits: 179 | - '0' 180 | Vary: 181 | - Accept-Encoding 182 | X-Fastly-Request-Id: 183 | - c798264dfc2e5d7861e0abbe1696d6b1694c6882 184 | body: 185 | encoding: UTF-8 186 | string: '' 187 | recorded_at: Sat, 11 Jul 2020 02:47:46 GMT 188 | - request: 189 | method: get 190 | uri: https://raw.githubusercontent.com/charlotte-ruby/image_scraper/master/spec/support/unescaped_image.css 191 | body: 192 | encoding: US-ASCII 193 | string: '' 194 | headers: 195 | Accept-Encoding: 196 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 197 | Accept: 198 | - "*/*" 199 | User-Agent: 200 | - Ruby 201 | response: 202 | status: 203 | code: 200 204 | message: OK 205 | headers: 206 | Connection: 207 | - keep-alive 208 | Content-Length: 209 | - '117' 210 | Cache-Control: 211 | - max-age=300 212 | Content-Security-Policy: 213 | - default-src 'none'; style-src 'unsafe-inline'; sandbox 214 | Content-Type: 215 | - text/plain; charset=utf-8 216 | Etag: 217 | - W/"a372d59f8b5e966084cad7f1c3389d218c1056d1a6b0cc1f78ce7766543521cd" 218 | Strict-Transport-Security: 219 | - max-age=31536000 220 | X-Content-Type-Options: 221 | - nosniff 222 | X-Frame-Options: 223 | - deny 224 | X-Xss-Protection: 225 | - 1; mode=block 226 | Via: 227 | - 1.1 varnish 228 | - 1.1 varnish (Varnish/6.0) 229 | X-Github-Request-Id: 230 | - 95FC:6E25:146840:18350B:5F092848 231 | Accept-Ranges: 232 | - bytes 233 | Date: 234 | - Sat, 11 Jul 2020 02:47:46 GMT 235 | X-Served-By: 236 | - cache-fty21349-FTY 237 | X-Cache: 238 | - MISS, MISS 239 | X-Cache-Hits: 240 | - 0, 0 
241 | X-Timer: 242 | - S1594435667.501147,VS0,VE115 243 | Vary: 244 | - Authorization,Accept-Encoding 245 | Access-Control-Allow-Origin: 246 | - "*" 247 | X-Fastly-Request-Id: 248 | - 05d7f78a7df9449f89a871c44ebb8caf1b56dfbe 249 | Expires: 250 | - Sat, 11 Jul 2020 02:52:46 GMT 251 | Source-Age: 252 | - '0' 253 | body: 254 | encoding: ASCII-8BIT 255 | string: | 256 | .test { 257 | background-image: url('https://raw.github.com/charlotte-ruby/image_scraper/master/some image.png') 258 | } 259 | recorded_at: Sat, 11 Jul 2020 02:47:46 GMT 260 | recorded_with: VCR 6.0.0 261 | -------------------------------------------------------------------------------- /spec/cassettes/ImageScraper_Client/_stylesheets/lists_relative_path_stylesheets.yml: -------------------------------------------------------------------------------- 1 | --- 2 | http_interactions: 3 | - request: 4 | method: get 5 | uri: http://test.com/ 6 | body: 7 | encoding: US-ASCII 8 | string: '' 9 | headers: 10 | User-Agent: 11 | - Mozilla/5.0 (Macintosh) 12 | Accept-Encoding: 13 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 14 | Accept: 15 | - "*/*" 16 | Host: 17 | - test.com 18 | response: 19 | status: 20 | code: 301 21 | message: Moved Permanently 22 | headers: 23 | Server: 24 | - nginx/1.16.1 25 | Date: 26 | - Sat, 11 Jul 2020 03:17:21 GMT 27 | Content-Type: 28 | - text/html; charset=UTF-8 29 | Transfer-Encoding: 30 | - chunked 31 | Connection: 32 | - keep-alive 33 | Keep-Alive: 34 | - timeout=20 35 | X-Dis-Request-Id: 36 | - 903727ea6a57471f663fbe8dabe1c00f 37 | Location: 38 | - http://www.test.com/ 39 | body: 40 | encoding: UTF-8 41 | string: "301 Moved Permanently

301 42 | Moved Permanently

Object moved to here.


DOSarrest 43 | Internet Security
\n" 44 | recorded_at: Sat, 11 Jul 2020 03:17:21 GMT 45 | - request: 46 | method: get 47 | uri: http://www.test.com/ 48 | body: 49 | encoding: US-ASCII 50 | string: '' 51 | headers: 52 | User-Agent: 53 | - Mozilla/5.0 (Macintosh) 54 | Accept-Encoding: 55 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 56 | Accept: 57 | - "*/*" 58 | Host: 59 | - www.test.com 60 | response: 61 | status: 62 | code: 301 63 | message: Moved Permanently 64 | headers: 65 | Server: 66 | - nginx/1.16.1 67 | Date: 68 | - Sat, 11 Jul 2020 03:17:21 GMT 69 | Content-Type: 70 | - text/html; charset=UTF-8 71 | Transfer-Encoding: 72 | - chunked 73 | Connection: 74 | - keep-alive 75 | Keep-Alive: 76 | - timeout=20 77 | X-Dis-Request-Id: 78 | - b654c84e0b7623155267ef12e5fe8d60 79 | Location: 80 | - https://www.test.com/ 81 | body: 82 | encoding: UTF-8 83 | string: "301 Moved Permanently

301 84 | Moved Permanently

Object moved to here.


DOSarrest 85 | Internet Security
\n" 86 | recorded_at: Sat, 11 Jul 2020 03:17:21 GMT 87 | - request: 88 | method: get 89 | uri: https://www.test.com/ 90 | body: 91 | encoding: US-ASCII 92 | string: '' 93 | headers: 94 | User-Agent: 95 | - Mozilla/5.0 (Macintosh) 96 | Accept-Encoding: 97 | - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 98 | Accept: 99 | - "*/*" 100 | Host: 101 | - www.test.com 102 | response: 103 | status: 104 | code: 200 105 | message: OK 106 | headers: 107 | Server: 108 | - nginx/1.16.1 109 | Date: 110 | - Sat, 11 Jul 2020 03:17:22 GMT 111 | Content-Type: 112 | - text/html 113 | Transfer-Encoding: 114 | - chunked 115 | Connection: 116 | - keep-alive 117 | Keep-Alive: 118 | - timeout=20 119 | X-Dis-Request-Id: 120 | - df0d48077779cd58216d5edc81a47f69 121 | P3p: 122 | - CP="NON DSP COR ADMa OUR IND UNI COM NAV INT" 123 | Cache-Control: 124 | - no-cache 125 | body: 126 | encoding: ASCII-8BIT 127 | string: "\n\n\n\n\n\n\n\n\n\n\n" 146 | recorded_at: Sat, 11 Jul 2020 03:17:22 GMT 147 | recorded_with: VCR 6.0.0 148 | -------------------------------------------------------------------------------- /spec/image_scraper/client_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'spec_helper' 4 | 5 | describe ImageScraper::Client, :vcr do 6 | let(:repo_url) { "https://raw.github.com/charlotte-ruby/image_scraper" } 7 | 8 | describe "foo" do 9 | it "something" do 10 | url = "http://www.amazon.com/Planet-Two-Disc-Digital-Combo-Blu-ray/dp/B004LWZW4W/ref=sr_1_1?s=movies-tv&ie=UTF8&qid=1324771542&sr=1-1" 11 | 12 | client = described_class.new(url) 13 | 14 | expect(client.page_images).not_to be_empty 15 | end 16 | end 17 | 18 | describe "#initialize" do 19 | it 'works with invalid URLs' do 20 | allow_any_instance_of(described_class).to receive(:fetch).and_return(nil) 21 | 22 | scraper = described_class.new('bogusurl4444.com') 23 | 24 | expect(scraper.doc).to be_nil 25 | end 26 | 27 | it 'has empty data if URL is 
invalid' do 28 | allow_any_instance_of(described_class).to receive(:fetch).and_return(nil) 29 | 30 | scraper = described_class.new('bogusurl4444.com') 31 | 32 | expect(scraper.image_urls).to be_empty 33 | expect(scraper.stylesheets).to be_empty 34 | expect(scraper.stylesheet_images).to be_empty 35 | expect(scraper.page_images).to be_empty 36 | end 37 | end 38 | 39 | describe '#image_urls' do 40 | it 'scrapes absolute paths' do 41 | images = [ 42 | 'http://en.wikipedia.org/static/images/poweredby_mediawiki_88x31.png', 43 | 'http://en.wikipedia.org/static/images/wikimedia-button.png', 44 | 'http://en.wikipedia.org/wiki/Special:CentralAutoLogin/start?type=1x1', 45 | 'http://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/SIPI_Jelly_Beans_4.1.07.tiff/lossy-page1-220px-SIPI_Jelly_Beans_4.1.07.tiff.jpg', 46 | 'http://upload.wikimedia.org/wikipedia/en/thumb/5/5c/Symbol_template_class.svg/16px-Symbol_template_class.svg.png' 47 | ] 48 | 49 | url = 'http://en.wikipedia.org/wiki/Standard_test_image' 50 | 51 | client = described_class.new(url, include_css_images: false) 52 | 53 | expect(client.image_urls).to eq(images) 54 | end 55 | 56 | it 'scrapes with whitespace stripped' do 57 | file = 'spec/support/extra_whitespace.html' 58 | 59 | client = described_class.new('') 60 | client.doc = File.open(file) { |f| Nokogiri::HTML(f) } 61 | 62 | images = [ 63 | 'http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter._V164348457_.jpg', 64 | 'http://g-ecx.images-amazon.com/images/G/01/SIMON/IsaacsonWalter.jpg' 65 | ] 66 | 67 | expect(client.image_urls).to eq(images) 68 | end 69 | 70 | it 'scrapes relative paths' do 71 | scraper = described_class.new('http://en.wikipedia.org/wiki/Standard_test_image', 72 | convert_to_absolute_url: false, 73 | include_css_images: false) 74 | 75 | images = [ 76 | 'http://en.wikipedia.org/static/images/poweredby_mediawiki_88x31.png', 77 | 'http://en.wikipedia.org/static/images/wikimedia-button.png', 78 | 
'http://en.wikipedia.org/wiki/Special:CentralAutoLogin/start?type=1x1', 79 | 'http://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/SIPI_Jelly_Beans_4.1.07.tiff/lossy-page1-220px-SIPI_Jelly_Beans_4.1.07.tiff.jpg', 80 | 'http://upload.wikimedia.org/wikipedia/en/thumb/5/5c/Symbol_template_class.svg/16px-Symbol_template_class.svg.png' 81 | ] 82 | 83 | expect(scraper.image_urls).to eq(images) 84 | end 85 | 86 | it 'handles url with unescaped spaces' do 87 | url = 'https://raw.github.com/syoder/image_scraper/stylesheet_fix/test/resources/space in url.html' 88 | 89 | scraper = described_class.new(url, include_css_images: false) 90 | 91 | expected_url = 'https://raw.github.com/syoder/image_scraper/stylesheet_fix/test/resources/image1.png' 92 | 93 | expect(scraper.image_urls.length).to eq(1) 94 | expect(scraper.image_urls.first).to eq(expected_url) 95 | end 96 | end 97 | 98 | describe '#stylesheets' do 99 | it 'lists relative path stylesheets' do 100 | file = 'spec/support/stylesheet_test.html' 101 | 102 | client = described_class.new('http://test.com') 103 | client.doc = File.open(file) { |f| Nokogiri::HTML(f) } 104 | 105 | stylesheets = [ 106 | 'http://test.com/css/master.css', 107 | 'http://test.com/css/master2.css' 108 | ] 109 | 110 | expect(client.stylesheets).to eq(stylesheets) 111 | end 112 | 113 | it 'handles stylesheet with an unescaped url' do 114 | scraper = described_class.new('') 115 | scraper.url = 'http://test.com' 116 | scraper.doc = Nokogiri::HTML("") 117 | 118 | expect(scraper.stylesheets).to include('http://test.com/unescapedpath.css') 119 | end 120 | end 121 | 122 | describe '#page_images' do 123 | it 'handles unescaped urls' do 124 | scraper = described_class.new('http://test.com') 125 | scraper.doc = Nokogiri::HTML("") 126 | 127 | expect(scraper.page_images.length).to eq(1) 128 | expect(scraper.page_images).to include('http://test.com/unescaped%20path') 129 | end 130 | 131 | it 'handldes image urls that include square brackets' do 132 | scraper = 
described_class.new('http://google.com') 133 | scraper.doc = Nokogiri::HTML("") 134 | 135 | expect(scraper.page_images).to be_empty 136 | end 137 | end 138 | 139 | describe '#stylesheet_images' do 140 | it 'scrapes stylesheet images' do 141 | url = "#{repo_url}/master/spec/support/stylesheet_unescaped_image.html" 142 | stylesheet_path = "#{repo_url}/master/someimage.png" 143 | # /charlotte-ruby/image_scraper/master/spec/support/unescaped_image.css 144 | scraper = described_class.new(url, include_css_images: true) 145 | 146 | expect(scraper.stylesheet_images).to include(stylesheet_path) 147 | end 148 | 149 | it 'handles 404s' do 150 | scraper = described_class.new('') 151 | scraper.url = 'http://google.com' 152 | scraper.doc = Nokogiri::HTML("") 153 | 154 | expect(scraper.stylesheet_images).to be_empty 155 | end 156 | 157 | it 'handles stylesheet image with a relative url' do 158 | url = "#{repo_url}/master/spec/support/relative_image_url.html" 159 | image_url = "#{repo_url}/master/spec/images/some_image.png" 160 | 161 | scraper = described_class.new(url, include_css_images: true) 162 | 163 | expect(scraper.stylesheet_images).to include(image_url) 164 | end 165 | end 166 | end 167 | -------------------------------------------------------------------------------- /spec/image_scraper/util_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'spec_helper' 4 | 5 | describe ImageScraper::Util do 6 | 7 | describe 'absolute_url' do 8 | it 'parses asset' do 9 | url = 'http://www.test.com' 10 | asset = 'image.gif' 11 | 12 | result = described_class.absolute_url(url, asset) 13 | 14 | expect(result).to eq('http://www.test.com/image.gif') 15 | end 16 | 17 | it 'parses relative asset' do 18 | url = 'http://www.test.com' 19 | asset = 'images/image.gif' 20 | 21 | result = described_class.absolute_url(url, asset) 22 | 23 | expect(result).to eq('http://www.test.com/images/image.gif') 24 | end 25 | 
26 | it 'parses absolute asset' do 27 | url = 'http://www.test.com' 28 | asset = '/images/image.gif' 29 | 30 | result = described_class.absolute_url(url, asset) 31 | 32 | expect(result).to eq('http://www.test.com/images/image.gif') 33 | end 34 | 35 | it 'parses root url with no asset' do 36 | result = described_class.absolute_url('http://www.test.com') 37 | 38 | expect(result).to eq('http://www.test.com') 39 | end 40 | 41 | it 'parses url with no asset' do 42 | result = described_class.absolute_url('http://www.test.com/a/test.html') 43 | 44 | expect(result).to eq('http://www.test.com/a/test.html') 45 | end 46 | end 47 | 48 | describe 'strip_quotes' do 49 | it 'parses paths' do 50 | result = described_class.strip_quotes("'/images/test.png'") 51 | 52 | expect(result).to eq('/images/test.png') 53 | end 54 | 55 | it 'parses a full url' do 56 | str = "'http://www.somsite.com/images/test.png'" 57 | 58 | result = described_class.strip_quotes(str) 59 | 60 | expect(result).to eq('http://www.somsite.com/images/test.png') 61 | end 62 | 63 | it 'parses emptyness' do 64 | result = described_class.strip_quotes('') 65 | 66 | expect(result).to be_empty 67 | end 68 | end 69 | 70 | describe 'domain' do 71 | it 'parses the domain of a url' do 72 | u = described_class 73 | 74 | expect(u.domain('http://ug.ly')).to eq('http://ug.ly') 75 | expect(u.domain('http://ug.ly/what')).to eq('http://ug.ly') 76 | expect(u.domain('http://ug.ly/what/is/this/')).to eq('http://ug.ly') 77 | expect(u.domain('http://www.ug.ly/what/is/this/')).to eq('http://www.ug.ly') 78 | expect(u.domain('http://ug.ly/what/is/this.html')).to eq('http://ug.ly') 79 | end 80 | end 81 | end 82 | -------------------------------------------------------------------------------- /spec/image_scraper_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.describe ImageScraper do 4 | it 'has a version number' do 5 | 
expect(ImageScraper::VERSION).not_to be_nil 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'webmock/rspec' 4 | require 'vcr' 5 | require 'image_scraper' 6 | 7 | # FIXME: remove when fixed in vcr 8 | # https://github.com/vcr/vcr/pull/907/files 9 | module VCR 10 | class LibraryHooks 11 | # @private 12 | module WebMock 13 | module_function 14 | 15 | def with_global_hook_disabled(request) 16 | global_hook_disabled_requests << request 17 | 18 | begin 19 | yield 20 | ensure 21 | global_hook_disabled_requests.delete(request) 22 | end 23 | end 24 | 25 | def global_hook_disabled?(request) 26 | requests = Thread.current[:_vcr_webmock_disabled_requests] 27 | requests&.include?(request) 28 | end 29 | 30 | def global_hook_disabled_requests 31 | Thread.current[:_vcr_webmock_disabled_requests] ||= [] 32 | end 33 | end 34 | end 35 | end 36 | 37 | VCR.configure do |c| 38 | c.cassette_library_dir = 'spec/cassettes' 39 | c.hook_into :webmock 40 | c.configure_rspec_metadata! 41 | end 42 | 43 | # This file was generated by the `rspec --init` command. Conventionally, all 44 | # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. 45 | # The generated `.rspec` file contains `--require spec_helper` which will cause 46 | # this file to always be loaded, without a need to explicitly require it in any 47 | # files. 48 | # 49 | # Given that it is always loaded, you are encouraged to keep this file as 50 | # light-weight as possible. Requiring heavyweight dependencies from this file 51 | # will add to the boot time of your test suite on EVERY test run, even for an 52 | # individual file that may not need all of that loaded. 
Instead, consider making 53 | # a separate helper file that requires the additional dependencies and performs 54 | # the additional setup, and require it from the spec files that actually need 55 | # it. 56 | # 57 | # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration 58 | RSpec.configure do |config| 59 | # rspec-expectations config goes here. You can use an alternate 60 | # assertion/expectation library such as wrong or the stdlib/minitest 61 | # assertions if you prefer. 62 | config.expect_with :rspec do |expectations| 63 | # This option will default to `true` in RSpec 4. It makes the `description` 64 | # and `failure_message` of custom matchers include text for helper methods 65 | # defined using `chain`, e.g.: 66 | # be_bigger_than(2).and_smaller_than(4).description 67 | # # => "be bigger than 2 and smaller than 4" 68 | # ...rather than: 69 | # # => "be bigger than 2" 70 | expectations.include_chain_clauses_in_custom_matcher_descriptions = true 71 | end 72 | 73 | # rspec-mocks config goes here. You can use an alternate test double 74 | # library (such as bogus or mocha) by changing the `mock_with` option here. 75 | config.mock_with :rspec do |mocks| 76 | # Prevents you from mocking or stubbing a method that does not exist on 77 | # a real object. This is generally recommended, and will default to 78 | # `true` in RSpec 4. 79 | mocks.verify_partial_doubles = true 80 | end 81 | 82 | # This option will default to `:apply_to_host_groups` in RSpec 4 (and will 83 | # have no way to turn it off -- the option exists only for backwards 84 | # compatibility in RSpec 3). It causes shared context metadata to be 85 | # inherited by the metadata hash of host groups and examples, rather than 86 | # triggering implicit auto-inclusion in groups with matching metadata. 
87 | config.shared_context_metadata_behavior = :apply_to_host_groups 88 | 89 | # The settings below are suggested to provide a good initial experience 90 | # with RSpec, but feel free to customize to your heart's content. 91 | # # This allows you to limit a spec run to individual examples or groups 92 | # # you care about by tagging them with `:focus` metadata. When nothing 93 | # # is tagged with `:focus`, all examples get run. RSpec also provides 94 | # # aliases for `it`, `describe`, and `context` that include `:focus` 95 | # # metadata: `fit`, `fdescribe` and `fcontext`, respectively. 96 | config.filter_run_when_matching :focus 97 | # 98 | # # Allows RSpec to persist some state between runs in order to support 99 | # # the `--only-failures` and `--next-failure` CLI options. We recommend 100 | # # you configure your source control system to ignore this file. 101 | # config.example_status_persistence_file_path = "spec/examples.txt" 102 | # 103 | # # Limits the available syntax to the non-monkey patched syntax that is 104 | # # recommended. For more details, see: 105 | # # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/ 106 | # # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/ 107 | # # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode 108 | # config.disable_monkey_patching! 109 | # 110 | # # This setting enables warnings. It's recommended, but in some cases may 111 | # # be too noisy due to issues in dependencies. 112 | # config.warnings = true 113 | # 114 | # # Many RSpec users commonly either run the entire suite or an individual 115 | # # file, and it's useful to allow more verbose output when running an 116 | # # individual spec file. 117 | # if config.files_to_run.one? 118 | # # Use the documentation formatter for detailed output, 119 | # # unless a formatter has already been configured 120 | # # (e.g. via a command-line flag). 
121 | # config.default_formatter = "doc" 122 | # end 123 | # 124 | # # Print the 10 slowest examples and example groups at the 125 | # # end of the spec run, to help surface which specs are running 126 | # # particularly slow. 127 | # config.profile_examples = 10 128 | # 129 | # # Run specs in random order to surface order dependencies. If you find an 130 | # # order dependency and want to debug it, you can fix the order by providing 131 | # # the seed, which is printed after each run. 132 | # # --seed 1234 133 | # config.order = :random 134 | # 135 | # # Seed global randomization in this process using the `--seed` CLI option. 136 | # # Setting this allows you to use `--seed` to deterministically reproduce 137 | # # test failures related to randomization by passing the same `--seed` value 138 | # # as the one that triggered the failure. 139 | # Kernel.srand config.seed 140 | end 141 | -------------------------------------------------------------------------------- /spec/support/extra_whitespace.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /spec/support/relative_image_url.css: -------------------------------------------------------------------------------- 1 | .test { 2 | background-image: url('../images/some_image.png') 3 | } 4 | -------------------------------------------------------------------------------- /spec/support/relative_image_url.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | stylesheet_test 7 | 8 | 9 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /spec/support/space in url.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- 
/spec/support/stylesheet_test.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | stylesheet_test 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /spec/support/stylesheet_unescaped_image.html: -------------------------------------------------------------------------------- 1 | z 2 | 3 | 4 | 5 | 6 | stylesheet_test 7 | 8 | 9 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /spec/support/unescaped_image.css: -------------------------------------------------------------------------------- 1 | .test { 2 | background-image: url('https://raw.github.com/charlotte-ruby/image_scraper/master/some image.png') 3 | } 4 | --------------------------------------------------------------------------------