├── .github └── FUNDING.yml ├── .gitignore ├── .rspec ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── facebook_profile_scraper.gemspec ├── lib ├── facebook_profile_scraper.rb └── facebook_profile_scraper │ ├── scraper.rb │ └── version.rb └── spec ├── scraper_spec.rb └── spec_helper.rb /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: mhluska 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | Gemfile.lock 7 | InstalledFiles 8 | _yardoc 9 | coverage 10 | doc/ 11 | lib/bundler/man 12 | pkg 13 | rdoc 14 | spec/reports 15 | test/tmp 16 | test/version_tmp 17 | tmp 18 | *.bundle 19 | *.so 20 | *.o 21 | *.a 22 | mkmf.log 23 | 24 | .byebug_history 25 | .env 26 | example.rb 27 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --color 2 | --require spec_helper 3 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | ruby '2.3.1' 3 | gemspec 4 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Maros Hluska 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit 
persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Facebook Profile Scraper 2 | 3 | Scrape your friends' Facebook photos. 4 | 5 | ## Installation 6 | 7 | ``` 8 | gem install facebook_profile_scraper 9 | ``` 10 | 11 | ## Usage 12 | 13 | Create a `.env` file with your Facebook login information: 14 | 15 | ``` 16 | FACEBOOK_EMAIL= 17 | FACEBOOK_PASSWORD= 18 | ``` 19 | 20 | Use the scraper like this: 21 | 22 | ```rb 23 | require 'facebook_profile_scraper' 24 | 25 | scraper = FacebookProfileScraper::Scraper.new 26 | scraper.scrape('facebook_userid') 27 | ``` 28 | 29 | The local `tmp` directory will contain album photos. 30 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'bundler/gem_tasks' 2 | 3 | begin 4 | require 'rspec/core/rake_task' 5 | RSpec::Core::RakeTask.new(:spec) 6 | # Handle RSpec not being available on a production environment. 
7 | rescue LoadError 8 | end 9 | 10 | -------------------------------------------------------------------------------- /facebook_profile_scraper.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'facebook_profile_scraper/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = 'facebook_profile_scraper' 8 | spec.version = FacebookProfileScraper::VERSION 9 | spec.authors = ['Maros Hluska'] 10 | spec.email = ['mhluska@gmail.com'] 11 | spec.summary = "Scrape a friend's Facebook profile page" 12 | spec.description = "Scrape photos and other data from a friend's Facebook profile page" 13 | spec.homepage = 'http://mhluska.com/' 14 | spec.license = 'MIT' 15 | 16 | spec.files = `git ls-files -z`.split("\x0") 17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ['lib'] 20 | 21 | spec.add_dependency 'selenium-webdriver', '~> 2.53' 22 | spec.add_dependency 'dotenv', '~> 2.1' 23 | 24 | spec.add_development_dependency 'bundler', '~> 1.6' 25 | spec.add_development_dependency 'rake', '~> 11.2' 26 | spec.add_development_dependency 'rspec', '~> 3.5' 27 | end 28 | -------------------------------------------------------------------------------- /lib/facebook_profile_scraper.rb: -------------------------------------------------------------------------------- 1 | require 'facebook_profile_scraper/version' 2 | require 'facebook_profile_scraper/scraper' 3 | 4 | module FacebookProfileScraper 5 | end 6 | -------------------------------------------------------------------------------- /lib/facebook_profile_scraper/scraper.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'capybara/dsl' 3 | # require 'capybara/poltergeist' 4 
require 'dotenv'

Capybara.configure do |config|
  # We drive the live Facebook site; there is no local Rack app to boot.
  config.run_server = false
  # config.default_driver = :poltergeist
  config.default_driver = :chrome
  config.app_host = 'https://www.facebook.com'
  # Let finders match elements that are present in the DOM but not visible.
  config.ignore_hidden_elements = false
end

Capybara.register_driver :chrome do |app|
  # Save downloads straight into `tmp/downloads` without a "Save as" prompt.
  prefs = {
    download: {
      prompt_for_download: false,
      default_directory: "#{Dir.pwd}/tmp/downloads"
    }
  }
  Capybara::Selenium::Driver.new(app, browser: :chrome, prefs: prefs)
end

# TODO(maros): Figure out how to do this with `poltergeist` instead of
# `chromedriver`. We use the latter because `poltergeist` can't seem to
# download images.
# Capybara.register_driver :poltergeist do |app|
#   Capybara::Poltergeist::Driver.new(app, phantomjs_logger: '/dev/null', js_errors: false, timeout: 60 * 10)
# end

# Pull FACEBOOK_EMAIL / FACEBOOK_PASSWORD from a local `.env` file, if any.
Dotenv.load

module FacebookProfileScraper
  # Logs in to Facebook through a Chrome browser session and downloads the
  # photos of every album on a profile into `tmp/<album_title>` directories
  # under the current working directory.
  class Scraper
    include Capybara::DSL

    # Logs in and downloads every photo album of the given profile.
    #
    # @param username [String] the profile's username or user id, as it
    #   appears in the profile URL
    # @raise [KeyError] if FACEBOOK_EMAIL or FACEBOOK_PASSWORD is not set
    def scrape(username)
      login
      scrape_photos(username)
    end

    private

    # Returns the `a` elements on the current page whose href contains `href`.
    def find_link_elems_with(href)
      all('a').select do |elem|
        # Read the attribute once: every lookup is a round trip to the driver.
        target = elem[:href]
        target && target.include?(href)
      end
    end

    # Returns the unique href strings on the current page containing `href`.
    def find_links_with(href)
      find_link_elems_with(href).map { |elem| elem[:href] }.uniq
    end

    # Downloads the photo currently open in the photo viewer. Uses the
    # viewer's own "download" menu entry when there is one and otherwise
    # falls back to a synthetic download link created with JavaScript.
    def download_photo
      find('.fbPhotoSnowliftDropdownButton').click

      begin
        find('a[data-action-type="download_photo"]', visible: true).click
      rescue Capybara::ElementNotFound
        # TODO(maros): Is there a download method in Capybara `chromedriver`?
        execute_script("
          link = document.createElement('a');
          link.href = document.querySelector('.spotlight').src;
          link.setAttribute('download', 'download');
          link.click();")
      end
    end

    # Downloads every photo of the album at `href`, then moves the download
    # directory to a directory named after the album.
    def scrape_album(href)
      visit(href)
      dismiss_notification_backdrop

      # Remove the cover photo and profile photo.
      photo_links = load_all_photo_links.drop(2)
      return if photo_links.empty?

      photo_links.first.click
      photo_links.length.times do
        download_photo

        begin
          find('.snowliftPager.next').click

        # If the photo album has only one image, the pager is either hidden
        # or missing altogether.
        rescue Selenium::WebDriver::Error::ElementNotVisibleError,
               Capybara::ElementNotFound
          break
        end
      end

      # Add directory name for album.
      move_downloads_to(album_dirname(find('.fbPhotoAlbumTitle').text))
    end

    # HACK(maros): Make the backdrop for Chrome Notifications go away. Find a
    # pref for `chromedriver` to make this disabled by default.
    def dismiss_notification_backdrop
      find('._3ixn').click
    rescue Capybara::ElementNotFound
      # No backdrop was shown, so there is nothing to dismiss.
    end

    # Scrolls to the bottom of the page until the number of photo links stops
    # growing, so that all photos load despite infinite scroll.
    # NOTE(review): the count is re-read right after scrolling without any
    # wait, so a slow page load may end the loop early -- confirm.
    def load_all_photo_links
      photo_links = []

      loop do
        execute_script('window.scrollTo(0, document.body.scrollHeight);')
        links = find_link_elems_with('/photo.php')
        break if links.length == photo_links.length
        photo_links = links
      end

      photo_links
    end

    # Turns an album title into a single safe path component: lower-cased,
    # spaces replaced by underscores, and path separators / NUL bytes
    # neutralised so that a title can never point outside of `tmp`.
    def album_dirname(title)
      name = title.downcase.gsub(' ', '_').gsub(%r{[/\\\x00]}, '_')
      ['', '.', '..'].include?(name) ? 'album' : name
    end

    # Returns `path` if nothing exists there yet, otherwise the first free
    # `path_2`, `path_3`, ... so albums sharing a title do not collide.
    def unique_path(path)
      candidate = path
      suffix = 1

      while File.exist?(candidate)
        suffix += 1
        candidate = "#{path}_#{suffix}"
      end

      candidate
    end

    # Renames `tmp/downloads` to `tmp/<dirname>` (made unique if taken).
    def move_downloads_to(dirname)
      destination = unique_path(File.join(Dir.pwd, 'tmp', dirname))
      File.rename("#{Dir.pwd}/tmp/downloads", destination)

    # This will fail if no files were downloaded because the `tmp/downloads`
    # directory will not exist.
    rescue Errno::ENOENT
    end

    # Visits the profile's album overview and scrapes each linked album.
    def scrape_photos(username)
      visit("/#{username}/photos_albums")
      # Collect plain href strings first: element handles would go stale as
      # soon as `scrape_album` navigates away from this page.
      find_links_with('/media/set').each { |link| scrape_album(link) }
    end

    # Signs in with the credentials from the environment, asking on standard
    # input for a two-factor code whenever Facebook requests one.
    def login
      visit('/login')
      # `fetch` fails loudly on a missing variable instead of typing `nil`.
      fill_in('email', with: ENV.fetch('FACEBOOK_EMAIL'))
      fill_in('pass', with: ENV.fetch('FACEBOOK_PASSWORD'))
      click_button('loginbutton')

      while has_css?('#approvals_code')
        fill_in('approvals_code', with: prompt_for_login_code)
        click_button('checkpointSubmitButton')
      end

      # Click through any remaining checkpoint screens.
      click_button('checkpointSubmitButton') while has_css?('#checkpointSubmitButton')
    end

    # Asks the user for the two-factor login code.
    #
    # Reads from `$stdin` explicitly: a bare `gets` reads from the files
    # named in ARGV when the host script was started with arguments.
    #
    # @return [String] the entered code without the trailing newline
    # @raise [EOFError] if standard input is closed before a code is entered
    def prompt_for_login_code
      print 'Enter your 6-digit login code: '
      $stdout.flush

      code = $stdin.gets
      raise EOFError, 'no login code could be read from standard input' if code.nil?

      code.chomp
    end
  end
end
6 | # 7 | # Given that it is always loaded, you are encouraged to keep this file as 8 | # light-weight as possible. Requiring heavyweight dependencies from this file 9 | # will add to the boot time of your test suite on EVERY test run, even for an 10 | # individual file that may not need all of that loaded. Instead, consider making 11 | # a separate helper file that requires the additional dependencies and performs 12 | # the additional setup, and require it from the spec files that actually need 13 | # it. 14 | # 15 | # The `.rspec` file also contains a few flags that are not defaults but that 16 | # users commonly want. 17 | # 18 | # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration 19 | RSpec.configure do |config| 20 | # rspec-expectations config goes here. You can use an alternate 21 | # assertion/expectation library such as wrong or the stdlib/minitest 22 | # assertions if you prefer. 23 | config.expect_with :rspec do |expectations| 24 | # This option will default to `true` in RSpec 4. It makes the `description` 25 | # and `failure_message` of custom matchers include text for helper methods 26 | # defined using `chain`, e.g.: 27 | # be_bigger_than(2).and_smaller_than(4).description 28 | # # => "be bigger than 2 and smaller than 4" 29 | # ...rather than: 30 | # # => "be bigger than 2" 31 | expectations.include_chain_clauses_in_custom_matcher_descriptions = true 32 | end 33 | 34 | # rspec-mocks config goes here. You can use an alternate test double 35 | # library (such as bogus or mocha) by changing the `mock_with` option here. 36 | config.mock_with :rspec do |mocks| 37 | # Prevents you from mocking or stubbing a method that does not exist on 38 | # a real object. This is generally recommended, and will default to 39 | # `true` in RSpec 4. 
40 | mocks.verify_partial_doubles = true 41 | end 42 | 43 | # This option will default to `:apply_to_host_groups` in RSpec 4 (and will 44 | # have no way to turn it off -- the option exists only for backwards 45 | # compatibility in RSpec 3). It causes shared context metadata to be 46 | # inherited by the metadata hash of host groups and examples, rather than 47 | # triggering implicit auto-inclusion in groups with matching metadata. 48 | config.shared_context_metadata_behavior = :apply_to_host_groups 49 | 50 | # The settings below are suggested to provide a good initial experience 51 | # with RSpec, but feel free to customize to your heart's content. 52 | =begin 53 | # This allows you to limit a spec run to individual examples or groups 54 | # you care about by tagging them with `:focus` metadata. When nothing 55 | # is tagged with `:focus`, all examples get run. RSpec also provides 56 | # aliases for `it`, `describe`, and `context` that include `:focus` 57 | # metadata: `fit`, `fdescribe` and `fcontext`, respectively. 58 | config.filter_run_when_matching :focus 59 | 60 | # Allows RSpec to persist some state between runs in order to support 61 | # the `--only-failures` and `--next-failure` CLI options. We recommend 62 | # you configure your source control system to ignore this file. 63 | config.example_status_persistence_file_path = "spec/examples.txt" 64 | 65 | # Limits the available syntax to the non-monkey patched syntax that is 66 | # recommended. For more details, see: 67 | # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/ 68 | # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/ 69 | # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode 70 | config.disable_monkey_patching! 71 | 72 | # This setting enables warnings. It's recommended, but in some cases may 73 | # be too noisy due to issues in dependencies. 
74 | config.warnings = true 75 | 76 | # Many RSpec users commonly either run the entire suite or an individual 77 | # file, and it's useful to allow more verbose output when running an 78 | # individual spec file. 79 | if config.files_to_run.one? 80 | # Use the documentation formatter for detailed output, 81 | # unless a formatter has already been configured 82 | # (e.g. via a command-line flag). 83 | config.default_formatter = 'doc' 84 | end 85 | 86 | # Print the 10 slowest examples and example groups at the 87 | # end of the spec run, to help surface which specs are running 88 | # particularly slow. 89 | config.profile_examples = 10 90 | 91 | # Run specs in random order to surface order dependencies. If you find an 92 | # order dependency and want to debug it, you can fix the order by providing 93 | # the seed, which is printed after each run. 94 | # --seed 1234 95 | config.order = :random 96 | 97 | # Seed global randomization in this process using the `--seed` CLI option. 98 | # Setting this allows you to use `--seed` to deterministically reproduce 99 | # test failures related to randomization by passing the same `--seed` value 100 | # as the one that triggered the failure. 101 | Kernel.srand config.seed 102 | =end 103 | end 104 | --------------------------------------------------------------------------------