├── lib ├── panner │ ├── tumblr.rb │ ├── version.rb │ ├── panner.rb │ ├── package.rb │ ├── package_file_saver.rb │ ├── cli.rb │ ├── tumblr │ │ ├── feed.rb │ │ └── post.rb │ └── pans │ │ └── wordpress.rb └── panner.rb ├── Rakefile ├── Gemfile ├── bin ├── panner ├── setup └── console ├── .gitignore ├── panner.gemspec ├── LICENSE.txt └── README.md /lib/panner/tumblr.rb: -------------------------------------------------------------------------------- 1 | module Panner::Tumblr 2 | end -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | task :default => :spec 3 | -------------------------------------------------------------------------------- /lib/panner/version.rb: -------------------------------------------------------------------------------- 1 | module Panner 2 | VERSION = "0.1.0" 3 | end 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in panner.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /bin/panner: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "panner" 5 | 6 | Panner::CLI.new.run(ARGV) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /Gemfile.lock 4 | /_yardoc/ 5 | /coverage/ 6 | /doc/ 7 | /pkg/ 8 | /spec/reports/ 9 | /tmp/ 10 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "panner" 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | # (If you use this, don't forget to add pry to your Gemfile!) 10 | # require "pry" 11 | # Pry.start 12 | 13 | require "irb" 14 | IRB.start 15 | -------------------------------------------------------------------------------- /lib/panner.rb: -------------------------------------------------------------------------------- 1 | require "json" 2 | 3 | require "mechanize" 4 | require "deba" 5 | 6 | module Panner 7 | module Pans 8 | end 9 | 10 | # Your code goes here... 11 | end 12 | 13 | require "panner/version" 14 | require "panner/package" 15 | require "panner/package_file_saver" 16 | require "panner/tumblr" 17 | require "panner/tumblr/post" 18 | require "panner/tumblr/feed" 19 | require "panner/pans/wordpress" 20 | require "panner/panner" 21 | require "panner/cli" 22 | -------------------------------------------------------------------------------- /lib/panner/panner.rb: -------------------------------------------------------------------------------- 1 | class Panner::Panner 2 | def initialize(options) 3 | @options = options 4 | puts "@options: #{@options.inspect}" 5 | end 6 | 7 | def start 8 | pan = Panner::Pans::Wordpress.new(@options[:url]) 9 | 10 | nuggets = [] 11 | 12 | loop do 13 | fresh_nuggets = pan.download 14 | break if fresh_nuggets.nil? 15 | 16 | nuggets.concat(fresh_nuggets) 17 | pan.next 18 | end 19 | 20 | nuggets.each do |nugget| 21 | puts nugget.inspect 22 | puts "=======================================================================================================" 23 | end 24 | end 25 | end -------------------------------------------------------------------------------- /lib/panner/package.rb: -------------------------------------------------------------------------------- 1 | class Panner::Package 2 | attr_reader :root, :paths 3 | 4 | def initialize(root) 5 | raise "root must be an instance of Pathname" unless root.is_a?(Pathname) 6 | 7 | @root = root 8 | @paths = [] 9 | end 10 | 11 | def agent 12 | @agent ||= create_agent 13 | end 14 | 15 | def add(path, content = nil) 16 | file = (@root + path) 17 | file.write(content) unless content.nil? 18 | @paths << file 19 | end 20 | 21 | def finish 22 | add("package.json", @paths.map { |path| path.relative_path_from(@root).to_s }.to_json) 23 | end 24 | 25 | def create_agent 26 | agent = Mechanize.new 27 | agent.pluggable_parser['image'] = Panner::PackageFileSaver.for(self) 28 | agent 29 | end 30 | end -------------------------------------------------------------------------------- /lib/panner/package_file_saver.rb: -------------------------------------------------------------------------------- 1 | class Panner::PackageFileSaver < Mechanize::Download 2 | def self.package 3 | @package 4 | end 5 | 6 | def initialize(*args) 7 | super(*args) 8 | 9 | @package = self.class.package 10 | 11 | if(path.to_s.bytes.length > 256) 12 | @filename = "#{SecureRandom.hex(16)}#{Pathname.new(@filename).extname}}" 13 | end 14 | 15 | if(path.to_s.bytes.length > 256) 16 | @filename = SecureRandom.hex(16) 17 | end 18 | 19 | save(path) 20 | 21 | @package.add(@filename) 22 | end 23 | 24 | def path 25 | @package.root + @filename 26 | end 27 | 28 | def self.for(package) 29 | Class.new self do |klass| 30 | klass.instance_variable_set :@package, package 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/panner/cli.rb: -------------------------------------------------------------------------------- 1 | require "optparse" 2 | 3 | class Panner::CLI 4 | def run(arguments) 5 | options = {} 6 | 7 | opt_parser = OptionParser.new do |opts| 8 | opts.banner = "Usage: panner [options] URL" 9 | 10 | opts.on("-uUSERNAME", "--username=USERNAME", "Username") do |username| 11 | options[:username] = username 12 | end 13 | 14 | opts.on("-pPASSWORD", "--password=PASSWORD", "Username") do |password| 15 | options[:password] = password 16 | end 17 | 18 | opts.on("-h", "--help", "Prints this help") do 19 | puts opts 20 | exit 21 | end 22 | end 23 | 24 | opt_parser.parse!(arguments) 25 | options[:url] = arguments.first 26 | 27 | Panner::Panner.new(options).start 28 | end 29 | end 30 | 31 | -------------------------------------------------------------------------------- /lib/panner/tumblr/feed.rb: -------------------------------------------------------------------------------- 1 | class Panner::Tumblr::Feed 2 | attr_reader :posts 3 | 4 | def initialize(package, url) 5 | url =~ /^https?:\/\/(.*?).tumblr.com/ 6 | @username = $1 7 | @index = 1 8 | 9 | @package = package 10 | end 11 | 12 | def scrape 13 | loop { break unless next_page } 14 | end 15 | 16 | def next_page 17 | new_posts = scrape_page(@package.agent.get(url)) 18 | @index += 1 19 | new_posts.length > 0 20 | end 21 | 22 | def scrape_page(page) 23 | page.search("article[data-post-id]").map do |post| 24 | scrape_post(post) 25 | end 26 | end 27 | 28 | def scrape_post(post_node) 29 | post = Panner::Tumblr::Post.new(@package, post_node) 30 | post.scrape 31 | post 32 | end 33 | 34 | def url 35 | if @index == 1 36 | "https://#{@username}.tumblr.com/" 37 | else 38 | "https://#{@username}.tumblr.com/page/#{@index}" 39 | end 40 | end 41 | end -------------------------------------------------------------------------------- /lib/panner/pans/wordpress.rb: -------------------------------------------------------------------------------- 1 | class Panner::Pans::Wordpress 2 | def self.eligible?(url) 3 | url =~ /^https?:\/\/[^\/]+\.wordpress\.com/ 4 | end 5 | 6 | def initialize(url) 7 | @agent = Mechanize.new 8 | @next_url = url 9 | @page = nil 10 | end 11 | 12 | def authenticate(options) 13 | end 14 | 15 | def download 16 | @page = @agent.get(@next_url) 17 | puts "got page content" 18 | 19 | if @next_url.nil? 20 | puts "no more content" 21 | return 22 | end 23 | 24 | @page.search("article.post").map do |article| 25 | parse_article(article) 26 | end 27 | end 28 | 29 | def next 30 | link = @page.search("div.nav-links div.nav-previous a").first 31 | @next_url = link ? link['href'] : nil 32 | puts "next_url: @next_url" 33 | end 34 | 35 | def parse_article(article) 36 | out = {} 37 | out[:title] = article.at_css(".entry-title").text 38 | out[:body] = Deba.extract(article.at_css(".entry-content").inner_html) 39 | 40 | out 41 | end 42 | end -------------------------------------------------------------------------------- /panner.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'panner/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "panner" 8 | spec.version = Panner::VERSION 9 | spec.authors = ["Brenton \"B-Train\" Fletcher"] 10 | spec.email = ["i@bloople.net"] 11 | 12 | spec.summary = %q{Panner pans for website gold.} 13 | spec.description = %q{Panner pans for website gold.} 14 | spec.homepage = "http://panner.com" 15 | spec.license = "MIT" 16 | 17 | spec.files = `git ls-files -z`.split("\x0").reject do |f| 18 | f.match(%r{^(test|spec|features)/}) 19 | end 20 | spec.bindir = "exe" 21 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 22 | spec.require_paths = ["lib"] 23 | 24 | spec.add_development_dependency "bundler", "~> 1.13" 25 | spec.add_development_dependency "rake", "~> 10.0" 26 | spec.add_dependency "mechanize" 27 | spec.add_dependency "deba", "~> 0.10" 28 | end 29 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Brenton "B-Train" Fletcher 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /lib/panner/tumblr/post.rb: -------------------------------------------------------------------------------- 1 | class Panner::Tumblr::Post 2 | attr_reader :id, :image_urls, :html, :text 3 | 4 | def initialize(package, node) 5 | @package = package 6 | @node = node 7 | end 8 | 9 | def scrape 10 | @id = @node['data-post-id'] 11 | 12 | post_content = @node.search(".post-content").first 13 | 14 | @html = post_content.to_html 15 | @text = Deba.extract(@html) 16 | 17 | @dependent_urls = [] 18 | 19 | iframe = post_content.at("iframe") 20 | scrape_images(@package.agent.get(iframe['src'])) if iframe 21 | 22 | scrape_images(post_content) 23 | 24 | scrape_dependencies 25 | 26 | @package.add("#{@id}.json", serialize.to_json) 27 | end 28 | 29 | def serialize 30 | { 31 | id: @id, 32 | html: @html, 33 | text: @text, 34 | dependent_urls_map: @dependent_urls_map 35 | } 36 | end 37 | 38 | def scrape_images(container) 39 | images = container.search("img").reject { |image| image.matches?(".avatar img") } 40 | @dependent_urls.concat(images.map { |image| image['src'] }) 41 | end 42 | 43 | def scrape_dependencies 44 | @dependent_urls_map = {} 45 | 46 | @dependent_urls.each do |url| 47 | begin 48 | file_saver = @package.agent.get(url) 49 | @dependent_urls_map[url] = file_saver.filename 50 | rescue Exception => e 51 | puts "#{e.message}" 52 | end 53 | end 54 | end 55 | end -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Panner 2 | 3 | Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/panner`. To experiment with that code, run `bin/console` for an interactive prompt. 4 | 5 | TODO: Delete this and the text above, and describe your gem 6 | 7 | ## Installation 8 | 9 | Add this line to your application's Gemfile: 10 | 11 | ```ruby 12 | gem 'panner' 13 | ``` 14 | 15 | And then execute: 16 | 17 | $ bundle 18 | 19 | Or install it yourself as: 20 | 21 | $ gem install panner 22 | 23 | ## Usage 24 | 25 | TODO: Write usage instructions here 26 | 27 | ## Development 28 | 29 | After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment. 30 | 31 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). 32 | 33 | ## Contributing 34 | 35 | Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/panner. 36 | 37 | 38 | ## License 39 | 40 | The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT). 41 | 42 | --------------------------------------------------------------------------------