├── .gitignore ├── .rspec ├── .travis.yml ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── bin ├── console └── setup ├── lib ├── pdftotext.rb └── pdftotext │ ├── cli.rb │ ├── document.rb │ ├── page.rb │ └── version.rb ├── pdftotext.gemspec └── spec ├── fixtures └── pdf.pdf ├── pdftotext_cli_spec.rb ├── pdftotext_document_spec.rb ├── pdftotext_page_spec.rb ├── pdftotext_spec.rb └── spec_helper.rb /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /Gemfile.lock 4 | /_yardoc/ 5 | /coverage/ 6 | /doc/ 7 | /pkg/ 8 | /spec/reports/ 9 | /tmp/ 10 | *.gem 11 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | sudo: required 3 | 4 | cache: 5 | - bundler 6 | - apt 7 | 8 | before_install: 9 | - sudo apt-get update 10 | - sudo apt-get install -y poppler-utils 11 | - gem install bundler --version '1.10.6' 12 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in pdftotext.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Ben Balter 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including 
without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pdftotext 2 | 3 | *A Ruby wrapper for the `pdftotext` command line library* 4 | 5 | [![Build Status](https://travis-ci.org/benbalter/pdftotext.svg)](https://travis-ci.org/benbalter/pdftotext) 6 | 7 | ## Installation 8 | 9 | 1. You must first install [Poppler](http://poppler.freedesktop.org/). On OS X this can be done with `brew install poppler` if you have Homebrew installed. 10 | 2. Add `gem "pdftotext"` to your project's Gemfile 11 | 3. `bundle install` 12 | 13 | ## Usage 14 | 15 | ```ruby 16 | text = Pdftotext.text('path-to.pdf') 17 | => "The text of the PDF" 18 | 19 | pages = Pdftotext.pages('path-to.pdf') 20 | pages.first.number 21 | => 1 22 | pages.first.text 23 | => "The text of the PDF" 24 | ``` 25 | 26 | Both methods take an optional hash of command line arguments to pass to `pdftotext`. The only argument passed by default is `-layout`. For example, `Pdftotext.pages('path-to.pdf', l: 1)` passes `-l 1` in addition to `-layout`. 
27 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task :default => :spec 7 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "pdftotext" 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | # (If you use this, don't forget to add pry to your Gemfile!) 10 | require "pry" 11 | Pry.start 12 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | 5 | bundle install 6 | 7 | # Do any other automated setup that you need to do here 8 | -------------------------------------------------------------------------------- /lib/pdftotext.rb: -------------------------------------------------------------------------------- 1 | require "cliver" 2 | require "open3" 3 | require "pdftotext/version" 4 | require "pdftotext/document" 5 | require "pdftotext/cli" 6 | require "pdftotext/page" 7 | 8 | module Pdftotext 9 | 10 | def self.text(path, options={}) 11 | Document.new(path).text(options) 12 | end 13 | 14 | def self.pages(path, options={}) 15 | Document.new(path).pages(options) 16 | end 17 | 18 | def self.cli 19 | @cli ||= CLI.new 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/pdftotext/cli.rb: -------------------------------------------------------------------------------- 1 | module Pdftotext 2 | 
class CLI 3 | 4 | DEFAULT_OPTIONS = { 5 | :layout => true 6 | } 7 | 8 | def run_command(*args) 9 | options = DEFAULT_OPTIONS.merge(args.pop) 10 | args = args.concat options_to_args(options) 11 | output, status = Open3.capture2e(bin_path, *args) 12 | raise "Command `#{bin_path} #{args.join(" ")}` failed: #{output}" if status.exitstatus != 0 13 | output 14 | end 15 | 16 | private 17 | 18 | def bin_path 19 | @bin_path ||= Cliver.detect!('pdftotext') 20 | end 21 | 22 | def options_to_args(options) 23 | args = [] 24 | options.each do |key, value| 25 | next if value === false 26 | args.push "-#{key}" 27 | args.push value.to_s unless value === true 28 | end 29 | args 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /lib/pdftotext/document.rb: -------------------------------------------------------------------------------- 1 | module Pdftotext 2 | class Document 3 | attr_reader :path 4 | 5 | def initialize(path) 6 | @path = File.expand_path(path) 7 | end 8 | 9 | def text(options={}) 10 | Tempfile.open(['pdftotext', '.txt']) do |file| 11 | Pdftotext.cli.run_command path, file.path, options 12 | file.read 13 | end 14 | end 15 | 16 | def pages(options={}) 17 | pages = text(options).split("\f") 18 | pages.each_with_index.map { |t,i| Page.new text: t, number: i+1 } 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/pdftotext/page.rb: -------------------------------------------------------------------------------- 1 | module Pdftotext 2 | class Page 3 | attr_reader :text, :number 4 | 5 | def initialize(options) 6 | @text = options[:text] 7 | @number = options[:number] 8 | end 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /lib/pdftotext/version.rb: -------------------------------------------------------------------------------- 1 | module Pdftotext 2 | VERSION = "0.2.1" 3 | end 4 | 
-------------------------------------------------------------------------------- /pdftotext.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'pdftotext/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "pdftotext" 8 | spec.version = Pdftotext::VERSION 9 | spec.authors = ["Ben Balter"] 10 | spec.email = ["ben.balter@github.com"] 11 | 12 | spec.summary = "A Ruby wrapper for the `pdftotext` command line library" 13 | spec.homepage = "https://github.com/benbalter/pdftotext" 14 | spec.license = "MIT" 15 | 16 | spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } 17 | spec.bindir = "exe" 18 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 19 | spec.require_paths = ["lib"] 20 | 21 | spec.add_dependency "cliver", "~> 0.3" 22 | spec.add_development_dependency "bundler", "~> 1.10" 23 | spec.add_development_dependency "rake", "~> 10.0" 24 | spec.add_development_dependency "rspec", "~> 3.4" 25 | spec.add_development_dependency "pry", "~> 0.10" 26 | end 27 | -------------------------------------------------------------------------------- /spec/fixtures/pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benbalter/pdftotext/013bbddb9a05f25b9f24bd3c5cdf1ac53dd4c257/spec/fixtures/pdf.pdf -------------------------------------------------------------------------------- /spec/pdftotext_cli_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Pdftotext::CLI do 4 | it "knows the bin path" do 5 | expect(subject.send(:bin_path)).to match(/\/pdftotext\z/) 6 | end 7 | 8 | it "converts options to args" do 9 | options = {:foo => true, :bar => false, :page => 1} 10 | args = 
subject.send(:options_to_args, options) 11 | expect(args).to eql(["-foo", "-page", "1"]) 12 | end 13 | 14 | it "runs a command" do 15 | output = subject.run_command "v" => true 16 | expect(output).to match(/\Apdftotext version \d+\.\d+\.\d+/) 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /spec/pdftotext_document_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Pdftotext::Document do 4 | 5 | subject { Pdftotext::Document.new(fixture_path) } 6 | 7 | it "knows the path" do 8 | expect(subject.path).to eql(fixture_path) 9 | end 10 | 11 | it "return the text" do 12 | text = subject.text 13 | expect(text).to eql("This is a test.\n\fThis is another test.\n\f") 14 | end 15 | 16 | it "returns the pages" do 17 | pages = subject.pages 18 | expect(pages.count).to eql(2) 19 | 20 | expect(pages.first.text).to eql("This is a test.\n") 21 | expect(pages.first.number).to eql(1) 22 | 23 | expect(pages.last.text).to eql("This is another test.\n") 24 | expect(pages.last.number).to eql(2) 25 | end 26 | 27 | it "respects non-default command line arguments" do 28 | pages = subject.pages({l: 1}) 29 | expect(pages.count).to eql(1) 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /spec/pdftotext_page_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Pdftotext::Page do 4 | 5 | subject { Pdftotext::Page.new text: "foo", number: 10 } 6 | 7 | it "exposes the text" do 8 | expect(subject.text).to eql("foo") 9 | end 10 | 11 | it "exposes the page number" do 12 | expect(subject.number).to eql(10) 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /spec/pdftotext_spec.rb: -------------------------------------------------------------------------------- 1 | require 
'spec_helper' 2 | 3 | describe Pdftotext do 4 | it 'has a version number' do 5 | expect(Pdftotext::VERSION).not_to be nil 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) 2 | require 'pdftotext' 3 | 4 | def fixture_path 5 | File.expand_path "./fixtures/pdf.pdf", File.dirname(__FILE__) 6 | end 7 | --------------------------------------------------------------------------------