├── .gitignore ├── vendor ├── logging.properties ├── jodconverter │ ├── juh-3.2.1.jar │ ├── jurt-3.2.1.jar │ ├── ridl-3.2.1.jar │ ├── unoil-3.2.1.jar │ ├── json-20090211.jar │ ├── commons-cli-1.1.jar │ ├── commons-io-1.4.jar │ └── jodconverter-core-3.0-beta-4.jar └── conf │ └── document-formats.js ├── test ├── fixtures │ ├── image.png │ ├── unicode.pdf │ ├── corrosion.pdf │ ├── encrypted.pdf │ ├── Faktura 10.pdf │ ├── obama_arts.pdf │ ├── obama_veterans.doc │ ├── completely_encrypted.pdf │ ├── corrosion.reoriented.pdf │ ├── with_pdf_extension │ │ ├── actually_a_doc.pdf │ │ ├── actually_an_image.pdf │ │ ├── this_ones_a_real_pdf.pdf │ │ └── actually_an_rtf.pdf │ ├── without_pdf_extension │ │ └── indesign │ │ │ ├── test_pdf_1_3 │ │ │ ├── test_pdf_1_4 │ │ │ ├── test_pdf_1_5 │ │ │ ├── test_pdf_1_6 │ │ │ └── test_pdf_1_7 │ ├── PDF file with spaces 'single' and "double quotes".doc │ ├── PDF file with spaces 'single' and "double quotes".pdf │ └── obama_hopes.rtf ├── test_helper.rb └── unit │ ├── test_extract_pages.rb │ ├── test_transparent_pdfs.rb │ ├── test_convert_to_pdf.rb │ ├── test_extract_info.rb │ ├── test_extract_images.rb │ └── test_extract_text.rb ├── bin └── docsplit ├── Rakefile ├── README ├── lib ├── docsplit │ ├── transparent_pdfs.rb │ ├── page_extractor.rb │ ├── info_extractor.rb │ ├── text_cleaner.rb │ ├── image_extractor.rb │ ├── command_line.rb │ ├── text_extractor.rb │ └── pdf_extractor.rb └── docsplit.rb ├── noto_bolt.svg ├── docsplit.gemspec ├── LICENSE └── index.html /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /vendor/logging.properties: -------------------------------------------------------------------------------- 1 | .level=WARNING 2 | -------------------------------------------------------------------------------- /test/fixtures/image.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/image.png -------------------------------------------------------------------------------- /test/fixtures/unicode.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/unicode.pdf -------------------------------------------------------------------------------- /test/fixtures/corrosion.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/corrosion.pdf -------------------------------------------------------------------------------- /test/fixtures/encrypted.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/encrypted.pdf -------------------------------------------------------------------------------- /test/fixtures/Faktura 10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/Faktura 10.pdf -------------------------------------------------------------------------------- /test/fixtures/obama_arts.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/obama_arts.pdf -------------------------------------------------------------------------------- /test/fixtures/obama_veterans.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/obama_veterans.doc -------------------------------------------------------------------------------- /vendor/jodconverter/juh-3.2.1.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/juh-3.2.1.jar -------------------------------------------------------------------------------- /vendor/jodconverter/jurt-3.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/jurt-3.2.1.jar -------------------------------------------------------------------------------- /vendor/jodconverter/ridl-3.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/ridl-3.2.1.jar -------------------------------------------------------------------------------- /vendor/jodconverter/unoil-3.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/unoil-3.2.1.jar -------------------------------------------------------------------------------- /vendor/jodconverter/json-20090211.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/json-20090211.jar -------------------------------------------------------------------------------- /test/fixtures/completely_encrypted.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/completely_encrypted.pdf -------------------------------------------------------------------------------- /test/fixtures/corrosion.reoriented.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/corrosion.reoriented.pdf 
-------------------------------------------------------------------------------- /vendor/jodconverter/commons-cli-1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/commons-cli-1.1.jar -------------------------------------------------------------------------------- /vendor/jodconverter/commons-io-1.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/commons-io-1.4.jar -------------------------------------------------------------------------------- /bin/docsplit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "#{File.dirname(__FILE__)}/../lib/docsplit/command_line.rb" 4 | 5 | Docsplit::CommandLine.new -------------------------------------------------------------------------------- /test/fixtures/with_pdf_extension/actually_a_doc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/with_pdf_extension/actually_a_doc.pdf -------------------------------------------------------------------------------- /vendor/jodconverter/jodconverter-core-3.0-beta-4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -------------------------------------------------------------------------------- /test/fixtures/with_pdf_extension/actually_an_image.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/with_pdf_extension/actually_an_image.pdf -------------------------------------------------------------------------------- 
/test/fixtures/with_pdf_extension/this_ones_a_real_pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/with_pdf_extension/this_ones_a_real_pdf.pdf -------------------------------------------------------------------------------- /test/fixtures/without_pdf_extension/indesign/test_pdf_1_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/without_pdf_extension/indesign/test_pdf_1_3 -------------------------------------------------------------------------------- /test/fixtures/without_pdf_extension/indesign/test_pdf_1_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/without_pdf_extension/indesign/test_pdf_1_4 -------------------------------------------------------------------------------- /test/fixtures/without_pdf_extension/indesign/test_pdf_1_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/without_pdf_extension/indesign/test_pdf_1_5 -------------------------------------------------------------------------------- /test/fixtures/without_pdf_extension/indesign/test_pdf_1_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/without_pdf_extension/indesign/test_pdf_1_6 -------------------------------------------------------------------------------- /test/fixtures/without_pdf_extension/indesign/test_pdf_1_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/without_pdf_extension/indesign/test_pdf_1_7 
-------------------------------------------------------------------------------- /test/fixtures/PDF file with spaces 'single' and "double quotes".doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/PDF file with spaces 'single' and "double quotes".doc -------------------------------------------------------------------------------- /test/fixtures/PDF file with spaces 'single' and "double quotes".pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/PDF file with spaces 'single' and "double quotes".pdf -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'fileutils' 2 | require 'rake/testtask' 3 | 4 | desc 'Run all tests' 5 | task :test do 6 | require 'minitest/autorun' 7 | Dir['./test/*/**/test_*.rb'].each {|test| require test } 8 | end 9 | 10 | namespace :gem do 11 | 12 | desc 'Build and install the docsplit gem' 13 | task :install do 14 | sh "gem build docsplit.gemspec" 15 | sh "sudo gem install #{Dir['*.gem'].join(' ')} --local --no-ri --no-rdoc" 16 | end 17 | 18 | desc 'Uninstall the docsplit gem' 19 | task :uninstall do 20 | sh "sudo gem uninstall -x docsplit" 21 | end 22 | 23 | end 24 | 25 | task :default => :test 26 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | == 2 | __ ___ __ 3 | ____/ /___ ______________ / (_) /_ 4 | / __ / __ \/ ___/ ___/ __ \/ / / __/ 5 | / /_/ / /_/ / /__(__ ) /_/ / / / /_ 6 | \____/\____/\___/____/ .___/_/_/\__/ 7 | /_/ 8 | 9 | Docsplit is a command-line utility and Ruby library for splitting apart 10 | documents into their component parts: searchable UTF-8 plain 
text, page 11 | images or thumbnails in any format, PDFs, single pages, and document 12 | metadata (title, author, number of pages...) 13 | 14 | Installation: 15 | gem install docsplit 16 | 17 | For documentation, usage, and examples, see: 18 | https://documentcloud.github.io/docsplit/ 19 | 20 | To suggest a feature or report a bug: 21 | http://github.com/documentcloud/docsplit/issues/ 22 | 23 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | here = File.dirname(__FILE__) 2 | require File.join(here, '..', 'lib', 'docsplit') 3 | require 'fileutils' 4 | require 'minitest' 5 | require "minitest/autorun" 6 | 7 | class Minitest::Test 8 | include Docsplit 9 | 10 | OUTPUT = 'test/output' 11 | 12 | def clear_output 13 | FileUtils.rm_r(OUTPUT) if File.exist?(OUTPUT) 14 | end 15 | 16 | def teardown 17 | clear_output 18 | end 19 | 20 | def assert_directory_contains(dir, files) 21 | files_in_directory = Dir["#{dir}/*"] 22 | if files.kind_of?(Array) 23 | assert files_in_directory.length == files.length, "Expected directory to contain exactly #{files.length} files" 24 | else 25 | files = [files] 26 | end 27 | files.each { |f| assert files_in_directory.include?(File.join(dir, f)), "Expected directory #{dir} to contain file #{f}, but it contains #{files_in_directory.inspect}" } 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /lib/docsplit/transparent_pdfs.rb: -------------------------------------------------------------------------------- 1 | module Docsplit 2 | 3 | # Include a method to transparently convert non-PDF arguments to temporary 4 | # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on. 5 | module TransparentPDFs 6 | 7 | # Temporarily convert any non-PDF documents to PDFs before running them 8 | # through further extraction. 
9 | def ensure_pdfs(docs) 10 | [docs].flatten.map do |doc| 11 | if is_pdf?(doc) 12 | doc 13 | else 14 | tempdir = File.join(Dir.tmpdir, 'docsplit') 15 | extract_pdf([doc], {:output => tempdir}) 16 | File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf') 17 | end 18 | end 19 | end 20 | 21 | def is_pdf?(doc) 22 | File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/ 23 | end 24 | 25 | end 26 | 27 | extend TransparentPDFs 28 | 29 | end 30 | -------------------------------------------------------------------------------- /test/unit/test_extract_pages.rb: -------------------------------------------------------------------------------- 1 | here = File.expand_path(File.dirname(__FILE__)) 2 | require File.join(here, '..', 'test_helper') 3 | 4 | class ExtractPagesTest < Minitest::Test 5 | 6 | def test_multi_page_extraction 7 | Docsplit.extract_pages('test/fixtures/obama_arts.pdf', :output => OUTPUT) 8 | assert Dir["#{OUTPUT}/*.pdf"].length == 2 9 | end 10 | 11 | def test_password_protected 12 | assert_raises(ExtractionFailed) do 13 | Docsplit.extract_pages('test/fixtures/completely_encrypted.pdf') 14 | end 15 | end 16 | 17 | def test_doc_page_extraction 18 | Docsplit.extract_pages('test/fixtures/obama_veterans.doc', :output => OUTPUT) 19 | assert Dir["#{OUTPUT}/*.pdf"].length == 7 20 | end 21 | 22 | def test_name_escaping_while_extracting_pages 23 | Docsplit.extract_pages('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :output => OUTPUT) 24 | assert Dir["#{OUTPUT}/*.pdf"].length == 2 25 | end 26 | 27 | end 28 | -------------------------------------------------------------------------------- /noto_bolt.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 11 | 12 | -------------------------------------------------------------------------------- /docsplit.gemspec: 
-------------------------------------------------------------------------------- 1 | Gem::Specification.new do |s| 2 | s.name = 'docsplit' 3 | s.version = '0.7.6' # Keep version in sync with docsplit.rb 4 | s.date = '2014-11-17' 5 | 6 | s.homepage = "http://documentcloud.github.com/docsplit/" 7 | s.summary = "Break Apart Documents into Images, Text, Pages and PDFs" 8 | s.description = <<-EOS 9 | Docsplit is a command-line utility and Ruby library for splitting apart 10 | documents into their component parts: searchable UTF-8 plain text, page 11 | images or thumbnails in any format, PDFs, single pages, and document 12 | metadata (title, author, number of pages...) 13 | EOS 14 | 15 | s.authors = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han'] 16 | s.email = 'opensource@documentcloud.org' 17 | s.rubyforge_project = 'docsplit' 18 | s.license = 'MIT' 19 | 20 | s.require_paths = ['lib'] 21 | s.executables = ['docsplit'] 22 | 23 | s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*', 24 | 'docsplit.gemspec', 'LICENSE', 'README'] 25 | end -------------------------------------------------------------------------------- /test/unit/test_transparent_pdfs.rb: -------------------------------------------------------------------------------- 1 | here = File.expand_path(File.dirname(__FILE__)) 2 | require File.join(here, '..', 'test_helper') 3 | require 'tmpdir' 4 | 5 | class TransparentPDFsTest < Minitest::Test 6 | 7 | def setup 8 | @klass = Class.new 9 | @klass.send(:include, Docsplit::TransparentPDFs) 10 | @detector = @klass.new 11 | end 12 | 13 | def test_files_with_pdf_extension_are_always_considered_a_pdf 14 | pdfs = Dir.glob('test/fixtures/with_pdf_extension/*.pdf').select { |path| File.file?(path) } 15 | assert pdfs.any?, 'ensure pdfs with extensions are available to test with' 16 | pdfs.each do |pdf| 17 | assert @detector.is_pdf?(pdf), "#{pdf} with '.pdf' extension is identified as a PDF (regardless of its file contents)" 18 | end 19 | end 20 | 21 | def 
test_pdfs_without_the_pdf_file_extension_is_considerd_a_pdf 22 | pdfs = Dir.glob('test/fixtures/without_pdf_extension/*/*').select { |path| File.file?(path) } 23 | assert pdfs.any?, 'ensure pdfs without extensions are available to test with' 24 | pdfs.each do |pdf| 25 | assert @detector.is_pdf?(pdf), "#{pdf} with '.pdf' extension is identified as a PDF" 26 | end 27 | end 28 | 29 | end 30 | -------------------------------------------------------------------------------- /lib/docsplit/page_extractor.rb: -------------------------------------------------------------------------------- 1 | module Docsplit 2 | 3 | # Delegates to **pdftk** in order to create bursted single pages from 4 | # a PDF document. 5 | class PageExtractor 6 | 7 | # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`. 8 | def extract(pdfs, opts) 9 | extract_options opts 10 | [pdfs].flatten.each do |pdf| 11 | pdf_name = File.basename(pdf, File.extname(pdf)) 12 | page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf" 13 | FileUtils.mkdir_p @output unless File.exist?(@output) 14 | 15 | cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability 16 | "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1" 17 | else 18 | "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1" 19 | end 20 | result = `#{cmd}`.chomp 21 | FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt') 22 | raise ExtractionFailed, result if $? != 0 23 | result 24 | end 25 | end 26 | 27 | 28 | private 29 | 30 | def extract_options(options) 31 | @output = options[:output] || '.' 
32 | end 33 | 34 | end 35 | 36 | end 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html 2 | 3 | Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud 4 | Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors 5 | 6 | Permission is hereby granted, free of charge, to any person 7 | obtaining a copy of this software and associated documentation 8 | files (the "Software"), to deal in the Software without 9 | restriction, including without limitation the rights to use, 10 | copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the 12 | Software is furnished to do so, subject to the following 13 | conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /test/unit/test_convert_to_pdf.rb: -------------------------------------------------------------------------------- 1 | here = File.expand_path(File.dirname(__FILE__)) 2 | require File.join(here, '..', 'test_helper') 3 | 4 | class ConvertToPdfTest < Minitest::Test 5 | 6 | def test_doc_conversion 7 | Docsplit.extract_pdf('test/fixtures/obama_veterans.doc', :output => OUTPUT) 8 | assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/obama_veterans.pdf"] 9 | end 10 | 11 | def test_rtf_conversion 12 | Docsplit.extract_pdf('test/fixtures/obama_hopes.rtf', :output => OUTPUT) 13 | assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/obama_hopes.pdf"] 14 | end 15 | 16 | def test_png_conversion 17 | Docsplit.extract_pdf('test/fixtures/image.png', :output => OUTPUT) 18 | assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/image.pdf"] 19 | end 20 | def test_png_conversion 21 | Docsplit.extract_pdf('test/fixtures/image.png', :output => OUTPUT) 22 | assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/image.pdf"] 23 | end 24 | 25 | def test_conversion_then_page_extraction 26 | Docsplit.extract_pdf('test/fixtures/obama_veterans.doc', :output => OUTPUT) 27 | Docsplit.extract_pages("#{OUTPUT}/obama_veterans.pdf", :output => OUTPUT) 28 | assert Dir["#{OUTPUT}/*.pdf"].length == 8 29 | end 30 | 31 | def test_name_escaping_while_converting 32 | Docsplit.extract_pdf('test/fixtures/PDF file with spaces \'single\' and "double quotes".doc', :output => OUTPUT) 33 | assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/PDF file with spaces 'single' and \"double quotes\".pdf"] 34 | end 35 | 36 | end 37 | -------------------------------------------------------------------------------- /lib/docsplit/info_extractor.rb: -------------------------------------------------------------------------------- 1 | module Docsplit 2 | 3 | # Delegates to **pdfinfo** in order to extract information about a PDF file. 
4 | class InfoExtractor 5 | 6 | # Regex matchers for different bits of information. 7 | MATCHERS = { 8 | :author => /^Author:\s+([^\n]+)/, 9 | :date => /^CreationDate:\s+([^\n]+)/, 10 | :creator => /^Creator:\s+([^\n]+)/, 11 | :keywords => /^Keywords:\s+([^\n]+)/, 12 | :producer => /^Producer:\s+([^\n]+)/, 13 | :subject => /^Subject:\s+([^\n]+)/, 14 | :title => /^Title:\s+([^\n]+)/, 15 | :length => /^Pages:\s+([^\n]+)/, 16 | } 17 | 18 | # Pull out a single datum from a pdf. 19 | def extract(key, pdfs, opts) 20 | extract_all(pdfs, opts)[key] 21 | end 22 | 23 | def extract_all(pdfs, opts) 24 | pdf = [pdfs].flatten.first 25 | cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1" 26 | result = `#{cmd}`.chomp 27 | raise ExtractionFailed, result if $? != 0 28 | # ruby 1.8 (iconv) and 1.9 (String#encode) : 29 | if String.method_defined?(:encode) 30 | result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding? 31 | else 32 | require 'iconv' unless defined?(Iconv) 33 | ic = Iconv.new('UTF-8//IGNORE','UTF-8') 34 | result = ic.iconv(result) 35 | end 36 | info = {} 37 | MATCHERS.each do |key, matcher| 38 | match = result.match(matcher) 39 | answer = match && match[1] 40 | if answer 41 | answer = answer.to_i if key == :length 42 | info[key] = answer 43 | end 44 | end 45 | info 46 | end 47 | 48 | end 49 | 50 | end 51 | -------------------------------------------------------------------------------- /test/unit/test_extract_info.rb: -------------------------------------------------------------------------------- 1 | here = File.expand_path(File.dirname(__FILE__)) 2 | require File.join(here, '..', 'test_helper') 3 | 4 | class ExtractInfoTest < Minitest::Test 5 | 6 | def test_title 7 | assert "PDF Pieces" == Docsplit.extract_title('test/fixtures/encrypted.pdf') 8 | end 9 | 10 | def test_doc_title 11 | assert "Remarks of President Barack Obama" == Docsplit.extract_title('test/fixtures/obama_veterans.doc') 12 | end 13 | 14 | def test_author 
15 | assert "Jeremy Ashkenas" == Docsplit.extract_author('test/fixtures/encrypted.pdf') 16 | end 17 | 18 | def test_date 19 | assert "Thu Nov 29 14:54:46 2007" == Docsplit.extract_date('test/fixtures/obama_arts.pdf') 20 | end 21 | 22 | def test_length 23 | assert 2 == Docsplit.extract_length('test/fixtures/obama_arts.pdf') 24 | end 25 | 26 | def test_producer 27 | assert "Mac OS X 10.6.2 Quartz PDFContext" == Docsplit.extract_producer('test/fixtures/encrypted.pdf') 28 | end 29 | 30 | def test_password_protected 31 | assert_raises(ExtractionFailed) do 32 | Docsplit.extract_author('test/fixtures/completely_encrypted.pdf') 33 | end 34 | end 35 | 36 | def test_name_escaping_while_extracting_info 37 | assert 2 == Docsplit.extract_length('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf') 38 | end 39 | 40 | def test_malformed_unicode 41 | assert Docsplit.extract_date('test/fixtures/Faktura 10.pdf') 42 | end 43 | 44 | def test_extract_all 45 | metadata = Docsplit.extract_info('test/fixtures/obama_arts.pdf') 46 | assert metadata[:author] == "mkommareddi" 47 | assert metadata[:date] == "Thu Nov 29 14:54:46 2007" 48 | assert metadata[:creator] == "PScript5.dll Version 5.2" 49 | assert metadata[:producer] == "Acrobat Distiller 8.1.0 (Windows)" 50 | assert metadata[:title] == "Microsoft Word - Fact Sheet Arts 112907 FINAL.doc" 51 | assert metadata[:length] == 2 52 | assert metadata.length == 6 53 | end 54 | 55 | end 56 | -------------------------------------------------------------------------------- /test/unit/test_extract_images.rb: -------------------------------------------------------------------------------- 1 | here = File.expand_path(File.dirname(__FILE__)) 2 | require File.join(here, '..', 'test_helper') 3 | 4 | class ExtractImagesTest < Minitest::Test 5 | 6 | def test_basic_image_extraction 7 | Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "250x", :output => OUTPUT) 8 | assert_directory_contains(OUTPUT, 
['obama_arts_1.gif', 'obama_arts_2.gif']) 9 | end 10 | 11 | def test_image_formatting 12 | Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => [:jpg, :gif], :size => "250x", :output => OUTPUT) 13 | assert Dir["#{OUTPUT}/*.gif"].length == 2 14 | assert Dir["#{OUTPUT}/*.jpg"].length == 2 15 | end 16 | 17 | def test_page_ranges 18 | Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT) 19 | assert Dir["#{OUTPUT}/*.gif"] == ["#{OUTPUT}/obama_arts_2.gif"] 20 | end 21 | 22 | def test_image_sizes 23 | Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :rolling => true, :size => ["150x", "50x"], :output => OUTPUT) 24 | assert File.size("#{OUTPUT}/50x/obama_arts_1.gif") < File.size("#{OUTPUT}/150x/obama_arts_1.gif") 25 | end 26 | 27 | def test_encrypted_images 28 | Docsplit.extract_images('test/fixtures/encrypted.pdf', :format => :gif, :size => "50x", :output => OUTPUT) 29 | assert File.size("#{OUTPUT}/encrypted_1.gif") > 100 30 | end 31 | 32 | def test_password_protected_extraction 33 | assert_raises(ExtractionFailed) do 34 | Docsplit.extract_images('test/fixtures/completely_encrypted.pdf') 35 | end 36 | end 37 | 38 | def test_repeated_extraction_in_the_same_directory 39 | Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "250x", :output => OUTPUT) 40 | assert_directory_contains(OUTPUT, ['obama_arts_1.gif', 'obama_arts_2.gif']) 41 | Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "250x", :output => OUTPUT) 42 | assert_directory_contains(OUTPUT, ['obama_arts_1.gif', 'obama_arts_2.gif']) 43 | end 44 | 45 | def test_name_escaping_while_extracting_images 46 | Docsplit.extract_images('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :format => :gif, :size => "250x", :output => OUTPUT) 47 | assert_directory_contains(OUTPUT, ['PDF file with spaces \'single\' and "double quotes"_1.gif', 48 | 
'PDF file with spaces \'single\' and "double quotes"_1.gif']) 49 | end 50 | 51 | end 52 | -------------------------------------------------------------------------------- /test/unit/test_extract_text.rb: -------------------------------------------------------------------------------- 1 | here = File.expand_path(File.dirname(__FILE__)) 2 | require File.join(here, '..', 'test_helper') 3 | require 'tmpdir' 4 | 5 | class ExtractTextTest < Minitest::Test 6 | 7 | def test_paged_extraction 8 | Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT) 9 | assert Dir["#{OUTPUT}/*.txt"].length == 2 10 | assert File.read("#{OUTPUT}/obama_arts_1.txt").match("Paid for by Obama for America") 11 | end 12 | 13 | def test_page_only_extraction 14 | Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 2..2, :output => OUTPUT) 15 | assert Dir["#{OUTPUT}/*.txt"] == ["#{OUTPUT}/obama_arts_2.txt"] 16 | end 17 | 18 | def test_capitalized_pdf_extraction 19 | Dir["#{OUTPUT}/*.txt"].each {|previous| FileUtils.rm(previous) } 20 | Dir.mktmpdir do |dir| 21 | FileUtils.cp('test/fixtures/obama_arts.pdf', "#{dir}/OBAMA_ARTS.PDF") 22 | Docsplit.extract_text("#{dir}/OBAMA_ARTS.PDF", :pages => 2..2, :output => OUTPUT) 23 | end 24 | assert Dir["#{OUTPUT}/*.txt"] == ["#{OUTPUT}/OBAMA_ARTS_2.txt"] 25 | end 26 | 27 | def test_unicode_extraction 28 | Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT) 29 | assert Dir["#{OUTPUT}/*.txt"].length == 3 30 | end 31 | 32 | def test_ocr_extraction 33 | Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT) 34 | 4.times do |i| 35 | file = "corrosion_#{i + 1}.txt" 36 | assert_directory_contains(OUTPUT, file) 37 | assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size" 38 | end 39 | end 40 | 41 | def test_ocr_extraction_in_mock_language 42 | exception = 
assert_raises(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")} 43 | assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'" 44 | end 45 | 46 | def test_password_protected 47 | assert_raises(ExtractionFailed) do 48 | Docsplit.extract_text('test/fixtures/completely_encrypted.pdf') 49 | end 50 | end 51 | 52 | def test_name_escaping_while_extracting_text 53 | Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT) 54 | assert Dir["#{OUTPUT}/*.txt"].length == 2 55 | end 56 | 57 | def test_orientation_detected_ocr_extraction 58 | if Docsplit::DEPENDENCIES[:osd] 59 | pages = 1..4 60 | Docsplit.extract_text('test/fixtures/corrosion.reoriented.pdf', :output => OUTPUT, :pages=>pages, :force_ocr => true) 61 | letters = Hash.new(0) 62 | nonletters = Hash.new(0) 63 | 64 | pages.each do |number| 65 | File.open(File.join(OUTPUT,"corrosion.reoriented_#{number}.txt")).each_char do |c| 66 | case c 67 | when /[A-Za-z]/ 68 | letters[c] += 1 69 | when /\s/ 70 | else 71 | nonletters[c] += 1 72 | end 73 | end 74 | end 75 | 76 | # the corrosion.pdf has 6160 letters & 362 nonletters, or ~17:1 77 | # so lets give a fudge factor of ~half of that or 8:1 78 | assert letters.values.reduce(0,:+)/8 > nonletters.values.reduce(0,:+), "Expected that text extracted with orientation detection would have more letters." 79 | else 80 | skip "Orientation detection module (osd) for Tesseract isn't installed" 81 | end 82 | end 83 | 84 | end 85 | -------------------------------------------------------------------------------- /lib/docsplit/text_cleaner.rb: -------------------------------------------------------------------------------- 1 | require 'strscan' 2 | 3 | module Docsplit 4 | 5 | # Cleans up OCR'd text by using a series of heuristics to remove garbage 6 | # words. 
Algorithms taken from: 7 | # 8 | # Automatic Removal of "Garbage Strings" in OCR Text: An Implementation 9 | # -- Taghva, Nartker, Condit, and Borsack 10 | # 11 | # Improving Search and Retrieval Performance through Shortening Documents, 12 | # Detecting Garbage, and Throwing out Jargon 13 | # -- Kulp 14 | # 15 | class TextCleaner 16 | 17 | # Cached regexes we plan on using. 18 | WORD = /\S+/ 19 | SPACE = /\s+/ 20 | NEWLINE = /[\r\n]/ 21 | ALNUM = /[a-z0-9]/i 22 | PUNCT = /[[:punct:]]/i 23 | REPEAT = /([^0-9])\1{2,}/ 24 | UPPER = /[A-Z]/ 25 | LOWER = /[a-z]/ 26 | ACRONYM = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/ 27 | ALL_ALPHA = /^[a-z]+$/i 28 | CONSONANT = /(^y|[bcdfghjklmnpqrstvwxz])/i 29 | VOWEL = /([aeiou]|y$)/i 30 | CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i 31 | VOWEL_5 = /[aeiou]{5}/i 32 | REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/ 33 | SINGLETONS = /^[AaIi]$/ 34 | 35 | # For the time being, `clean` uses the regular StringScanner, and not the 36 | # multibyte-aware version, coercing to ASCII first. 37 | def clean(text) 38 | if String.method_defined?(:encode) 39 | text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?') 40 | else 41 | require 'iconv' unless defined?(Iconv) 42 | text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first 43 | end 44 | 45 | scanner = StringScanner.new(text) 46 | cleaned = [] 47 | spaced = false 48 | loop do 49 | if space = scanner.scan(SPACE) 50 | cleaned.push(space) unless spaced && (space !~ NEWLINE) 51 | spaced = true 52 | elsif word = scanner.scan(WORD) 53 | unless garbage(word) 54 | cleaned.push(word) 55 | spaced = false 56 | end 57 | elsif scanner.eos? 58 | return cleaned.join('').gsub(REPEATED, '') 59 | end 60 | end 61 | end 62 | 63 | # Is a given word OCR garbage? 64 | def garbage(w) 65 | acronym = w =~ ACRONYM 66 | 67 | # More than 30 bytes in length. 68 | (w.length > 30) || 69 | 70 | # If there are three or more identical characters in a row in the string. 
71 | (w =~ REPEAT) || 72 | 73 | # More punctuation than alpha numerics. 74 | (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) || 75 | 76 | # Ignoring the first and last characters in the string, if there are three or 77 | # more different punctuation characters in the string. 78 | (w[1...-1].scan(PUNCT).uniq.length >= 3) || 79 | 80 | # Four or more consecutive vowels, or five or more consecutive consonants. 81 | ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) || 82 | 83 | # Number of uppercase letters greater than lowercase letters, but the word is 84 | # not all uppercase + punctuation. 85 | (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) || 86 | 87 | # Single letters that are not A or I. 88 | (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) || 89 | 90 | # All characters are alphabetic and there are 8 times more vowels than 91 | # consonants, or 8 times more consonants than vowels. 92 | (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) && 93 | (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) || 94 | (cons > vows * 8))) 95 | end 96 | 97 | end 98 | 99 | end 100 | -------------------------------------------------------------------------------- /lib/docsplit/image_extractor.rb: -------------------------------------------------------------------------------- 1 | module Docsplit 2 | 3 | # Delegates to GraphicsMagick in order to convert PDF documents into 4 | # nicely sized images. 5 | class ImageExtractor 6 | 7 | MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB" 8 | DEFAULT_FORMAT = :png 9 | DEFAULT_DENSITY = '150' 10 | 11 | # Extract a list of PDFs as rasterized page images, according to the 12 | # configuration in options. 
13 | def extract(pdfs, options) 14 | @pdfs = [pdfs].flatten 15 | extract_options(options) 16 | @pdfs.each do |pdf| 17 | previous = nil 18 | @sizes.each_with_index do |size, i| 19 | @formats.each {|format| convert(pdf, size, format, previous) } 20 | previous = size if @rolling 21 | end 22 | end 23 | end 24 | 25 | # Convert a single PDF into page images at the specified size and format. 26 | # If `--rolling`, and we have a previous image at a larger size to work with, 27 | # we simply downsample that image, instead of re-rendering the entire PDF. 28 | # Now we generate one page at a time, a counterintuitive opimization 29 | # suggested by the GraphicsMagick list, that seems to work quite well. 30 | def convert(pdf, size, format, previous=nil) 31 | tempdir = Dir.mktmpdir 32 | basename = File.basename(pdf, File.extname(pdf)) 33 | directory = directory_for(size) 34 | pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s 35 | escaped_pdf = ESCAPE[pdf] 36 | FileUtils.mkdir_p(directory) unless File.exist?(directory) 37 | common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" 38 | if previous 39 | FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) 40 | result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp 41 | raise ExtractionFailed, result if $? != 0 42 | else 43 | page_list(pages).each do |page| 44 | out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] 45 | cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp 46 | result = `#{cmd}`.chomp 47 | raise ExtractionFailed, result if $? != 0 48 | end 49 | end 50 | ensure 51 | FileUtils.remove_entry_secure tempdir if File.exist?(tempdir) 52 | end 53 | 54 | 55 | private 56 | 57 | # Extract the relevant GraphicsMagick options from the options hash. 
58 | def extract_options(options) 59 | @output = options[:output] || '.' 60 | @pages = options[:pages] 61 | @density = options[:density] || DEFAULT_DENSITY 62 | @formats = [options[:format] || DEFAULT_FORMAT].flatten 63 | @sizes = [options[:size]].flatten.compact 64 | @sizes = [nil] if @sizes.empty? 65 | @rolling = !!options[:rolling] 66 | end 67 | 68 | # If there's only one size requested, generate the images directly into 69 | # the output directory. Multiple sizes each get a directory of their own. 70 | def directory_for(size) 71 | path = @sizes.length == 1 ? @output : File.join(@output, size) 72 | File.expand_path(path) 73 | end 74 | 75 | # Generate the resize argument. 76 | def resize_arg(size) 77 | size.nil? ? '' : "-resize #{size}" 78 | end 79 | 80 | # Generate the appropriate quality argument for the image format. 81 | def quality_arg(format) 82 | case format.to_s 83 | when /jpe?g/ then "-quality 85" 84 | when /png/ then "-quality 100" 85 | else "" 86 | end 87 | end 88 | 89 | # Generate the expanded list of requested page numbers. 90 | def page_list(pages) 91 | pages.split(',').map { |range| 92 | if range.include?('-') 93 | range = range.split('-') 94 | Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i } 95 | else 96 | range.to_i 97 | end 98 | }.flatten.uniq.sort 99 | end 100 | 101 | end 102 | 103 | end 104 | -------------------------------------------------------------------------------- /lib/docsplit.rb: -------------------------------------------------------------------------------- 1 | require 'tmpdir' 2 | require 'fileutils' 3 | require 'shellwords' 4 | 5 | # The Docsplit module delegates to the Java PDF extractors. 6 | module Docsplit 7 | 8 | VERSION = '0.7.6' # Keep in sync with gemspec. 
9 | 10 | ESCAPE = lambda {|x| Shellwords.shellescape(x) } 11 | 12 | ROOT = File.expand_path(File.dirname(__FILE__) + '/..') 13 | ESCAPED_ROOT = ESCAPE[ROOT] 14 | 15 | METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length] 16 | 17 | GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"] 18 | 19 | DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false} 20 | 21 | # Check for all dependencies, and note their absence. 22 | dirs = ENV['PATH'].split(File::PATH_SEPARATOR) 23 | DEPENDENCIES.each_key do |dep| 24 | dirs.each do |dir| 25 | if File.executable?(File.join(dir, dep.to_s)) 26 | DEPENDENCIES[dep] = true 27 | break 28 | end 29 | end 30 | end 31 | 32 | # if tesseract is found check for the osd plugin so that we can do orientation independent OCR. 33 | if DEPENDENCIES[:tesseract] 34 | # osd will be listed in tesseract --listlangs 35 | val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ] 36 | DEPENDENCIES[:osd] = true if val =~ /\bosd\b/ 37 | end 38 | 39 | # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise 40 | # broke. 41 | class ExtractionFailed < StandardError; end 42 | 43 | # Use the ExtractPages Java class to burst a PDF into single pages. 44 | def self.extract_pages(pdfs, opts={}) 45 | pdfs = ensure_pdfs(pdfs) 46 | PageExtractor.new.extract(pdfs, opts) 47 | end 48 | 49 | # Use the ExtractText Java class to write out all embedded text. 50 | def self.extract_text(pdfs, opts={}) 51 | pdfs = ensure_pdfs(pdfs) 52 | TextExtractor.new.extract(pdfs, opts) 53 | end 54 | 55 | # Use the ExtractImages Java class to rasterize a PDF into each page's image. 
56 | def self.extract_images(pdfs, opts={}) 57 | pdfs = ensure_pdfs(pdfs) 58 | opts[:pages] = normalize_value(opts[:pages]) if opts[:pages] 59 | ImageExtractor.new.extract(pdfs, opts) 60 | end 61 | 62 | # Use JODCConverter to extract the documents as PDFs. 63 | # If the document is in an image format, use GraphicsMagick to extract the PDF. 64 | def self.extract_pdf(docs, opts={}) 65 | PdfExtractor.new.extract(docs, opts) 66 | end 67 | 68 | # Define custom methods for each of the metadata keys that we support. 69 | # Use the ExtractInfo Java class to print out a single bit of metadata. 70 | METADATA_KEYS.each do |key| 71 | instance_eval <<-EOS 72 | def self.extract_#{key}(pdfs, opts={}) 73 | pdfs = ensure_pdfs(pdfs) 74 | InfoExtractor.new.extract(:#{key}, pdfs, opts) 75 | end 76 | EOS 77 | end 78 | 79 | def self.extract_info(pdfs, opts={}) 80 | pdfs = ensure_pdfs(pdfs) 81 | InfoExtractor.new.extract_all(pdfs, opts) 82 | end 83 | 84 | # Utility method to clean OCR'd text with garbage characters. 85 | def self.clean_text(text) 86 | TextCleaner.new.clean(text) 87 | end 88 | 89 | private 90 | 91 | # Normalize a value in an options hash for the command line. 92 | # Ranges look like: 1-10, Arrays like: 1,2,3. 93 | def self.normalize_value(value) 94 | case value 95 | when Range then value.to_a.join(',') 96 | when Array then value.map! {|v| v.is_a?(Range) ? 
normalize_value(v) : v }.join(',') 97 | else value.to_s 98 | end 99 | end 100 | 101 | end 102 | 103 | require "#{Docsplit::ROOT}/lib/docsplit/image_extractor" 104 | require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs" 105 | require "#{Docsplit::ROOT}/lib/docsplit/text_extractor" 106 | require "#{Docsplit::ROOT}/lib/docsplit/page_extractor" 107 | require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor" 108 | require "#{Docsplit::ROOT}/lib/docsplit/info_extractor" 109 | require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner" 110 | -------------------------------------------------------------------------------- /lib/docsplit/command_line.rb: -------------------------------------------------------------------------------- 1 | require 'optparse' 2 | require File.expand_path(File.dirname(__FILE__) + '/../docsplit') 3 | 4 | module Docsplit 5 | 6 | # A single command-line utility to separate a PDF into all its component parts. 7 | class CommandLine 8 | 9 | BANNER = <<-EOS 10 | docsplit breaks apart documents into images, text, or individual pages. 11 | It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter. 12 | 13 | Usage: 14 | docsplit COMMAND [OPTIONS] path/to/doc.pdf 15 | Main commands: 16 | pages, images, text, pdf. 17 | Metadata commands: 18 | author, date, creator, keywords, producer, subject, title, length. 19 | 20 | Example: 21 | docsplit images --size 700x --format jpg document.pdf 22 | 23 | Dependencies: 24 | Ruby, Java, A working GraphicsMagick (gm) command, 25 | and a headless OpenOffice server for non-PDF documents. 26 | 27 | Options: 28 | (size, pages and format can take comma-separated values) 29 | 30 | EOS 31 | 32 | # Creating a CommandLine runs off of the contents of ARGV. 33 | def initialize 34 | parse_options 35 | cmd = ARGV.shift 36 | @command = cmd && cmd.to_sym 37 | run 38 | end 39 | 40 | # Delegate to the Docsplit Ruby API to perform all extractions. 
41 | def run 42 | begin 43 | case @command 44 | when :images then Docsplit.extract_images(ARGV, @options) 45 | when :pages then Docsplit.extract_pages(ARGV, @options) 46 | when :text then Docsplit.extract_text(ARGV, @options) 47 | when :pdf then Docsplit.extract_pdf(ARGV, @options) 48 | else 49 | if METADATA_KEYS.include?(@command) 50 | value = Docsplit.send("extract_#{@command}", ARGV, @options) 51 | puts value unless value.nil? 52 | else 53 | usage 54 | end 55 | end 56 | rescue ExtractionFailed => e 57 | puts e.message.chomp 58 | exit(1) 59 | end 60 | end 61 | 62 | # Print out the usage help message. 63 | def usage 64 | puts "\n#{@option_parser}\n" 65 | exit 66 | end 67 | 68 | 69 | private 70 | 71 | # Use the OptionParser library to parse out all supported options. Return 72 | # options formatted for the Ruby API. 73 | def parse_options 74 | @options = {:ocr => :default, :clean => true} 75 | @option_parser = OptionParser.new do |opts| 76 | opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d| 77 | @options[:output] = d 78 | end 79 | opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p| 80 | @options[:pages] = p 81 | end 82 | opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s| 83 | @options[:size] = s.split(',') 84 | end 85 | opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t| 86 | @options[:format] = t.split(',') 87 | end 88 | opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d| 89 | @options[:density] = d 90 | end 91 | opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o| 92 | @options[:ocr] = o 93 | end 94 | opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c| 95 | @options[:clean] = false 96 | end 97 | opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l| 98 | @options[:language] = l 99 | end 100 | opts.on('--no-orientation-detection', 'turn off automatic 
orientation detection in tesseract') do |n| 101 | @options[:detect_orientation] = false 102 | end 103 | opts.on('-r', '--rolling', 'generate images from each previous image') do |r| 104 | @options[:rolling] = true 105 | end 106 | opts.on_tail('-v', '--version', 'display docsplit version') do 107 | puts "Docsplit version #{Docsplit::VERSION}" 108 | exit 109 | end 110 | opts.on_tail('-h', '--help', 'display this help message') do 111 | usage 112 | end 113 | end 114 | @option_parser.banner = BANNER 115 | begin 116 | @option_parser.parse!(ARGV) 117 | rescue OptionParser::InvalidOption => e 118 | puts e.message 119 | exit(1) 120 | end 121 | end 122 | 123 | end 124 | 125 | end -------------------------------------------------------------------------------- /lib/docsplit/text_extractor.rb: -------------------------------------------------------------------------------- 1 | module Docsplit 2 | 3 | # Delegates to **pdftotext** and **tesseract** in order to extract text from 4 | # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or 5 | # forbid OCR extraction, but by default the heuristic works like this: 6 | # 7 | # * Check for the presence of fonts in the PDF. If no fonts are detected, 8 | # OCR is used automatically. 9 | # * Extract the text of each page with **pdftotext**, if the page has less 10 | # than 100 bytes of text (a scanned image page, or a page that just 11 | # contains a filename and a page number), then add it to the list of 12 | # `@pages_to_ocr`. 13 | # * Re-OCR each page in the `@pages_to_ocr` list at the end. 14 | # 15 | class TextExtractor 16 | 17 | NO_TEXT_DETECTED = /---------\n\Z/ 18 | 19 | OCR_FLAGS = '-density 400x400 -colorspace GRAY' 20 | MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB' 21 | 22 | MIN_TEXT_PER_PAGE = 100 # in bytes 23 | 24 | def initialize 25 | @pages_to_ocr = [] 26 | end 27 | 28 | # Extract text from a list of PDFs. 
29 | def extract(pdfs, opts) 30 | extract_options opts 31 | FileUtils.mkdir_p @output unless File.exist?(@output) 32 | [pdfs].flatten.each do |pdf| 33 | @pdf_name = File.basename(pdf, File.extname(pdf)) 34 | pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages 35 | if @force_ocr || (!@forbid_ocr && !contains_text?(pdf)) 36 | extract_from_ocr(pdf, pages) 37 | else 38 | extract_from_pdf(pdf, pages) 39 | if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty? 40 | extract_from_ocr(pdf, @pages_to_ocr) 41 | end 42 | end 43 | end 44 | end 45 | 46 | # Does a PDF have any text embedded? 47 | def contains_text?(pdf) 48 | fonts = `pdffonts #{ESCAPE[pdf]} 2>&1` 49 | !fonts.match(NO_TEXT_DETECTED) 50 | end 51 | 52 | # Extract a page range worth of text from a PDF, directly. 53 | def extract_from_pdf(pdf, pages) 54 | return extract_full(pdf) unless pages 55 | pages.each {|page| extract_page(pdf, page) } 56 | end 57 | 58 | # Extract a page range worth of text from a PDF via OCR. 59 | def extract_from_ocr(pdf, pages) 60 | tempdir = Dir.mktmpdir 61 | base_path = File.join(@output, @pdf_name) 62 | escaped_pdf = ESCAPE[pdf] 63 | psm = @detect_orientation ? 
"-psm 1" : "" 64 | if pages 65 | pages.each do |page| 66 | tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" 67 | escaped_tiff = ESCAPE[tiff] 68 | file = "#{base_path}_#{page}" 69 | run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" 70 | run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1" 71 | clean_text(file + '.txt') if @clean_ocr 72 | FileUtils.remove_entry_secure tiff 73 | end 74 | else 75 | tiff = "#{tempdir}/#{@pdf_name}.tif" 76 | escaped_tiff = ESCAPE[tiff] 77 | run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" 78 | #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 79 | run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1" 80 | clean_text(base_path + '.txt') if @clean_ocr 81 | end 82 | ensure 83 | FileUtils.remove_entry_secure tempdir if File.exist?(tempdir) 84 | end 85 | 86 | 87 | private 88 | 89 | def clean_text(file) 90 | File.open(file, 'r+') do |f| 91 | text = f.read 92 | f.truncate(0) 93 | f.rewind 94 | f.write(Docsplit.clean_text(text)) 95 | end 96 | end 97 | 98 | # Run an external process and raise an exception if it fails. 99 | def run(command) 100 | result = `#{command}` 101 | raise ExtractionFailed, result if $? != 0 102 | result 103 | end 104 | 105 | # Run pdftotext command 106 | def run_pdftotext(pdf, text_path, options=[]) 107 | options << '-enc UTF-8' 108 | options << '-layout' if @keep_layout 109 | 110 | run "pdftotext #{options.join(' ')} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" 111 | end 112 | 113 | # Extract the full contents of a pdf as a single file, directly. 
114 | def extract_full(pdf) 115 | text_path = File.join(@output, "#{@pdf_name}.txt") 116 | run_pdftotext pdf, text_path 117 | end 118 | 119 | # Extract the contents of a single page of text, directly, adding it to 120 | # the `@pages_to_ocr` list if the text length is inadequate. 121 | def extract_page(pdf, page) 122 | text_path = File.join(@output, "#{@pdf_name}_#{page}.txt") 123 | run_pdftotext pdf, text_path, ["-f #{page}", "-l #{page}"] 124 | 125 | unless @forbid_ocr 126 | @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE 127 | end 128 | end 129 | 130 | def extract_options(options) 131 | @output = options[:output] || '.' 132 | @pages = options[:pages] 133 | @force_ocr = options[:ocr] == true 134 | @forbid_ocr = options[:ocr] == false 135 | @language = options[:language] || 'eng' 136 | @clean_ocr = (!(options[:clean] == false) and @language == 'eng') 137 | @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd]) 138 | @keep_layout = options.fetch(:layout, false) 139 | end 140 | 141 | end 142 | 143 | end 144 | -------------------------------------------------------------------------------- /lib/docsplit/pdf_extractor.rb: -------------------------------------------------------------------------------- 1 | require 'rbconfig' 2 | 3 | module Docsplit 4 | class PdfExtractor 5 | @@executable = nil 6 | @@version_string = nil 7 | 8 | # Provide a set of helper functions to determine the OS. 9 | HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os'] 10 | def windows? 11 | !!HOST_OS.match(/mswin|windows|cygwin/i) 12 | end 13 | def osx? 14 | !!HOST_OS.match(/darwin/i) 15 | end 16 | def linux? 17 | !!HOST_OS.match(/linux/i) 18 | end 19 | 20 | # The first line of the help output holds the name and version number 21 | # of the office software to be used for extraction. 22 | def version_string 23 | unless @@version_string 24 | null = windows? ? 
"NUL" : "/dev/null" 25 | @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first 26 | if !!@@version_string.to_s.match(/[0-9]*/) 27 | @@version_string = `#{office_executable} --version`.split("\n").first 28 | end 29 | end 30 | @@version_string 31 | end 32 | def libre_office? 33 | !!version_string.match(/^LibreOffice/) 34 | end 35 | def open_office? 36 | !!version_string.match(/^OpenOffice.org/) 37 | end 38 | 39 | # A set of default locations to search for office software 40 | # These have been extracted from JODConverter. Each listed 41 | # path should contain a directory "program" which in turn 42 | # contains the "soffice" executable. 43 | # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91 44 | def office_search_paths 45 | if windows? 46 | office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"] 47 | program_files_path = ENV["CommonProgramFiles"] 48 | search_paths = office_names.map{ |program| File.join(program_files_path, program) } 49 | elsif osx? 50 | search_paths = %w( 51 | /Applications/LibreOffice.app/Contents 52 | /Applications/OpenOffice.org.app/Contents 53 | ) 54 | else # probably linux/unix 55 | # heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice 56 | search_paths = %w( 57 | /usr/lib/libreoffice 58 | /usr/lib64/libreoffice 59 | /opt/libreoffice 60 | /usr/lib/openoffice 61 | /usr/lib64/openoffice 62 | /opt/openoffice.org3 63 | /app/vendor/libreoffice 64 | /usr/bin/libreoffice 65 | /usr/local/bin 66 | /usr/lib64/libreoffice 67 | /usr/lib64/openoffice.org3 68 | ) 69 | end 70 | search_paths 71 | end 72 | 73 | # Identify the path to a working office executable. 
74 | def office_executable 75 | paths = office_search_paths 76 | 77 | # If an OFFICE_PATH has been specified on the commandline 78 | # raise an error if that path isn't valid, otherwise, add 79 | # it to the front of our search paths. 80 | if ENV['OFFICE_PATH'] 81 | raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH'] 82 | paths.unshift(ENV['OFFICE_PATH']) 83 | end 84 | 85 | # The location of the office executable is OS dependent 86 | path_pieces = ["soffice"] 87 | if windows? 88 | path_pieces += [["program", "soffice.bin"]] 89 | elsif osx? 90 | path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]] 91 | else 92 | path_pieces += [["program", "soffice"]] 93 | end 94 | 95 | # Search for the first suitable office executable 96 | # and short circuit an executable is found. 97 | paths.each do |path| 98 | if File.exist? path 99 | @@executable ||= path unless File.directory? path 100 | path_pieces.each do |pieces| 101 | check_path = File.join(path, pieces) 102 | @@executable ||= check_path if File.exist? check_path 103 | end 104 | end 105 | break if @@executable 106 | end 107 | raise OfficeNotFound, "No office software found" unless @@executable 108 | @@executable 109 | end 110 | 111 | # Used to specify the office location for JODConverter 112 | def office_path 113 | File.dirname(File.dirname(office_executable)) 114 | end 115 | 116 | # Convert documents to PDF. 117 | def extract(docs, opts) 118 | out = opts[:output] || '.' 119 | FileUtils.mkdir_p out unless File.exist?(out) 120 | [docs].flatten.each do |doc| 121 | ext = File.extname(doc) 122 | basename = File.basename(doc, ext) 123 | escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE) 124 | 125 | if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0]) 126 | `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf` 127 | else 128 | if libre_office? 
129 | # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other. 130 | ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}" 131 | 132 | options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}" 133 | cmd = "#{office_executable} #{options} 2>&1" 134 | result = `#{cmd}`.chomp 135 | raise ExtractionFailed, result if $? != 0 136 | true 137 | else # open office presumably, rely on JODConverter to figure it out. 138 | options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js" 139 | run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {} 140 | end 141 | end 142 | end 143 | end 144 | 145 | CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'" 146 | 147 | LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties" 148 | 149 | HEADLESS = "-Djava.awt.headless=true" 150 | 151 | private 152 | 153 | # Runs a Java command, with quieted logging, and the classpath set properly. 154 | def run_jod(command, pdfs, opts, return_output=false) 155 | 156 | pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ') 157 | office = osx? ? "-Doffice.home=#{office_path}" : office_path 158 | cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1" 159 | result = `#{cmd}`.chomp 160 | raise ExtractionFailed, result if $? != 0 161 | return return_output ? (result.empty? ? 
nil : result) : true 162 | end 163 | 164 | class OfficeNotFound < StandardError; end 165 | end 166 | end 167 | -------------------------------------------------------------------------------- /test/fixtures/with_pdf_extension/actually_an_rtf.pdf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\uc1 \deff0\deflang1033\deflangfe1033{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;} 2 | {\f357\froman\fcharset238\fprq2 Times New Roman CE;}{\f358\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f360\froman\fcharset161\fprq2 Times New Roman Greek;}{\f361\froman\fcharset162\fprq2 Times New Roman Tur;} 3 | {\f362\froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f363\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f364\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f365\fswiss\fcharset238\fprq2 Arial CE;} 4 | {\f366\fswiss\fcharset204\fprq2 Arial Cyr;}{\f368\fswiss\fcharset161\fprq2 Arial Greek;}{\f369\fswiss\fcharset162\fprq2 Arial Tur;}{\f370\fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f371\fswiss\fcharset178\fprq2 Arial (Arabic);} 5 | {\f372\fswiss\fcharset186\fprq2 Arial Baltic;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128; 6 | \red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 7 | \f1\fs28\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \snext0 Normal;}{\*\cs10 \additive Default Paragraph Font;}{\s15\ql \li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 8 | 
\fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext15 Normal (Web);}{\*\cs16 \additive \i \sbasedon10 Emphasis;}{\*\cs17 \additive \ul\cf2 \sbasedon10 Hyperlink;}{\*\cs18 \additive \b \sbasedon10 Strong;}}{\info 9 | {\title The USA - how\'92s Obama doing}{\author foster}{\operator foster}{\creatim\yr2009\mo10\dy7\hr9\min50}{\revtim\yr2009\mo10\dy7\hr9\min52}{\version1}{\edmins1}{\nofpages1}{\nofwords0}{\nofchars0}{\*\company home}{\nofcharsws0}{\vern8247}} 10 | \paperw11906\paperh16838 \widowctrl\ftnbj\aenddoc\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1800\dgvorigin1440\dghshow1\dgvshow1 11 | \jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule \fet0\sectd \linex0\headery708\footery708\colsx708\endnhere\sectlinegrid360\sectdefaultcl 12 | {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta )}}{\*\pnseclvl5 13 | \pndec\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang 14 | {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}\pard\plain \s15\qc \li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 15 | \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\cs16\f1\fs20 The USA -\~ how\rquote s Obama doing?}{\i\f1\fs20 \line }{\cs16\f1\fs20 ODD Circle\~\~ Tue\~ 28th Apl 09\~\~ at The Blue Mugge Pub}{\i\f1\fs20 16 | \par }\pard \s15\ql 
\li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\cs16\f1\fs20 Notes using Obama\rquote s book \~The Audacity of Hope\~ and articles from }{\field\fldedit{\*\fldinst {\cs16\f1\fs20 17 | HYPERLINK "http://www.opendemocracy.net/" }}{\fldrslt {\cs17\i\f1\fs20\ul\cf2 www.opendemocracy.net}}}{\cs16\f1\fs20 }{\i\f1\fs20 \line \line }{\cs16\f1\fs20 1.\~ Hope\rquote is the word, from his book 18 | and from his speeches which carries infectious resonance.\~\~ Yet, already\~ Naomi Klein writing in The Nation this month has sign-posted \lquote hopebroken and hopesick\rquote \'85\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~ 19 | What are our hopes, reservations and fears?}{\i\f1\fs20 20 | \par }{\cs16\f1\fs20 2.\~ Geoffrey Hodgson,\~ director of Reuters\rquote Foundation Programme, Oxford}{\i\f1\fs20 \line }{\cs16\f1\fs20 \'93 21 | No American president has started with more personal ability or more sheer good-will from around the world than Barack Obama\'85\'94\~\~ }{\i\f1\fs20 \line }{\cs16\f1\fs20 At the recent G20\~ \'93Obama spoke of a new, more subtle, m 22 | ore truthful style of leadership\'94.\~\~ }{\i\f1\fs20 23 | \par }{\cs16\f1\fs20 3.\~ We\rquote ll seek to list actions, plans and policies outlined since inauguration and assess merits.}{\i\f1\fs20 24 | \par }{\cs16\f1\fs20 4.\~\~ Seeing Obama from another nation\rquote s perspective:\~\~\~ Daniel Lichanian on\~ France\rquote s Obama fixation:\~ \'93During the past eight years the French thought of their homeland as far superior \'85.\~ 25 | Now they celebrate the USA for\'85 an elevated politics that many fear is unattainable in France\'94.\~\~\~ }{\i\f1\fs20 \line \line }{\cs16\f1\fs20 5.\~ OpenDemocracy\~ sought views from their authors from around the world on these thre 26 | e questions regarding Obama:\~ a)\~\~ one thing you hope for\~\~\~\~ b)\~ one thing you fear\~\~ c) one piece of advice.\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~ 
Here\rquote s a summary:\~\~\~ }{\i\f1\fs20 27 | \par }{\cs16\f1\fs20 From a }{\cs18\b\i\f1\fs20 British}{\cs16\f1\fs20 professor:\~\~\~ hope;\~\~ \lquote take immediate and sustained action on climate change\rquote ;\~\~ fear\'85\~\~\~ \lquote unable to break free of past policy on Israel and Afghanistan 28 | \rquote ;\~\~ advice\'85 \lquote play it long\rquote }{\i\f1\fs20 29 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 African}{\cs16\f1\fs20 Foundation for Development:\~\~\~ hope\'85\~ \lquote at last, a formal apology for slavery and dispossession of native Americans \endash the two original sins of the Republic\rquote ;\~\~\~ 30 | fear\'85\~ \lquote business as usual\rquote ;\~\~\~ advice\'85\~ \lquote Trust your instincts\'85your appeal for change\rquote .}{\i\f1\fs20 31 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Germany}{\cs16\f1\fs20 :\~\~ hope\'85\~ \lquote the USA actively engages in conflict resolution, starting in the middle-east.\rquote \~\~\~\~\~ fear\'85\~ \lquote protectionist tendencies\'85 US spe 32 | cial interest groups and other international players\'85\rquote deflecting, de-railing Obama.\~\~ advice:\~\~ \lquote Pursue global policies in the most inclusive way\rquote .}{\i\f1\fs20 33 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Egyp}{\cs16\f1\fs20 t:\~\~ hope\~ \lquote steer a course\'85 energetic and ambitious \'85 but not aggressive;\~\~ fear:\~ \lquote could yield to the \lquote wounded lion\rquote impulse in US politics;\~ advice:\~ 34 | \lquote be yourself\rquote .}{\i\f1\fs20 35 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Japan}{\cs16\f1\fs20 :\~\~ hope:\~\~ \lquote the world is in fact round\'85 there are people living beyond America\rquote s horizons\rquote ;\~\~ fear:\~ \lquote Obama\rquote 36 | s America regains confidence in the wrong way\rquote ;\~ advice:}{\i\f1\fs20 \line }{\cs16\f1\fs20 \lquote he remains true to his acceptance speech\~\~ \'93I\rquote ll always be honest with you\'94.}{\i\f1\fs20 37 | \par 
}{\cs16\f1\fs20 5.\~ A quote from The Audacity of Hope:\~ at the end of the }{\cs16\f1\fs20\ul Politics}{\cs16\f1\fs20 chapter:}{\i\f1\fs20 38 | \par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f1\fs28\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\cs16\fs20 \~\~ \'93\'85 in a democracy, the most important office is the office of citizen\'94.}{ 39 | \par }} -------------------------------------------------------------------------------- /test/fixtures/obama_hopes.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\uc1 \deff0\deflang1033\deflangfe1033{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;} 2 | {\f357\froman\fcharset238\fprq2 Times New Roman CE;}{\f358\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f360\froman\fcharset161\fprq2 Times New Roman Greek;}{\f361\froman\fcharset162\fprq2 Times New Roman Tur;} 3 | {\f362\froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f363\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f364\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f365\fswiss\fcharset238\fprq2 Arial CE;} 4 | {\f366\fswiss\fcharset204\fprq2 Arial Cyr;}{\f368\fswiss\fcharset161\fprq2 Arial Greek;}{\f369\fswiss\fcharset162\fprq2 Arial Tur;}{\f370\fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f371\fswiss\fcharset178\fprq2 Arial (Arabic);} 5 | {\f372\fswiss\fcharset186\fprq2 Arial Baltic;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128; 6 | \red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql 
\li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 7 | \f1\fs28\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \snext0 Normal;}{\*\cs10 \additive Default Paragraph Font;}{\s15\ql \li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 8 | \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext15 Normal (Web);}{\*\cs16 \additive \i \sbasedon10 Emphasis;}{\*\cs17 \additive \ul\cf2 \sbasedon10 Hyperlink;}{\*\cs18 \additive \b \sbasedon10 Strong;}}{\info 9 | {\title The USA - how\'92s Obama doing}{\author foster}{\operator foster}{\creatim\yr2009\mo10\dy7\hr9\min50}{\revtim\yr2009\mo10\dy7\hr9\min52}{\version1}{\edmins1}{\nofpages1}{\nofwords0}{\nofchars0}{\*\company home}{\nofcharsws0}{\vern8247}} 10 | \paperw11906\paperh16838 \widowctrl\ftnbj\aenddoc\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1800\dgvorigin1440\dghshow1\dgvshow1 11 | \jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule \fet0\sectd \linex0\headery708\footery708\colsx708\endnhere\sectlinegrid360\sectdefaultcl 12 | {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta )}}{\*\pnseclvl5 13 | \pndec\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang 14 | {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}\pard\plain \s15\qc \li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 15 | 
\fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\cs16\f1\fs20 The USA -\~ how\rquote s Obama doing?}{\i\f1\fs20 \line }{\cs16\f1\fs20 ODD Circle\~\~ Tue\~ 28th Apl 09\~\~ at The Blue Mugge Pub}{\i\f1\fs20 16 | \par }\pard \s15\ql \li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\cs16\f1\fs20 Notes using Obama\rquote s book \~The Audacity of Hope\~ and articles from }{\field\fldedit{\*\fldinst {\cs16\f1\fs20 17 | HYPERLINK "http://www.opendemocracy.net/" }}{\fldrslt {\cs17\i\f1\fs20\ul\cf2 www.opendemocracy.net}}}{\cs16\f1\fs20 }{\i\f1\fs20 \line \line }{\cs16\f1\fs20 1.\~ Hope\rquote is the word, from his book 18 | and from his speeches which carries infectious resonance.\~\~ Yet, already\~ Naomi Klein writing in The Nation this month has sign-posted \lquote hopebroken and hopesick\rquote \'85\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~ 19 | What are our hopes, reservations and fears?}{\i\f1\fs20 20 | \par }{\cs16\f1\fs20 2.\~ Geoffrey Hodgson,\~ director of Reuters\rquote Foundation Programme, Oxford}{\i\f1\fs20 \line }{\cs16\f1\fs20 \'93 21 | No American president has started with more personal ability or more sheer good-will from around the world than Barack Obama\'85\'94\~\~ }{\i\f1\fs20 \line }{\cs16\f1\fs20 At the recent G20\~ \'93Obama spoke of a new, more subtle, m 22 | ore truthful style of leadership\'94.\~\~ }{\i\f1\fs20 23 | \par }{\cs16\f1\fs20 3.\~ We\rquote ll seek to list actions, plans and policies outlined since inauguration and assess merits.}{\i\f1\fs20 24 | \par }{\cs16\f1\fs20 4.\~\~ Seeing Obama from another nation\rquote s perspective:\~\~\~ Daniel Lichanian on\~ France\rquote s Obama fixation:\~ \'93During the past eight years the French thought of their homeland as far superior \'85.\~ 25 | Now they celebrate the USA for\'85 an elevated politics that many fear is unattainable in France\'94.\~\~\~ }{\i\f1\fs20 \line \line }{\cs16\f1\fs20 5.\~ OpenDemocracy\~ sought views from 
their authors from around the world on these thre 26 | e questions regarding Obama:\~ a)\~\~ one thing you hope for\~\~\~\~ b)\~ one thing you fear\~\~ c) one piece of advice.\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~ Here\rquote s a summary:\~\~\~ }{\i\f1\fs20 27 | \par }{\cs16\f1\fs20 From a }{\cs18\b\i\f1\fs20 British}{\cs16\f1\fs20 professor:\~\~\~ hope;\~\~ \lquote take immediate and sustained action on climate change\rquote ;\~\~ fear\'85\~\~\~ \lquote unable to break free of past policy on Israel and Afghanistan 28 | \rquote ;\~\~ advice\'85 \lquote play it long\rquote }{\i\f1\fs20 29 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 African}{\cs16\f1\fs20 Foundation for Development:\~\~\~ hope\'85\~ \lquote at last, a formal apology for slavery and dispossession of native Americans \endash the two original sins of the Republic\rquote ;\~\~\~ 30 | fear\'85\~ \lquote business as usual\rquote ;\~\~\~ advice\'85\~ \lquote Trust your instincts\'85your appeal for change\rquote .}{\i\f1\fs20 31 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Germany}{\cs16\f1\fs20 :\~\~ hope\'85\~ \lquote the USA actively engages in conflict resolution, starting in the middle-east.\rquote \~\~\~\~\~ fear\'85\~ \lquote protectionist tendencies\'85 US spe 32 | cial interest groups and other international players\'85\rquote deflecting, de-railing Obama.\~\~ advice:\~\~ \lquote Pursue global policies in the most inclusive way\rquote .}{\i\f1\fs20 33 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Egyp}{\cs16\f1\fs20 t:\~\~ hope\~ \lquote steer a course\'85 energetic and ambitious \'85 but not aggressive;\~\~ fear:\~ \lquote could yield to the \lquote wounded lion\rquote impulse in US politics;\~ advice:\~ 34 | \lquote be yourself\rquote .}{\i\f1\fs20 35 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Japan}{\cs16\f1\fs20 :\~\~ hope:\~\~ \lquote the world is in fact round\'85 there are people living beyond America\rquote s horizons\rquote ;\~\~ fear:\~ \lquote 
Obama\rquote 36 | s America regains confidence in the wrong way\rquote ;\~ advice:}{\i\f1\fs20 \line }{\cs16\f1\fs20 \lquote he remains true to his acceptance speech\~\~ \'93I\rquote ll always be honest with you\'94.}{\i\f1\fs20 37 | \par }{\cs16\f1\fs20 5.\~ A quote from The Audacity of Hope:\~ at the end of the }{\cs16\f1\fs20\ul Politics}{\cs16\f1\fs20 chapter:}{\i\f1\fs20 38 | \par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f1\fs28\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\cs16\fs20 \~\~ \'93\'85 in a democracy, the most important office is the office of citizen\'94.}{ 39 | \par }} -------------------------------------------------------------------------------- /vendor/conf/document-formats.js: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "Portable Document Format", 4 | "extension": "pdf", 5 | "mediaType": "application/pdf", 6 | "storePropertiesByFamily": { 7 | "DRAWING": {"FilterName": "draw_pdf_Export"}, 8 | "SPREADSHEET": {"FilterName": "calc_pdf_Export"}, 9 | "PRESENTATION": {"FilterName": "impress_pdf_Export"}, 10 | "TEXT": {"FilterName": "writer_pdf_Export"} 11 | } 12 | }, 13 | { 14 | "name": "Macromedia Flash", 15 | "extension": "swf", 16 | "mediaType": "application/x-shockwave-flash", 17 | "storePropertiesByFamily": { 18 | "DRAWING": {"FilterName": "draw_flash_Export"}, 19 | "PRESENTATION": {"FilterName": "impress_flash_Export"} 20 | } 21 | }, 22 | { 23 | "name": "HTML", 24 | "extension": "html", 25 | "mediaType": "text/html", 26 | "inputFamily": "TEXT", 27 | "storePropertiesByFamily": { 28 | "SPREADSHEET": {"FilterName": "HTML (StarCalc)"}, 29 | "PRESENTATION": {"FilterName": "impress_html_Export"}, 30 | "TEXT": {"FilterName": "HTML (StarWriter)"} 31 | } 32 | }, 33 | { 34 | "name": "OpenDocument Text", 35 | "extension": "odt", 36 | "mediaType": "application/vnd.oasis.opendocument.text", 37 | "inputFamily": "TEXT", 38 | 
"storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}} 39 | }, 40 | { 41 | "name": "OpenOffice.org 1.0 Text Document", 42 | "extension": "sxw", 43 | "mediaType": "application/vnd.sun.xml.writer", 44 | "inputFamily": "TEXT", 45 | "storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}} 46 | }, 47 | { 48 | "name": "Microsoft Word", 49 | "extension": "doc", 50 | "mediaType": "application/msword", 51 | "inputFamily": "TEXT", 52 | "storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}} 53 | }, 54 | { 55 | "name": "Microsoft Word 2007 XML", 56 | "extension": "docx", 57 | "mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 58 | "inputFamily": "TEXT" 59 | }, 60 | { 61 | "name": "Rich Text Format", 62 | "extension": "rtf", 63 | "mediaType": "text/rtf", 64 | "inputFamily": "TEXT", 65 | "storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}} 66 | }, 67 | { 68 | "name": "WordPerfect", 69 | "extension": "wpd", 70 | "mediaType": "application/wordperfect", 71 | "inputFamily": "TEXT" 72 | }, 73 | { 74 | "name": "Plain Text", 75 | "extension": "txt", 76 | "mediaType": "text/plain", 77 | "inputFamily": "TEXT", 78 | "loadProperties": { 79 | "FilterName": "Text (encoded)", 80 | "FilterOptions": "utf8" 81 | }, 82 | "storePropertiesByFamily": {"TEXT": { 83 | "FilterName": "Text (encoded)", 84 | "FilterOptions": "utf8" 85 | }} 86 | }, 87 | { 88 | "name": "MediaWiki wikitext", 89 | "extension": "wiki", 90 | "mediaType": "text/x-wiki", 91 | "storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}} 92 | }, 93 | { 94 | "name": "OpenDocument Spreadsheet", 95 | "extension": "ods", 96 | "mediaType": "application/vnd.oasis.opendocument.spreadsheet", 97 | "inputFamily": "SPREADSHEET", 98 | "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}} 99 | }, 100 | { 101 | "name": "OpenOffice.org 1.0 Spreadsheet", 102 | "extension": "sxc", 103 | "mediaType": "application/vnd.sun.xml.calc", 104 
| "inputFamily": "SPREADSHEET", 105 | "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}} 106 | }, 107 | { 108 | "name": "Microsoft Excel", 109 | "extension": "xls", 110 | "mediaType": "application/vnd.ms-excel", 111 | "inputFamily": "SPREADSHEET", 112 | "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}} 113 | }, 114 | { 115 | "name": "Microsoft Excel 2007 XML", 116 | "extension": "xlsx", 117 | "mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 118 | "inputFamily": "SPREADSHEET" 119 | }, 120 | { 121 | "name": "Comma Separated Values", 122 | "extension": "csv", 123 | "mediaType": "text/csv", 124 | "inputFamily": "SPREADSHEET", 125 | "loadProperties": { 126 | "FilterName": "Text - txt - csv (StarCalc)", 127 | "FilterOptions": "44,34,0" 128 | }, 129 | "storePropertiesByFamily": {"SPREADSHEET": { 130 | "FilterName": "Text - txt - csv (StarCalc)", 131 | "FilterOptions": "44,34,0" 132 | }} 133 | }, 134 | { 135 | "name": "Tab Separated Values", 136 | "extension": "tsv", 137 | "mediaType": "text/tab-separated-values", 138 | "inputFamily": "SPREADSHEET", 139 | "loadProperties": { 140 | "FilterName": "Text - txt - csv (StarCalc)", 141 | "FilterOptions": "9,34,0" 142 | }, 143 | "storePropertiesByFamily": {"SPREADSHEET": { 144 | "FilterName": "Text - txt - csv (StarCalc)", 145 | "FilterOptions": "9,34,0" 146 | }} 147 | }, 148 | { 149 | "name": "OpenDocument Presentation", 150 | "extension": "odp", 151 | "mediaType": "application/vnd.oasis.opendocument.presentation", 152 | "inputFamily": "PRESENTATION", 153 | "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}} 154 | }, 155 | { 156 | "name": "OpenOffice.org 1.0 Presentation", 157 | "extension": "sxi", 158 | "mediaType": "application/vnd.sun.xml.impress", 159 | "inputFamily": "PRESENTATION", 160 | "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}} 161 | }, 162 | { 163 | "name": "Microsoft 
PowerPoint", 164 | "extension": "ppt", 165 | "mediaType": "application/vnd.ms-powerpoint", 166 | "inputFamily": "PRESENTATION", 167 | "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}} 168 | }, 169 | { 170 | "name": "Microsoft PowerPoint 2007 XML", 171 | "extension": "pptx", 172 | "mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation", 173 | "inputFamily": "PRESENTATION" 174 | }, 175 | { 176 | "name": "OpenDocument Drawing", 177 | "extension": "odg", 178 | "mediaType": "application/vnd.oasis.opendocument.graphics", 179 | "inputFamily": "DRAWING", 180 | "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}} 181 | }, 182 | { 183 | "name": "Scalable Vector Graphics", 184 | "extension": "svg", 185 | "mediaType": "image/svg+xml", 186 | "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}} 187 | }, 188 | { 189 | "name": "Portable Network Graphic", 190 | "extension": "png", 191 | "mediaType": "image/png", 192 | "storePropertiesByFamily": { 193 | "DRAWING": {"FilterName": "draw_png_Export"}, 194 | "PRESENTATION": {"FilterName": "impress_png_Export"} 195 | } 196 | }, 197 | { 198 | "name": "Graphics Interchange Format", 199 | "extension": "gif", 200 | "mediaType": "image/gif", 201 | "storePropertiesByFamily": { 202 | "DRAWING": {"FilterName": "draw_gif_Export"}, 203 | "PRESENTATION": {"FilterName": "impress_gif_Export"} 204 | } 205 | }, 206 | { 207 | "name": "Joint Photographic Experts Group", 208 | "extension": "jpg", 209 | "mediaType": "image/jpeg", 210 | "storePropertiesByFamily": { 211 | "DRAWING": {"FilterName": "draw_jpg_Export"}, 212 | "PRESENTATION": {"FilterName": "impress_jpg_Export"} 213 | } 214 | }, 215 | { 216 | "name": "Windows Bitmap", 217 | "extension": "bmp", 218 | "mediaType": "image/bmp", 219 | "storePropertiesByFamily": { 220 | "DRAWING": {"FilterName": "draw_bmp_Export"}, 221 | "PRESENTATION": {"FilterName": "impress_bmp_Export"} 222 | } 223 | }, 224 | { 225 | 
"name": "Tagged Image File Format", 226 | "extension": "tif", 227 | "mediaType": "image/tiff", 228 | "storePropertiesByFamily": { 229 | "DRAWING": {"FilterName": "draw_tif_Export"}, 230 | "PRESENTATION": {"FilterName": "impress_tif_Export"} 231 | } 232 | } 233 | ] 234 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Doc⚡split 7 | 85 | 86 | 87 | 88 |
89 | 90 |

Docsplit

91 | 92 |

93 | Docsplit 94 | is a command-line utility and Ruby library for splitting apart 95 | documents into their component parts: searchable UTF-8 plain text 96 | via OCR if necessary, page images or thumbnails in any format, 97 | PDFs, single pages, and document metadata 98 | (title, author, number of pages...) 99 |

100 | 101 |

Docsplit is currently at version 0.7.6.

102 | 103 |

104 | Docsplit is an open-source component of DocumentCloud. 105 |

106 | 107 |

108 | Installation & Dependencies | 109 | Usage | 110 | Internals | 111 | Change Log 112 |

113 | 114 |

Installation & Dependencies

115 | 116 |
    117 |
  1. 118 | Grab the gem:
    119 | gem install docsplit 120 |
  2. 121 |
  3. 122 | Install GraphicsMagick. 123 | Its ‘gm’ command is used to generate images.
    124 | Either compile it from 125 | source, 126 | or use a package manager: 127 |
    128 | [aptitude | port | brew] install graphicsmagick
    129 |
  4. 130 |
  5. 131 | Install Poppler. 132 | On Linux, use aptitude, apt-get or yum:
    133 | aptitude install poppler-utils poppler-data
    134 | On the Mac, you can install from source or use MacPorts:
    135 | sudo port install poppler | brew install poppler
    136 |
  6. 137 |
  7. 138 | (Optional) Install Ghostscript:
    139 | [aptitude | port | brew] install ghostscript
140 | Ghostscript is required to convert PDF and PostScript files. 141 |
  8. 142 |
  9. 143 | (Optional) Install Tesseract:
    144 | [aptitude | port | brew] install [tesseract | tesseract-ocr]
    145 | Without Tesseract installed, you'll still be able to extract text from 146 | documents, but you won't be able to automatically OCR them. 147 |
  10. 148 |
  11. 149 | (Optional) Install pdftk. 150 | On Linux, use aptitude, apt-get or yum:
    151 | aptitude install pdftk
    152 | On the Mac, you can download a recent installer for the binary. 153 | Without pdftk installed, you can use Docsplit, but won't be able 154 | to split apart a multi-page PDF into single-page PDFs. 155 |
  12. 156 |
  13. 157 | (Optional) Install LibreOffice. 158 | On Linux, use aptitude, apt-get or yum:
    159 | aptitude install libreoffice
    160 | On the Mac, download and install the latest release. 161 |
  14. 162 |
  15. 163 | (Optional) Install fonts to process documents that use Chinese, Japanese, and Korean Fonts. 164 | On Linux, use aptitude, apt-get or yum:
    165 | aptitude install ttf-wqy-microhei ttf-wqy-zenhei ttf-kochi-gothic ttf-kochi-mincho fonts-nanum
    166 | On the Mac, the fonts should already be present. However you can always download the TTF files and install them using Font Book. 167 |
  16. 168 |
169 | 170 |

171 | Note: the gem will take a minute to download — the 172 | JODConverter jar file tips the scales at 2MB. 173 |

174 | 175 |

Usage

176 | 177 |

178 | The Docsplit gem includes both the docsplit command-line utility 179 | and a Ruby API. The available commands and options are identical in both.
180 | --output or -o can be passed to any command in order to 181 | store the generated files in a directory of your choosing. 182 |

183 | 184 |

185 | images--size --format --pages --density 186 | Ruby: extract_images 187 |
188 | Generates an image for each page in the document at the specified resolution 189 | and format. Pass --pages or -p to choose the specific pages to 190 | image. Passing
--size or -s will specify the desired 191 | image resolution, --density or -d will specify the DPI to rasterize the images 192 | at during conversion by GraphicsMagick, and --format or -f 193 | will select the format of the final images. 194 |

195 |
196 | docsplit images example.pdf
197 | docsplit images docs/*.pdf --size 700x,50x50 --format gif --pages 3,10-15,42
198 |
199 | Docsplit.extract_images('example.doc', :size => '1000x', :format => [:png, :jpg])
200 | 201 |

202 | text--pages --ocr --no-ocr --no-clean --language --no-orientation-detection 203 | Ruby: extract_text 204 |
205 | Extract the complete UTF-8-encoded plain text of a document to a 206 | single file. If you'd like to extract the text for each page separately, 207 | pass --pages all. You can use the --ocr and --no-ocr 208 | flags to force OCR, or disable it, respectively. By default (if Tesseract is installed) 209 | Docsplit will OCR the text of each page for which it fails to extract text 210 | directly from the document. Docsplit will also attempt to clean up garbage 211 | characters in the OCR'd text — to disable this, pass the 212 | --no-clean flag. 213 |

214 |

215 | By default Tesseract ships only with English extraction data. 216 | If 217 | any additional language models are installed, you can select one using 218 | the --language flag. 219 | 220 | If 221 | Tesseract's orientation detection model is installed, Docsplit will automatically use it 222 | unless you specify not to with the --no-orientation-detection flag. 223 |

224 |
225 | docsplit text path/to/doc.pdf --pages all --language deu
226 |
227 | docs = Dir['storage/originals/*.doc']
228 | Docsplit.extract_text(docs, :ocr => false, :output => 'storage/text')
229 | 230 |

231 | pages--pages 232 | Ruby: extract_pages 233 |
234 | Burst apart a document into single-page PDFs. Use --pages to 235 | specify the individual pages (or ranges of pages) you'd like to generate. 236 |

237 |
238 | docsplit pages path/to/doc.pdf --pages 1-10
239 |
240 | Docsplit.extract_pages('path/to/presentation.ppt')
241 | Docsplit.extract_pages('doc.pdf', :pages => 1..10)
242 | 243 |

244 | pdf 245 | Ruby: extract_pdf 246 |
247 | Convert documents into PDFs. Any type of document that LibreOffice can read 248 | may be converted. These include the Microsoft Office formats: doc, docx, ppt, 249 | xls and so on, as well as html, odf, rtf, swf, svg, and wpd. 250 | The first time that you convert a new file type, LibreOffice will lazy-load 251 | the code that processes it — subsequent conversions will be much faster. 252 |

253 |
254 | docsplit pdf documentation/*.html
255 |
256 | Docsplit.extract_pdf('expense_report.xls')
257 | 258 |

259 | author, date, creator, keywords, producer, subject, title, length
260 | Ruby: extract_... 261 |
262 | Retrieve a piece of metadata about the document. The docsplit 263 | utility will print to stdout; the Ruby API will return the value. 264 |

265 |
266 | docsplit title path/to/stooges.pdf
267 | => Disorder in the Court
268 |
269 | Docsplit.extract_length('path/to/stooges.pdf')
270 | => 36
271 | 272 | 273 |

Internals

274 | 275 |

276 | Under the hood, Docsplit is a thin wrapper around the excellent 277 | GraphicsMagick, 278 | Poppler, 279 | PDFTK, 280 | Tesseract, and 281 | LibreOffice libraries. 282 | Poppler is used to extract text and metadata from PDF documents, 283 | PDFTK is used to split them apart into pages, and GraphicsMagick is used to generate 284 | the page images (internally, it's rendering them with 285 | GhostScript). 286 | LibreOffice and GraphicsMagick convert documents and images to PDF. 287 | Tesseract provides the transparent OCR fallback support, if the document 288 | is a simple scan, and the file doesn't contain any embedded text. 289 |

290 | 291 |

292 | Because documents need to be in PDF format before any metadata, text, 293 | or images are extracted, it's faster to use docsplit pdf 294 | to convert it up front, if you're planning to run more than one extraction. 295 | Otherwise Docsplit will write out the PDF version to a temporary file before 296 | proceeding with each command. 297 |

298 | 299 |

Change Log

300 | 301 |

302 | 0.7.6 – Nov. 16, 2014
303 | Docsplit will now automatically use Tesseract's orientation detection model 304 | if it is installed. 305 |

306 | 307 |

308 | 0.7.5 – May 28, 2014
309 | Docsplit will detect PDFs regardless of extension using magic number-based 310 | detection. 311 |

312 | 313 |

314 | 0.7.2 – Feb. 23, 2013
315 | Bug fixes for LibreOffice support. 316 |

317 | 318 |

319 | 0.7.0 – Feb. 23, 2013
320 | Docsplit now expresses a preference for LibreOffice over OpenOffice, with 321 | an eye to removing JODConverter and OpenOffice support in future versions 322 | (direct LibreOffice support is substantially faster than JODConverter). 323 | Improved unicode support now correctly collects non-ascii characters from 324 | pdfinfo. 325 |

326 | 327 |

328 | 0.6.4 – Nov. 12, 2012
329 | Added a language flag for the Docsplit commandline, fixed several bugs, 330 | and began preparations for the deprecation of pdftk. 331 |

332 | 333 |

334 | 0.6.2 – Nov. 22, 2011
335 | Bugfix to escape document names during file type detection. 336 |

337 | 338 |

339 | 0.6.1 – Nov. 18, 2011
340 | Docsplit now supports converting documents using LibreOffice 341 | as well as OpenOffice, through JODConverter 3.0 beta4. 342 |

343 | 344 |

345 | 0.6.0 – Sept. 13, 2011
346 | Docsplit should now handle shelling out for documents with arbitrary 347 | characters in their filenames correctly, thanks to a series of 348 | epic patches from Vladimir Rybas. 349 | A --density option was added for specifying the resolution of 350 | rasterization when generating images from documents. 351 | The image resolution for OCR has been doubled from 200 to 400 DPI — 352 | this shouldn't make a noticeable difference for normal docs, but will make 353 | a world of difference for the fine print. 354 | Docsplit now uses GraphicsMagick's --despeckle before OCR. 355 |

356 | 357 |

358 | 0.5.2 – May 13, 2011
359 | For transparent conversion to PDF, made Docsplit prefer GraphicsMagick 360 | over OpenOffice, when the file format is one that GraphicsMagick is able 361 | to read: (png, gif, jpg, jpeg, tif, tiff, bmp, pnm, ppm, svg, eps). 362 |

363 | 364 |

365 | 0.5.1 – April 26, 2011
366 | Minor tweaks to the TextCleaner to be more lenient about acronyms 367 | with hyphens, and words with four vowels in a row. 368 |

369 | 370 |

371 | 0.5.0
372 | Added a Docsplit::TextCleaner class which is used to post-process 373 | OCR'd text, and remove garbage characters that are created when Tesseract 374 | encounters non-English text. To disable the cleanup, pass --no-clean. 375 |

376 | 377 |

378 | 0.4.1
379 | Upgraded the JODConverter dependency for PDF conversion via OpenOffice to 380 | 3.0 beta. Added PNG, GIF, TIF, JPG, and BMP to the list of supported 381 | formats. 382 |

383 | 384 |

385 | 0.3.4
386 | Adding a suggested optimization from the GraphicsMagick list -- only ever 387 | generate one page image per GraphicsMagick call. Saves large amounts of 388 | disk space for tempfiles on long documents. 389 |

390 | 391 |

392 | 0.3.3
393 | Start using the MAGICK_TMPDIR environment variable to prevent parallel 394 | Docsplit runs from having the potential to clobber each other's temporary 395 | image files. 396 |

397 | 398 |

399 | 0.3.1
400 | Added a memory limit to GraphicsMagick while generating the TIFFs for 401 | Tesseract OCR -- prevents gm from gobbling up all available memory 402 | on large files. 403 |

404 | 405 |

406 | 0.3.0
407 | OCR support added via Tesseract, and the --ocr and --no-ocr 408 | flags. PDFBox is no longer a dependency, and the gem is many megabytes 409 | lighter for it. 410 |

411 | 412 |

413 | 0.2.0
414 | Moving to Poppler's pdftotext. PDFBox had issues with Unicode in PDFs 415 | and incorrectly split individual pages of text. 416 |

417 | 418 |

419 | 0.1.3
420 | Fixing a bug with specifying explicit page ranges for image extraction. 421 |

422 | 423 |

424 | 0.1.2
425 | Limiting the memory usage of GraphicsMagick to avoid out of memory errors 426 | on very large PDFs. 427 |

428 | 429 |

430 | 0.1.1
431 | Upgraded for compatibility with GraphicsMagick 1.3.11. 432 |

433 | 434 |

435 | 0.1.0
436 | Initial Docsplit release. 437 |

438 | 439 |

440 |
441 | 442 | A DocumentCloud Project 443 | 444 |

445 | 446 |
447 | 448 | 449 | 450 | 451 | 452 | --------------------------------------------------------------------------------