├── .gitignore
├── vendor
    ├── logging.properties
    ├── jodconverter
    │   ├── juh-3.2.1.jar
    │   ├── jurt-3.2.1.jar
    │   ├── ridl-3.2.1.jar
    │   ├── unoil-3.2.1.jar
    │   ├── json-20090211.jar
    │   ├── commons-cli-1.1.jar
    │   ├── commons-io-1.4.jar
    │   └── jodconverter-core-3.0-beta-4.jar
    └── conf
    │   └── document-formats.js
├── test
    ├── fixtures
    │   ├── image.png
    │   ├── unicode.pdf
    │   ├── corrosion.pdf
    │   ├── encrypted.pdf
    │   ├── Faktura 10.pdf
    │   ├── obama_arts.pdf
    │   ├── obama_veterans.doc
    │   ├── completely_encrypted.pdf
    │   ├── corrosion.reoriented.pdf
    │   ├── with_pdf_extension
    │   │   ├── actually_a_doc.pdf
    │   │   ├── actually_an_image.pdf
    │   │   ├── this_ones_a_real_pdf.pdf
    │   │   └── actually_an_rtf.pdf
    │   ├── without_pdf_extension
    │   │   └── indesign
    │   │   │   ├── test_pdf_1_3
    │   │   │   ├── test_pdf_1_4
    │   │   │   ├── test_pdf_1_5
    │   │   │   ├── test_pdf_1_6
    │   │   │   └── test_pdf_1_7
    │   ├── PDF file with spaces 'single' and "double quotes".doc
    │   ├── PDF file with spaces 'single' and "double quotes".pdf
    │   └── obama_hopes.rtf
    ├── test_helper.rb
    └── unit
    │   ├── test_extract_pages.rb
    │   ├── test_transparent_pdfs.rb
    │   ├── test_convert_to_pdf.rb
    │   ├── test_extract_info.rb
    │   ├── test_extract_images.rb
    │   └── test_extract_text.rb
├── bin
    └── docsplit
├── Rakefile
├── README
├── lib
    ├── docsplit
    │   ├── transparent_pdfs.rb
    │   ├── page_extractor.rb
    │   ├── info_extractor.rb
    │   ├── text_cleaner.rb
    │   ├── image_extractor.rb
    │   ├── command_line.rb
    │   ├── text_extractor.rb
    │   └── pdf_extractor.rb
    └── docsplit.rb
├── noto_bolt.svg
├── docsplit.gemspec
├── LICENSE
└── index.html


/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
2 | .DS_Store
3 | 


--------------------------------------------------------------------------------
/vendor/logging.properties:
--------------------------------------------------------------------------------
1 | .level=WARNING
2 | 


--------------------------------------------------------------------------------
/test/fixtures/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/image.png


--------------------------------------------------------------------------------
/test/fixtures/unicode.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/unicode.pdf


--------------------------------------------------------------------------------
/test/fixtures/corrosion.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/corrosion.pdf


--------------------------------------------------------------------------------
/test/fixtures/encrypted.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/encrypted.pdf


--------------------------------------------------------------------------------
/test/fixtures/Faktura 10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/Faktura 10.pdf


--------------------------------------------------------------------------------
/test/fixtures/obama_arts.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/obama_arts.pdf


--------------------------------------------------------------------------------
/test/fixtures/obama_veterans.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/obama_veterans.doc


--------------------------------------------------------------------------------
/vendor/jodconverter/juh-3.2.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/juh-3.2.1.jar


--------------------------------------------------------------------------------
/vendor/jodconverter/jurt-3.2.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/jurt-3.2.1.jar


--------------------------------------------------------------------------------
/vendor/jodconverter/ridl-3.2.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/ridl-3.2.1.jar


--------------------------------------------------------------------------------
/vendor/jodconverter/unoil-3.2.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/unoil-3.2.1.jar


--------------------------------------------------------------------------------
/vendor/jodconverter/json-20090211.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/json-20090211.jar


--------------------------------------------------------------------------------
/test/fixtures/completely_encrypted.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/completely_encrypted.pdf


--------------------------------------------------------------------------------
/test/fixtures/corrosion.reoriented.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/corrosion.reoriented.pdf


--------------------------------------------------------------------------------
/vendor/jodconverter/commons-cli-1.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/commons-cli-1.1.jar


--------------------------------------------------------------------------------
/vendor/jodconverter/commons-io-1.4.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/commons-io-1.4.jar


--------------------------------------------------------------------------------
/bin/docsplit:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

# Command-line entry point: load the CommandLine class from the lib directory
# located next to this bin directory, then instantiate it.
bin_dir = File.dirname(__FILE__)
require File.join(bin_dir, '..', 'lib', 'docsplit', 'command_line.rb')

Docsplit::CommandLine.new


--------------------------------------------------------------------------------
/test/fixtures/with_pdf_extension/actually_a_doc.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/with_pdf_extension/actually_a_doc.pdf


--------------------------------------------------------------------------------
/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar


--------------------------------------------------------------------------------
/test/fixtures/with_pdf_extension/actually_an_image.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/with_pdf_extension/actually_an_image.pdf


--------------------------------------------------------------------------------
/test/fixtures/with_pdf_extension/this_ones_a_real_pdf.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/with_pdf_extension/this_ones_a_real_pdf.pdf


--------------------------------------------------------------------------------
/test/fixtures/without_pdf_extension/indesign/test_pdf_1_3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/without_pdf_extension/indesign/test_pdf_1_3


--------------------------------------------------------------------------------
/test/fixtures/without_pdf_extension/indesign/test_pdf_1_4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/without_pdf_extension/indesign/test_pdf_1_4


--------------------------------------------------------------------------------
/test/fixtures/without_pdf_extension/indesign/test_pdf_1_5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/without_pdf_extension/indesign/test_pdf_1_5


--------------------------------------------------------------------------------
/test/fixtures/without_pdf_extension/indesign/test_pdf_1_6:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/without_pdf_extension/indesign/test_pdf_1_6


--------------------------------------------------------------------------------
/test/fixtures/without_pdf_extension/indesign/test_pdf_1_7:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/without_pdf_extension/indesign/test_pdf_1_7


--------------------------------------------------------------------------------
/test/fixtures/PDF file with spaces 'single' and "double quotes".doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/PDF file with spaces 'single' and "double quotes".doc


--------------------------------------------------------------------------------
/test/fixtures/PDF file with spaces 'single' and "double quotes".pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/documentcloud/docsplit/HEAD/test/fixtures/PDF file with spaces 'single' and "double quotes".pdf


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | require 'fileutils'
 2 | require 'rake/testtask'
 3 | 
 4 | desc 'Run all tests'
 5 | task :test do
 6 |   require 'minitest/autorun'
 7 |   Dir['./test/*/**/test_*.rb'].each {|test| require test }
 8 | end
 9 | 
10 | namespace :gem do
11 | 
12 |   desc 'Build and install the docsplit gem'
13 |   task :install do
14 |     sh "gem build docsplit.gemspec"
15 |     sh "sudo gem install #{Dir['*.gem'].join(' ')} --local --no-ri --no-rdoc"
16 |   end
17 | 
18 |   desc 'Uninstall the docsplit gem'
19 |   task :uninstall do
20 |     sh "sudo gem uninstall -x docsplit"
21 |   end
22 | 
23 | end
24 | 
25 | task :default => :test
26 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | ==
 2 |          __                      ___ __ 
 3 |     ____/ /___  ______________  / (_) /_
 4 |    / __  / __ \/ ___/ ___/ __ \/ / / __/
 5 |   / /_/ / /_/ / /__(__  ) /_/ / / / /_  
 6 |   \____/\____/\___/____/ .___/_/_/\__/  
 7 |                       /_/
 8 |                       
 9 |   Docsplit is a command-line utility and Ruby library for splitting apart
10 |   documents into their component parts: searchable UTF-8 plain text, page 
11 |   images or thumbnails in any format, PDFs, single pages, and document 
12 |   metadata (title, author, number of pages...)
13 |   
14 |   Installation:
15 |   gem install docsplit
16 |   
17 |   For documentation, usage, and examples, see:
18 |   https://documentcloud.github.io/docsplit/
19 |   
20 |   To suggest a feature or report a bug: 
21 |   http://github.com/documentcloud/docsplit/issues/
22 | 
23 | 


--------------------------------------------------------------------------------
/test/test_helper.rb:
--------------------------------------------------------------------------------
 1 | here = File.dirname(__FILE__)
 2 | require File.join(here, '..', 'lib', 'docsplit')
 3 | require 'fileutils'
 4 | require 'minitest'
 5 | require "minitest/autorun"
 6 | 
 7 | class Minitest::Test
 8 |   include Docsplit
 9 | 
10 |   OUTPUT = 'test/output'
11 | 
12 |   def clear_output
13 |     FileUtils.rm_r(OUTPUT) if File.exist?(OUTPUT)
14 |   end
15 | 
16 |   def teardown
17 |     clear_output
18 |   end
19 | 
20 |   def assert_directory_contains(dir, files)
21 |     files_in_directory = Dir["#{dir}/*"]
22 |     if files.kind_of?(Array)
23 |       assert files_in_directory.length == files.length, "Expected directory to contain exactly #{files.length} files"
24 |     else
25 |       files = [files]
26 |     end
27 |     files.each { |f| assert files_in_directory.include?(File.join(dir, f)), "Expected directory #{dir} to contain file #{f}, but it contains #{files_in_directory.inspect}" }
28 |   end
29 | end
30 | 


--------------------------------------------------------------------------------
/lib/docsplit/transparent_pdfs.rb:
--------------------------------------------------------------------------------
 1 | module Docsplit
 2 | 
 3 |   # Include a method to transparently convert non-PDF arguments to temporary
 4 |   # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
 5 |   module TransparentPDFs
 6 | 
 7 |     # Temporarily convert any non-PDF documents to PDFs before running them
 8 |     # through further extraction.
 9 |     def ensure_pdfs(docs)
10 |       [docs].flatten.map do |doc|
11 |         if is_pdf?(doc)
12 |           doc
13 |         else
14 |           tempdir = File.join(Dir.tmpdir, 'docsplit')
15 |           extract_pdf([doc], {:output => tempdir})
16 |           File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
17 |         end
18 |       end
19 |     end
20 | 
21 |     def is_pdf?(doc)
22 |       File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
23 |     end
24 | 
25 |   end
26 | 
27 |   extend TransparentPDFs
28 | 
29 | end
30 | 


--------------------------------------------------------------------------------
/test/unit/test_extract_pages.rb:
--------------------------------------------------------------------------------
 1 | here = File.expand_path(File.dirname(__FILE__))
 2 | require File.join(here, '..', 'test_helper')
 3 | 
 4 | class ExtractPagesTest < Minitest::Test
 5 | 
 6 |   def test_multi_page_extraction
 7 |     Docsplit.extract_pages('test/fixtures/obama_arts.pdf', :output => OUTPUT)
 8 |     assert Dir["#{OUTPUT}/*.pdf"].length == 2
 9 |   end
10 | 
11 |   def test_password_protected
12 |     assert_raises(ExtractionFailed) do
13 |       Docsplit.extract_pages('test/fixtures/completely_encrypted.pdf')
14 |     end
15 |   end
16 | 
17 |   def test_doc_page_extraction
18 |     Docsplit.extract_pages('test/fixtures/obama_veterans.doc', :output => OUTPUT)
19 |     assert Dir["#{OUTPUT}/*.pdf"].length == 7
20 |   end
21 | 
22 |   def test_name_escaping_while_extracting_pages
23 |     Docsplit.extract_pages('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :output => OUTPUT)
24 |     assert Dir["#{OUTPUT}/*.pdf"].length == 2
25 |   end
26 | 
27 | end
28 | 


--------------------------------------------------------------------------------
/noto_bolt.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <!-- Generator: Adobe Illustrator 16.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
 3 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
 4 | <!-- This file is Apache2 licensed see https://code.google.com/p/noto/ -->
 5 | <svg version="1.1" id="レイヤー_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px"
 6 | 	 y="0px" width="128px" height="128px" viewBox="0 0 128 128" enable-background="new 0 0 128 128" xml:space="preserve">
 7 | <path fill="#FCC21B" d="M115.36,61.84L70.22,50.49L114.45,2.4c0.41-0.45,0.43-1.13,0.05-1.6c-0.39-0.48-1.07-0.59-1.59-0.27
 8 | 	L12.3,61.98c-0.41,0.25-0.64,0.72-0.57,1.2c0.06,0.48,0.4,0.87,0.87,1.01l45.07,13.25L13.38,125.6c-0.42,0.46-0.44,1.15-0.04,1.61
 9 | 	c0.24,0.29,0.58,0.44,0.94,0.44c0.22,0,0.45-0.06,0.65-0.19l100.78-63.41c0.42-0.26,0.64-0.75,0.56-1.22
10 | 	C116.19,62.34,115.84,61.95,115.36,61.84z"/>
11 | </svg>
12 | 


--------------------------------------------------------------------------------
/docsplit.gemspec:
--------------------------------------------------------------------------------
 1 | Gem::Specification.new do |s|
 2 |   s.name      = 'docsplit'
 3 |   s.version   = '0.7.6'         # Keep version in sync with docsplit.rb
 4 |   s.date      = '2014-11-17'
 5 | 
 6 |   s.homepage    = "http://documentcloud.github.com/docsplit/"
 7 |   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
 8 |   s.description = <<-EOS
 9 |     Docsplit is a command-line utility and Ruby library for splitting apart
10 |     documents into their component parts: searchable UTF-8 plain text, page
11 |     images or thumbnails in any format, PDFs, single pages, and document
12 |     metadata (title, author, number of pages...)
13 |   EOS
14 | 
15 |   s.authors           = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
16 |   s.email             = 'opensource@documentcloud.org'
17 |   s.rubyforge_project = 'docsplit'
18 |   s.license           = 'MIT'
19 | 
20 |   s.require_paths     = ['lib']
21 |   s.executables       = ['docsplit']
22 | 
23 |   s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
24 |                 'docsplit.gemspec', 'LICENSE', 'README']
25 | end


--------------------------------------------------------------------------------
/test/unit/test_transparent_pdfs.rb:
--------------------------------------------------------------------------------
 1 | here = File.expand_path(File.dirname(__FILE__))
 2 | require File.join(here, '..', 'test_helper')
 3 | require 'tmpdir'
 4 | 
 5 | class TransparentPDFsTest < Minitest::Test
 6 | 
 7 |   def setup
 8 |     @klass = Class.new
 9 |     @klass.send(:include, Docsplit::TransparentPDFs)
10 |     @detector = @klass.new
11 |   end
12 | 
13 |   def test_files_with_pdf_extension_are_always_considered_a_pdf
14 |     pdfs = Dir.glob('test/fixtures/with_pdf_extension/*.pdf').select { |path| File.file?(path) }
15 |     assert pdfs.any?, 'ensure pdfs with extensions are available to test with'
16 |     pdfs.each do |pdf|
17 |       assert @detector.is_pdf?(pdf), "#{pdf} with '.pdf' extension is identified as a PDF (regardless of its file contents)"
18 |     end
19 |   end
20 | 
21 |   def test_pdfs_without_the_pdf_file_extension_is_considerd_a_pdf
22 |     pdfs = Dir.glob('test/fixtures/without_pdf_extension/*/*').select { |path| File.file?(path) }
23 |     assert pdfs.any?, 'ensure pdfs without extensions are available to test with'
24 |     pdfs.each do |pdf|
25 |       assert @detector.is_pdf?(pdf), "#{pdf} with '.pdf' extension is identified as a PDF"
26 |     end
27 |   end
28 | 
29 | end
30 | 


--------------------------------------------------------------------------------
/lib/docsplit/page_extractor.rb:
--------------------------------------------------------------------------------
 1 | module Docsplit
 2 | 
 3 |   # Delegates to **pdftk** in order to create bursted single pages from
 4 |   # a PDF document.
 5 |   class PageExtractor
 6 | 
 7 |     # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
 8 |     def extract(pdfs, opts)
 9 |       extract_options opts
10 |       [pdfs].flatten.each do |pdf|
11 |         pdf_name = File.basename(pdf, File.extname(pdf))
12 |         page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
13 |         FileUtils.mkdir_p @output unless File.exist?(@output)
14 | 
15 |         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
16 |           "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
17 |         else
18 |           "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
19 |         end
20 |         result = `#{cmd}`.chomp
21 |         FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
22 |         raise ExtractionFailed, result if $? != 0
23 |         result
24 |       end
25 |     end
26 | 
27 | 
28 |     private
29 | 
30 |     def extract_options(options)
31 |       @output = options[:output] || '.'
32 |     end
33 | 
34 |   end
35 | 
36 | end
37 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
 2 | 
 3 | Copyright (c) 2009-2011 Jeremy Ashkenas, DocumentCloud
 4 | Copyright (c) 2011-2013 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
 5 | 
 6 | Permission is hereby granted, free of charge, to any person
 7 | obtaining a copy of this software and associated documentation
 8 | files (the "Software"), to deal in the Software without
 9 | restriction, including without limitation the rights to use,
10 | copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the
12 | Software is furnished to do so, subject to the following
13 | conditions:
14 | 
15 | The above copyright notice and this permission notice shall be
16 | included in all copies or substantial portions of the Software.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 | OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/test/unit/test_convert_to_pdf.rb:
--------------------------------------------------------------------------------
 1 | here = File.expand_path(File.dirname(__FILE__))
 2 | require File.join(here, '..', 'test_helper')
 3 | 
 4 | class ConvertToPdfTest < Minitest::Test
 5 | 
 6 |   def test_doc_conversion
 7 |     Docsplit.extract_pdf('test/fixtures/obama_veterans.doc', :output => OUTPUT)
 8 |     assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/obama_veterans.pdf"]
 9 |   end
10 | 
11 |   def test_rtf_conversion
12 |     Docsplit.extract_pdf('test/fixtures/obama_hopes.rtf', :output => OUTPUT)
13 |     assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/obama_hopes.pdf"]
14 |   end
15 | 
16 |   def test_png_conversion
17 |     Docsplit.extract_pdf('test/fixtures/image.png', :output => OUTPUT)
18 |     assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/image.pdf"]
19 |   end
20 |   def test_png_conversion
21 |     Docsplit.extract_pdf('test/fixtures/image.png', :output => OUTPUT)
22 |     assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/image.pdf"]
23 |   end
24 | 
25 |   def test_conversion_then_page_extraction
26 |     Docsplit.extract_pdf('test/fixtures/obama_veterans.doc', :output => OUTPUT)
27 |     Docsplit.extract_pages("#{OUTPUT}/obama_veterans.pdf", :output => OUTPUT)
28 |     assert Dir["#{OUTPUT}/*.pdf"].length == 8
29 |   end
30 | 
31 |   def test_name_escaping_while_converting
32 |     Docsplit.extract_pdf('test/fixtures/PDF file with spaces \'single\' and "double quotes".doc', :output => OUTPUT)
33 |     assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/PDF file with spaces 'single' and \"double quotes\".pdf"]
34 |   end
35 | 
36 | end
37 | 


--------------------------------------------------------------------------------
/lib/docsplit/info_extractor.rb:
--------------------------------------------------------------------------------
 1 | module Docsplit
 2 | 
 3 |   # Delegates to **pdfinfo** in order to extract information about a PDF file.
 4 |   class InfoExtractor
 5 | 
 6 |     # Regex matchers for different bits of information.
 7 |     MATCHERS = {
 8 |       :author   => /^Author:\s+([^\n]+)/,
 9 |       :date     => /^CreationDate:\s+([^\n]+)/,
10 |       :creator  => /^Creator:\s+([^\n]+)/,
11 |       :keywords => /^Keywords:\s+([^\n]+)/,
12 |       :producer => /^Producer:\s+([^\n]+)/,
13 |       :subject  => /^Subject:\s+([^\n]+)/,
14 |       :title    => /^Title:\s+([^\n]+)/,
15 |       :length   => /^Pages:\s+([^\n]+)/,
16 |     }
17 | 
18 |     # Pull out a single datum from a pdf.
19 |     def extract(key, pdfs, opts)
20 |       extract_all(pdfs, opts)[key]
21 |     end
22 |     
23 |     def extract_all(pdfs, opts)
24 |       pdf = [pdfs].flatten.first
25 |       cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
26 |       result = `#{cmd}`.chomp
27 |       raise ExtractionFailed, result if $? != 0
28 |       # ruby  1.8 (iconv) and 1.9 (String#encode) :
29 |       if String.method_defined?(:encode)
30 |         result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
31 |       else
32 |         require 'iconv' unless defined?(Iconv)
33 |         ic = Iconv.new('UTF-8//IGNORE','UTF-8')
34 |         result = ic.iconv(result)
35 |       end
36 |       info = {}
37 |       MATCHERS.each do |key, matcher|
38 |         match = result.match(matcher)
39 |         answer = match && match[1]
40 |         if answer
41 |           answer = answer.to_i if key == :length
42 |           info[key] = answer
43 |         end
44 |       end
45 |       info
46 |     end
47 | 
48 |   end
49 | 
50 | end
51 | 


--------------------------------------------------------------------------------
/test/unit/test_extract_info.rb:
--------------------------------------------------------------------------------
 1 | here = File.expand_path(File.dirname(__FILE__))
 2 | require File.join(here, '..', 'test_helper')
 3 | 
 4 | class ExtractInfoTest < Minitest::Test
 5 | 
 6 |   def test_title
 7 |     assert "PDF Pieces" == Docsplit.extract_title('test/fixtures/encrypted.pdf')
 8 |   end
 9 | 
10 |   def test_doc_title
11 |     assert "Remarks of President Barack Obama" == Docsplit.extract_title('test/fixtures/obama_veterans.doc')
12 |   end
13 | 
14 |   def test_author
15 |     assert "Jeremy Ashkenas" == Docsplit.extract_author('test/fixtures/encrypted.pdf')
16 |   end
17 | 
18 |   def test_date
19 |     assert "Thu Nov 29 14:54:46 2007" == Docsplit.extract_date('test/fixtures/obama_arts.pdf')
20 |   end
21 | 
22 |   def test_length
23 |     assert 2 == Docsplit.extract_length('test/fixtures/obama_arts.pdf')
24 |   end
25 | 
26 |   def test_producer
27 |     assert "Mac OS X 10.6.2 Quartz PDFContext" == Docsplit.extract_producer('test/fixtures/encrypted.pdf')
28 |   end
29 | 
30 |   def test_password_protected
31 |     assert_raises(ExtractionFailed) do
32 |       Docsplit.extract_author('test/fixtures/completely_encrypted.pdf')
33 |     end
34 |   end
35 | 
36 |   def test_name_escaping_while_extracting_info
37 |     assert 2 == Docsplit.extract_length('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf')
38 |   end
39 |   
40 |   def test_malformed_unicode
41 |     assert Docsplit.extract_date('test/fixtures/Faktura 10.pdf')
42 |   end
43 |   
44 |   def test_extract_all
45 |     metadata = Docsplit.extract_info('test/fixtures/obama_arts.pdf')
46 |     assert metadata[:author] == "mkommareddi"
47 |     assert metadata[:date] == "Thu Nov 29 14:54:46 2007"
48 |     assert metadata[:creator] == "PScript5.dll Version 5.2"
49 |     assert metadata[:producer] == "Acrobat Distiller 8.1.0 (Windows)"
50 |     assert metadata[:title] == "Microsoft Word - Fact Sheet Arts 112907 FINAL.doc"
51 |     assert metadata[:length] == 2
52 |     assert metadata.length == 6
53 |   end
54 | 
55 | end
56 | 


--------------------------------------------------------------------------------
/test/unit/test_extract_images.rb:
--------------------------------------------------------------------------------
 1 | here = File.expand_path(File.dirname(__FILE__))
 2 | require File.join(here, '..', 'test_helper')
 3 | 
 4 | class ExtractImagesTest < Minitest::Test
 5 | 
 6 |   def test_basic_image_extraction
 7 |     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "250x", :output => OUTPUT)
 8 |     assert_directory_contains(OUTPUT, ['obama_arts_1.gif', 'obama_arts_2.gif'])
 9 |   end
10 | 
11 |   def test_image_formatting
12 |     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => [:jpg, :gif], :size => "250x", :output => OUTPUT)
13 |     assert Dir["#{OUTPUT}/*.gif"].length == 2
14 |     assert Dir["#{OUTPUT}/*.jpg"].length == 2
15 |   end
16 | 
17 |   def test_page_ranges
18 |     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT)
19 |     assert Dir["#{OUTPUT}/*.gif"] == ["#{OUTPUT}/obama_arts_2.gif"]
20 |   end
21 | 
22 |   def test_image_sizes
23 |     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :rolling => true, :size => ["150x", "50x"], :output => OUTPUT)
24 |     assert File.size("#{OUTPUT}/50x/obama_arts_1.gif") < File.size("#{OUTPUT}/150x/obama_arts_1.gif")
25 |   end
26 | 
27 |   def test_encrypted_images
28 |     Docsplit.extract_images('test/fixtures/encrypted.pdf', :format => :gif, :size => "50x", :output => OUTPUT)
29 |     assert File.size("#{OUTPUT}/encrypted_1.gif") > 100
30 |   end
31 | 
32 |   def test_password_protected_extraction
33 |     assert_raises(ExtractionFailed) do
34 |       Docsplit.extract_images('test/fixtures/completely_encrypted.pdf')
35 |     end
36 |   end
37 | 
38 |   def test_repeated_extraction_in_the_same_directory
39 |     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "250x", :output => OUTPUT)
40 |     assert_directory_contains(OUTPUT, ['obama_arts_1.gif', 'obama_arts_2.gif'])
41 |     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "250x", :output => OUTPUT)
42 |     assert_directory_contains(OUTPUT, ['obama_arts_1.gif', 'obama_arts_2.gif'])
43 |   end
44 | 
45 |   def test_name_escaping_while_extracting_images
46 |     Docsplit.extract_images('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :format => :gif, :size => "250x", :output => OUTPUT)
47 |     assert_directory_contains(OUTPUT, ['PDF file with spaces \'single\' and "double quotes"_1.gif',
48 |                                        'PDF file with spaces \'single\' and "double quotes"_1.gif'])
49 |   end
50 | 
51 | end
52 | 


--------------------------------------------------------------------------------
/test/unit/test_extract_text.rb:
--------------------------------------------------------------------------------
 1 | here = File.expand_path(File.dirname(__FILE__))
 2 | require File.join(here, '..', 'test_helper')
 3 | require 'tmpdir'
 4 | 
 5 | class ExtractTextTest < Minitest::Test
 6 | 
 7 |   def test_paged_extraction
 8 |     Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
 9 |     assert Dir["#{OUTPUT}/*.txt"].length == 2
10 |     assert File.read("#{OUTPUT}/obama_arts_1.txt").match("Paid for by Obama for America")
11 |   end
12 | 
13 |   def test_page_only_extraction
14 |     Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 2..2, :output => OUTPUT)
15 |     assert Dir["#{OUTPUT}/*.txt"] == ["#{OUTPUT}/obama_arts_2.txt"]
16 |   end
17 | 
18 |   def test_capitalized_pdf_extraction
19 |     Dir["#{OUTPUT}/*.txt"].each {|previous| FileUtils.rm(previous) }
20 |     Dir.mktmpdir do |dir|
21 |       FileUtils.cp('test/fixtures/obama_arts.pdf', "#{dir}/OBAMA_ARTS.PDF")
22 |       Docsplit.extract_text("#{dir}/OBAMA_ARTS.PDF", :pages => 2..2, :output => OUTPUT)
23 |     end
24 |     assert Dir["#{OUTPUT}/*.txt"] == ["#{OUTPUT}/OBAMA_ARTS_2.txt"]
25 |   end
26 | 
27 |   def test_unicode_extraction
28 |     Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
29 |     assert Dir["#{OUTPUT}/*.txt"].length == 3
30 |   end
31 | 
32 |   def test_ocr_extraction
33 |     Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
34 |     4.times do |i|
35 |       file = "corrosion_#{i + 1}.txt"
36 |       assert_directory_contains(OUTPUT, file)
37 |       assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size"
38 |     end
39 |   end
40 | 
41 |   def test_ocr_extraction_in_mock_language
42 |     exception = assert_raises(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
43 |     assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"
44 |   end
45 | 
46 |   def test_password_protected
47 |     assert_raises(ExtractionFailed) do
48 |       Docsplit.extract_text('test/fixtures/completely_encrypted.pdf')
49 |     end
50 |   end
51 | 
52 |   def test_name_escaping_while_extracting_text
53 |     Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT)
54 |     assert Dir["#{OUTPUT}/*.txt"].length == 2
55 |   end
56 |   
57 |   def test_orientation_detected_ocr_extraction
58 |     if Docsplit::DEPENDENCIES[:osd]
59 |       pages = 1..4
60 |       Docsplit.extract_text('test/fixtures/corrosion.reoriented.pdf', :output => OUTPUT, :pages=>pages, :force_ocr => true)
61 |       letters = Hash.new(0)
62 |       nonletters = Hash.new(0)
63 |       
64 |       pages.each do |number|
65 |         File.open(File.join(OUTPUT,"corrosion.reoriented_#{number}.txt")).each_char do |c| 
66 |           case c
67 |           when /[A-Za-z]/
68 |             letters[c] += 1
69 |           when /\s/
70 |           else
71 |             nonletters[c] += 1
72 |           end
73 |         end
74 |       end
75 |       
76 |       # the corrosion.pdf has 6160 letters & 362 nonletters, or ~17:1
77 |       # so lets give a fudge factor of ~half of that or 8:1
78 |       assert letters.values.reduce(0,:+)/8 > nonletters.values.reduce(0,:+), "Expected that text extracted with orientation detection would have more letters."
79 |     else
80 |       skip "Orientation detection module (osd) for Tesseract isn't installed"
81 |     end
82 |   end
83 | 
84 | end
85 | 


--------------------------------------------------------------------------------
/lib/docsplit/text_cleaner.rb:
--------------------------------------------------------------------------------
  1 | require 'strscan'
  2 | 
  3 | module Docsplit
  4 | 
  5 |   # Cleans up OCR'd text by using a series of heuristics to remove garbage
  6 |   # words. Algorithms taken from:
  7 |   #
  8 |   #     Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  9 |   #       -- Taghva, Nartker, Condit, and Borsack
 10 |   #
 11 |   #     Improving Search and Retrieval Performance through Shortening Documents,
 12 |   #     Detecting Garbage, and Throwing out Jargon
 13 |   #       -- Kulp
 14 |   #
 15 |   class TextCleaner
 16 | 
 17 |     # Cached regexes we plan on using.
 18 |     WORD        = /\S+/
 19 |     SPACE       = /\s+/
 20 |     NEWLINE     = /[\r\n]/
 21 |     ALNUM       = /[a-z0-9]/i
 22 |     PUNCT       = /[[:punct:]]/i
 23 |     REPEAT      = /([^0-9])\1{2,}/
 24 |     UPPER       = /[A-Z]/
 25 |     LOWER       = /[a-z]/
 26 |     ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
 27 |     ALL_ALPHA   = /^[a-z]+$/i
 28 |     CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
 29 |     VOWEL       = /([aeiou]|y$)/i
 30 |     CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
 31 |     VOWEL_5     = /[aeiou]{5}/i
 32 |     REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
 33 |     SINGLETONS  = /^[AaIi]$/
 34 | 
 35 |     # For the time being, `clean` uses the regular StringScanner, and not the
 36 |     # multibyte-aware version, coercing to ASCII first.
 37 |     def clean(text)
 38 |       if String.method_defined?(:encode)
 39 |         text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
 40 |       else
 41 |         require 'iconv' unless defined?(Iconv)
 42 |         text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
 43 |       end
 44 | 
 45 |       scanner = StringScanner.new(text)
 46 |       cleaned = []
 47 |       spaced  = false
 48 |       loop do
 49 |         if space = scanner.scan(SPACE)
 50 |           cleaned.push(space) unless spaced && (space !~ NEWLINE)
 51 |           spaced = true
 52 |         elsif word = scanner.scan(WORD)
 53 |           unless garbage(word)
 54 |             cleaned.push(word)
 55 |             spaced = false
 56 |           end
 57 |         elsif scanner.eos?
 58 |           return cleaned.join('').gsub(REPEATED, '')
 59 |         end
 60 |       end
 61 |     end
 62 | 
 63 |     # Is a given word OCR garbage? Truthy as soon as one of the heuristics below fires, falsy otherwise.
 64 |     def garbage(w)
 65 |       acronym = w =~ ACRONYM
 66 |       # (acronym-shaped words are exempt from the ratio checks further down)
 67 |       # More than 30 characters in length.
 68 |       (w.length > 30) ||
 69 | 
 70 |       # If there are three or more identical non-digit characters in a row in the string.
 71 |       (w =~ REPEAT) ||
 72 | 
 73 |       # More punctuation than alpha numerics (unless the word matches ACRONYM).
 74 |       (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
 75 | 
 76 |       # Ignoring the first and last characters in the string, if there are three or
 77 |       # more different punctuation characters in the string.
 78 |       (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
 79 | 
 80 |       # Five or more consecutive vowels, or five or more consecutive consonants.
 81 |       ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
 82 | 
 83 |       # Number of uppercase letters greater than lowercase letters, but the word
 84 |       # does not match ACRONYM.
 85 |       (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
 86 | 
 87 |       # Single letters that are not A or I.
 88 |       (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
 89 | 
 90 |       # All characters are alphabetic (and there are more than two) and there are more than
 91 |       # 8 times as many vowels as consonants, or more than 8 times as many consonants as vowels.
 92 |       (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
 93 |         (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
 94 |           (cons > vows * 8)))
 95 |     end
 96 | 
 97 |   end
 98 | 
 99 | end
100 | 


--------------------------------------------------------------------------------
/lib/docsplit/image_extractor.rb:
--------------------------------------------------------------------------------
  1 | module Docsplit
  2 | 
  3 |   # Delegates to GraphicsMagick in order to convert PDF documents into
  4 |   # nicely sized images.
  5 |   class ImageExtractor
  6 | 
  7 |     MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"  # resource limits passed to every gm command built here
  8 |     DEFAULT_FORMAT  = :png  # used when no :format option is given
  9 |     DEFAULT_DENSITY = '150'  # value for gm's -density flag when no :density option is given
 10 | 
 11 |     # Extract a list of PDFs as rasterized page images, according to the
 12 |     # configuration in options.
 13 |     def extract(pdfs, options)
 14 |       @pdfs = [pdfs].flatten
 15 |       extract_options(options)
 16 |       @pdfs.each do |pdf|
 17 |         previous = nil
 18 |         @sizes.each_with_index do |size, i|
 19 |           @formats.each {|format| convert(pdf, size, format, previous) }
 20 |           previous = size if @rolling
 21 |         end
 22 |       end
 23 |     end
 24 | 
 25 |     # Convert a single PDF into page images at the specified size and format.
 26 |     # If `--rolling`, and we have a previous image at a larger size to work with,
 27 |     # we simply downsample that image, instead of re-rendering the entire PDF.
 28 |     # Now we generate one page at a time, a counterintuitive opimization
 29 |     # suggested by the GraphicsMagick list, that seems to work quite well.
 30 |     def convert(pdf, size, format, previous=nil)
 31 |       tempdir   = Dir.mktmpdir
 32 |       basename  = File.basename(pdf, File.extname(pdf))
 33 |       directory = directory_for(size)
 34 |       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
 35 |       escaped_pdf = ESCAPE[pdf]
 36 |       FileUtils.mkdir_p(directory) unless File.exist?(directory)
 37 |       common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
 38 |       if previous
 39 |         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
 40 |         result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
 41 |         raise ExtractionFailed, result if $? != 0
 42 |       else
 43 |         page_list(pages).each do |page|
 44 |           out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
 45 |           cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
 46 |           result = `#{cmd}`.chomp
 47 |           raise ExtractionFailed, result if $? != 0
 48 |         end
 49 |       end
 50 |     ensure
 51 |       FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
 52 |     end
 53 | 
 54 | 
 55 |     private
 56 | 
 57 |     # Extract the relevant GraphicsMagick options from the options hash.
 58 |     def extract_options(options)
 59 |       @output  = options[:output]  || '.'
 60 |       @pages   = options[:pages]
 61 |       @density = options[:density] || DEFAULT_DENSITY
 62 |       @formats = [options[:format] || DEFAULT_FORMAT].flatten
 63 |       @sizes   = [options[:size]].flatten.compact
 64 |       @sizes   = [nil] if @sizes.empty?
 65 |       @rolling = !!options[:rolling]
 66 |     end
 67 | 
 68 |     # If there's only one size requested, generate the images directly into
 69 |     # the output directory. Multiple sizes each get a directory of their own.
 70 |     def directory_for(size)
 71 |       path = @sizes.length == 1 ? @output : File.join(@output, size)
 72 |       File.expand_path(path)
 73 |     end
 74 | 
 75 |     # Generate the resize argument.
 76 |     def resize_arg(size)
 77 |       size.nil? ? '' : "-resize #{size}"
 78 |     end
 79 | 
 80 |     # Generate the appropriate quality argument for the image format.
 81 |     def quality_arg(format)
 82 |       case format.to_s
 83 |       when /jpe?g/ then "-quality 85"
 84 |       when /png/   then "-quality 100"
 85 |       else ""
 86 |       end
 87 |     end
 88 | 
 89 |     # Generate the expanded list of requested page numbers.
 90 |     def page_list(pages)
 91 |       pages.split(',').map { |range|
 92 |         if range.include?('-')
 93 |           range = range.split('-')
 94 |           Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
 95 |         else
 96 |           range.to_i
 97 |         end
 98 |       }.flatten.uniq.sort
 99 |     end
100 | 
101 |   end
102 | 
103 | end
104 | 


--------------------------------------------------------------------------------
/lib/docsplit.rb:
--------------------------------------------------------------------------------
  1 | require 'tmpdir'
  2 | require 'fileutils'
  3 | require 'shellwords'
  4 | 
  5 | # The Docsplit module delegates to the Java PDF extractors.
  6 | module Docsplit
  7 | 
  8 |   VERSION       = '0.7.6' # Keep in sync with gemspec.
  9 | 
 10 |   ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
 11 | 
 12 |   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
 13 |   ESCAPED_ROOT  = ESCAPE[ROOT]
 14 | 
 15 |   METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
 16 |   
 17 |   GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
 18 | 
 19 |   DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
 20 | 
 21 |   # Check for all dependencies, and note their absence.
 22 |   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
 23 |   DEPENDENCIES.each_key do |dep|
 24 |     dirs.each do |dir|
 25 |       if File.executable?(File.join(dir, dep.to_s))
 26 |         DEPENDENCIES[dep] = true
 27 |         break
 28 |       end
 29 |     end
 30 |   end
 31 | 
 32 |   # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
 33 |   if DEPENDENCIES[:tesseract]
 34 |     # osd will be listed in tesseract --listlangs
 35 |     val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
 36 |     DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
 37 |   end
 38 | 
 39 |   # Raised when an extraction command fails, e.g. because the PDF is encrypted
 40 |   # or otherwise broken.
 41 |   class ExtractionFailed < StandardError; end
 42 | 
 43 |   # Use the ExtractPages Java class to burst a PDF into single pages.
 44 |   def self.extract_pages(pdfs, opts={})
 45 |     pdfs = ensure_pdfs(pdfs)
 46 |     PageExtractor.new.extract(pdfs, opts)
 47 |   end
 48 | 
 49 |   # Use the ExtractText Java class to write out all embedded text.
 50 |   def self.extract_text(pdfs, opts={})
 51 |     pdfs = ensure_pdfs(pdfs)
 52 |     TextExtractor.new.extract(pdfs, opts)
 53 |   end
 54 | 
 55 |   # Use the ExtractImages Java class to rasterize a PDF into each page's image.
 56 |   def self.extract_images(pdfs, opts={})
 57 |     pdfs = ensure_pdfs(pdfs)
 58 |     opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
 59 |     ImageExtractor.new.extract(pdfs, opts)
 60 |   end
 61 | 
 62 |   # Use JODCConverter to extract the documents as PDFs.
 63 |   # If the document is in an image format, use GraphicsMagick to extract the PDF.
 64 |   def self.extract_pdf(docs, opts={})
 65 |     PdfExtractor.new.extract(docs, opts)
 66 |   end
 67 | 
 68 |   # Define a `Docsplit.extract_<key>` method for each of the metadata keys that we
 69 |   # support; each one passes its key to InfoExtractor#extract after `ensure_pdfs`.
 70 |   METADATA_KEYS.each do |key|
 71 |     instance_eval <<-EOS
 72 |       def self.extract_#{key}(pdfs, opts={})
 73 |         pdfs = ensure_pdfs(pdfs)
 74 |         InfoExtractor.new.extract(:#{key}, pdfs, opts)
 75 |       end
 76 |     EOS
 77 |   end
 78 |   
 79 |   def self.extract_info(pdfs, opts={})
 80 |     pdfs = ensure_pdfs(pdfs)
 81 |     InfoExtractor.new.extract_all(pdfs, opts)
 82 |   end
 83 | 
 84 |   # Utility method to clean OCR'd text with garbage characters.
 85 |   def self.clean_text(text)
 86 |     TextCleaner.new.clean(text)
 87 |   end
 88 | 
 89 |   private
 90 | 
 91 |   # Normalize a value in an options hash for the command line.
 92 |   # Ranges look like: 1-10, Arrays like: 1,2,3.
 93 |   def self.normalize_value(value)
 94 |     case value
 95 |     when Range then value.to_a.join(',')
 96 |     when Array then value.map! {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
 97 |     else            value.to_s
 98 |     end
 99 |   end
100 | 
101 | end
102 | 
103 | require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
104 | require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
105 | require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
106 | require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
107 | require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor"
108 | require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
109 | require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
110 | 


--------------------------------------------------------------------------------
/lib/docsplit/command_line.rb:
--------------------------------------------------------------------------------
  1 | require 'optparse'
  2 | require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
  3 | 
  4 | module Docsplit
  5 | 
  6 |   # A single command-line utility to separate a PDF into all its component parts.
  7 |   class CommandLine
  8 | 
  9 |     BANNER = <<-EOS
 10 | docsplit breaks apart documents into images, text, or individual pages.
 11 | It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
 12 | 
 13 | Usage:
 14 |   docsplit COMMAND [OPTIONS] path/to/doc.pdf
 15 |   Main commands:
 16 |     pages, images, text, pdf.
 17 |   Metadata commands:
 18 |     author, date, creator, keywords, producer, subject, title, length.
 19 | 
 20 | Example:
 21 |   docsplit images --size 700x --format jpg document.pdf
 22 | 
 23 | Dependencies:
 24 |   Ruby, Java, A working GraphicsMagick (gm) command,
 25 |   and a headless OpenOffice server for non-PDF documents.
 26 | 
 27 | Options:
 28 |     (size, pages and format can take comma-separated values)
 29 | 
 30 |     EOS
 31 | 
 32 |     # Creating a CommandLine runs off of the contents of ARGV.
 33 |     def initialize
 34 |       parse_options
 35 |       cmd = ARGV.shift
 36 |       @command = cmd && cmd.to_sym
 37 |       run
 38 |     end
 39 | 
 40 |     # Delegate to the Docsplit Ruby API to perform all extractions.
 41 |     def run
 42 |       begin
 43 |         case @command
 44 |         when :images  then Docsplit.extract_images(ARGV, @options)
 45 |         when :pages   then Docsplit.extract_pages(ARGV, @options)
 46 |         when :text    then Docsplit.extract_text(ARGV, @options)
 47 |         when :pdf     then Docsplit.extract_pdf(ARGV, @options)
 48 |         else
 49 |           if METADATA_KEYS.include?(@command)
 50 |             value = Docsplit.send("extract_#{@command}", ARGV, @options)
 51 |             puts value unless value.nil?
 52 |           else
 53 |             usage
 54 |           end
 55 |         end
 56 |       rescue ExtractionFailed => e
 57 |         puts e.message.chomp
 58 |         exit(1)
 59 |       end
 60 |     end
 61 | 
 62 |     # Print out the usage help message.
 63 |     def usage
 64 |       puts "\n#{@option_parser}\n"
 65 |       exit
 66 |     end
 67 | 
 68 | 
 69 |     private
 70 | 
 71 |     # Use the OptionParser library to parse out all supported options. Return
 72 |     # options formatted for the Ruby API.
 73 |     def parse_options
 74 |       @options = {:ocr => :default, :clean => true}
 75 |       @option_parser = OptionParser.new do |opts|
 76 |         opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
 77 |           @options[:output] = d
 78 |         end
 79 |         opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
 80 |           @options[:pages] = p
 81 |         end
 82 |         opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
 83 |           @options[:size] = s.split(',')
 84 |         end
 85 |         opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
 86 |           @options[:format] = t.split(',')
 87 |         end
 88 |         opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
 89 |           @options[:density] = d
 90 |         end
 91 |         opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
 92 |           @options[:ocr] = o
 93 |         end
 94 |         opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
 95 |           @options[:clean] = false
 96 |         end
 97 |         opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
 98 |           @options[:language] = l
 99 |         end
100 |         opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
101 |           @options[:detect_orientation] = false
102 |         end
103 |         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
104 |           @options[:rolling] = true
105 |         end
106 |         opts.on_tail('-v', '--version', 'display docsplit version') do
107 |           puts "Docsplit version #{Docsplit::VERSION}"
108 |           exit
109 |         end
110 |         opts.on_tail('-h', '--help', 'display this help message') do
111 |           usage
112 |         end
113 |       end
114 |       @option_parser.banner = BANNER
115 |       begin
116 |         @option_parser.parse!(ARGV)
117 |       rescue OptionParser::InvalidOption => e
118 |         puts e.message
119 |         exit(1)
120 |       end
121 |     end
122 | 
123 |   end
124 | 
125 | end


--------------------------------------------------------------------------------
/lib/docsplit/text_extractor.rb:
--------------------------------------------------------------------------------
  1 | module Docsplit
  2 | 
  3 |   # Delegates to **pdftotext** and **tesseract** in order to extract text from
  4 |   # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
  5 |   # forbid OCR extraction, but by default the heuristic works like this:
  6 |   #
  7 |   #  * Check for the presence of fonts in the PDF. If no fonts are detected,
  8 |   #    OCR is used automatically.
  9 |   #  * Extract the text of each page with **pdftotext**, if the page has less
 10 |   #    than 100 bytes of text (a scanned image page, or a page that just
 11 |   #    contains a filename and a page number), then add it to the list of
 12 |   #    `@pages_to_ocr`.
 13 |   #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
 14 |   #
 15 |   class TextExtractor
 16 | 
 17 |     NO_TEXT_DETECTED = /---------\n\Z/  # pdffonts output that ends right after a dashed line, i.e. nothing is listed below it
 18 | 
 19 |     OCR_FLAGS   = '-density 400x400 -colorspace GRAY'  # gm settings for the TIFFs handed to tesseract
 20 |     MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'  # gm resource limits
 21 | 
 22 |     MIN_TEXT_PER_PAGE = 100 # pages with less extracted text than this (String#length) are queued for OCR
 23 | 
 24 |     def initialize
 25 |       @pages_to_ocr = [] # page numbers whose directly extracted text came out too short
 26 |     end
 27 | 
 28 |     # Extract text from a list of PDFs.
 29 |     def extract(pdfs, opts)
 30 |       extract_options opts
 31 |       FileUtils.mkdir_p @output unless File.exist?(@output)
 32 |       [pdfs].flatten.each do |pdf|
 33 |         @pdf_name = File.basename(pdf, File.extname(pdf))
 34 |         pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
 35 |         if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
 36 |           extract_from_ocr(pdf, pages)
 37 |         else
 38 |           extract_from_pdf(pdf, pages)
 39 |           if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
 40 |             extract_from_ocr(pdf, @pages_to_ocr)
 41 |           end
 42 |         end
 43 |       end
 44 |     end
 45 | 
 46 |     # Does a PDF have any text embedded?
 47 |     def contains_text?(pdf)
 48 |       fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
 49 |       !fonts.match(NO_TEXT_DETECTED)
 50 |     end
 51 | 
 52 |     # Extract a page range worth of text from a PDF, directly.
 53 |     def extract_from_pdf(pdf, pages)
 54 |       return extract_full(pdf) unless pages
 55 |       pages.each {|page| extract_page(pdf, page) }
 56 |     end
 57 | 
 58 |     # Extract a page range worth of text from a PDF via OCR.
 59 |     def extract_from_ocr(pdf, pages)
 60 |       tempdir = Dir.mktmpdir
 61 |       base_path = File.join(@output, @pdf_name)
 62 |       escaped_pdf = ESCAPE[pdf]
 63 |       psm = @detect_orientation ? "-psm 1" : ""
 64 |       if pages
 65 |         pages.each do |page|
 66 |           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
 67 |           escaped_tiff = ESCAPE[tiff]
 68 |           file = "#{base_path}_#{page}"
 69 |           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
 70 |           run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
 71 |           clean_text(file + '.txt') if @clean_ocr
 72 |           FileUtils.remove_entry_secure tiff
 73 |         end
 74 |       else
 75 |         tiff = "#{tempdir}/#{@pdf_name}.tif"
 76 |         escaped_tiff = ESCAPE[tiff]
 77 |         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
 78 |         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
 79 |         run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
 80 |         clean_text(base_path + '.txt') if @clean_ocr
 81 |       end
 82 |     ensure
 83 |       FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
 84 |     end
 85 | 
 86 | 
 87 |     private
 88 | 
 89 |     def clean_text(file)
 90 |       File.open(file, 'r+') do |f|
 91 |         text = f.read
 92 |         f.truncate(0)
 93 |         f.rewind
 94 |         f.write(Docsplit.clean_text(text))
 95 |       end
 96 |     end
 97 | 
 98 |     # Run an external process and raise an exception if it fails.
 99 |     def run(command)
100 |       result = `#{command}`
101 |       raise ExtractionFailed, result if $? != 0
102 |       result
103 |     end
104 | 
105 |     # Run pdftotext command
106 |     def run_pdftotext(pdf, text_path, options=[])
107 |       options << '-enc UTF-8'
108 |       options << '-layout' if @keep_layout
109 | 
110 |       run "pdftotext #{options.join(' ')} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
111 |     end
112 | 
113 |     # Extract the full contents of a pdf as a single file, directly.
114 |     def extract_full(pdf)
115 |       text_path = File.join(@output, "#{@pdf_name}.txt")
116 |       run_pdftotext pdf, text_path
117 |     end
118 | 
119 |     # Extract the contents of a single page of text, directly, adding it to
120 |     # the `@pages_to_ocr` list if the text length is inadequate.
121 |     def extract_page(pdf, page)
122 |       text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
123 |       run_pdftotext pdf, text_path, ["-f #{page}", "-l #{page}"]
124 | 
125 |       unless @forbid_ocr
126 |         @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
127 |       end
128 |     end
129 | 
130 |     def extract_options(options)
131 |       @output             = options[:output] || '.'
132 |       @pages              = options[:pages]
133 |       @force_ocr          = options[:ocr] == true
134 |       @forbid_ocr         = options[:ocr] == false
135 |       @language           = options[:language] || 'eng'
136 |       @clean_ocr          = (!(options[:clean] == false) and @language == 'eng')
137 |       @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
138 |       @keep_layout        = options.fetch(:layout, false)
139 |     end
140 | 
141 |   end
142 | 
143 | end
144 | 


--------------------------------------------------------------------------------
/lib/docsplit/pdf_extractor.rb:
--------------------------------------------------------------------------------
  1 | require 'rbconfig'
  2 | 
  3 | module Docsplit
  4 |   class PdfExtractor
  5 |     @@executable     = nil
  6 |     @@version_string = nil
  7 | 
  8 |     # Provide a set of helper functions to determine the OS.
  9 |     HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
 10 |     def windows?
 11 |       !!HOST_OS.match(/mswin|windows|cygwin/i)
 12 |     end
 13 |     def osx?
 14 |       !!HOST_OS.match(/darwin/i)
 15 |     end
 16 |     def linux?
 17 |       !!HOST_OS.match(/linux/i)
 18 |     end
 19 | 
 20 |     # The first line of the help output holds the name and version number
 21 |     # of the office software to be used for extraction.
 22 |     def version_string
 23 |       unless @@version_string
 24 |         null = windows? ? "NUL" : "/dev/null"
 25 |         @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
 26 |         if !!@@version_string.to_s.match(/[0-9]*/)
 27 |           @@version_string = `#{office_executable} --version`.split("\n").first
 28 |         end
 29 |       end
 30 |       @@version_string
 31 |     end
 32 |     def libre_office?
 33 |       !!version_string.match(/^LibreOffice/)
 34 |     end
 35 |     def open_office?
 36 |       !!version_string.match(/^OpenOffice.org/)
 37 |     end
 38 | 
 39 |     # A set of default locations to search for office software
 40 |     # These have been extracted from JODConverter.  Each listed
 41 |     # path should contain a directory "program" which in turn
 42 |     # contains the "soffice" executable.
 43 |     # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
 44 |     def office_search_paths
 45 |       if windows?
 46 |         office_names       = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
 47 |         program_files_path = ENV["CommonProgramFiles"]
 48 |         search_paths       = office_names.map{ |program| File.join(program_files_path, program) }
 49 |       elsif osx?
 50 |         search_paths = %w(
 51 |           /Applications/LibreOffice.app/Contents
 52 |           /Applications/OpenOffice.org.app/Contents
 53 |         )
 54 |       else # probably linux/unix
 55 |         # heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice
 56 |         search_paths = %w(
 57 |           /usr/lib/libreoffice
 58 |           /usr/lib64/libreoffice
 59 |           /opt/libreoffice
 60 |           /usr/lib/openoffice
 61 |           /usr/lib64/openoffice
 62 |           /opt/openoffice.org3
 63 |           /app/vendor/libreoffice
 64 |           /usr/bin/libreoffice
 65 |           /usr/local/bin
 66 |           /usr/lib64/libreoffice
 67 |           /usr/lib64/openoffice.org3
 68 |         )
 69 |       end
 70 |       search_paths
 71 |     end
 72 | 
 73 |     # Identify the path to a working office executable.
 74 |     def office_executable
 75 |       paths = office_search_paths
 76 | 
 77 |       # If an OFFICE_PATH has been specified on the commandline
 78 |       # raise an error if that path isn't valid, otherwise, add
 79 |       # it to the front of our search paths.
 80 |       if ENV['OFFICE_PATH']
 81 |         raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
 82 |         paths.unshift(ENV['OFFICE_PATH'])
 83 |       end
 84 | 
 85 |       # The location of the office executable is OS dependent
 86 |       path_pieces = ["soffice"]
 87 |       if windows?
 88 |         path_pieces += [["program", "soffice.bin"]]
 89 |       elsif osx?
 90 |         path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
 91 |       else
 92 |         path_pieces += [["program", "soffice"]]
 93 |       end
 94 | 
 95 |       # Search for the first suitable office executable
 96 |       # and short circuit an executable is found.
 97 |       paths.each do |path|
 98 |         if File.exist? path
 99 |           @@executable ||= path unless File.directory? path
100 |           path_pieces.each do |pieces|
101 |             check_path = File.join(path, pieces)
102 |             @@executable ||= check_path if File.exist? check_path
103 |           end
104 |         end
105 |         break if @@executable
106 |       end
107 |       raise OfficeNotFound, "No office software found" unless @@executable
108 |       @@executable
109 |     end
110 | 
111 |     # Used to specify the office location for JODConverter
112 |     def office_path
113 |       File.dirname(File.dirname(office_executable))
114 |     end
115 | 
116 |     # Convert documents to PDF.
117 |     def extract(docs, opts)
118 |       out = opts[:output] || '.'
119 |       FileUtils.mkdir_p out unless File.exist?(out)
120 |       [docs].flatten.each do |doc|
121 |         ext = File.extname(doc)
122 |         basename = File.basename(doc, ext)
123 |         escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
124 | 
125 |         if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
126 |           `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
127 |         else
128 |           if libre_office?
129 |             # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
130 |             ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
131 | 
132 |             options = "--headless --invisible  --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
133 |             cmd = "#{office_executable} #{options} 2>&1"
134 |             result = `#{cmd}`.chomp
135 |             raise ExtractionFailed, result if $? != 0
136 |             true
137 |           else # open office presumably, rely on JODConverter to figure it out.
138 |             options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
139 |             run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
140 |           end
141 |         end
142 |       end
143 |     end
144 | 
145 |     # JVM arguments shared by every run_jod invocation.
146 |     CLASSPATH     = ["#{ESCAPED_ROOT}/build", "#{ESCAPED_ROOT}/vendor/'*'"].join(File::PATH_SEPARATOR)
147 |     # Points java.util.logging at the properties file shipped under vendor/.
148 |     LOGGING       = format("-Djava.util.logging.config.file=%s/vendor/logging.properties", ESCAPED_ROOT)
149 |     HEADLESS      = '-Djava.awt.headless=true'
150 | 
151 |     private
152 | 
153 |     # Runs a Java command, with quieted logging, and the classpath set properly.
154 |     def run_jod(command, pdfs, opts, return_output=false)
155 | 
156 |       pdfs   = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
157 |       office = osx? ? "-Doffice.home=#{office_path}" : office_path
158 |       cmd    = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
159 |       result = `#{cmd}`.chomp
160 |       raise ExtractionFailed, result if $? != 0
161 |       return return_output ? (result.empty? ? nil : result) : true
162 |     end
163 | 
164 |     class OfficeNotFound < StandardError; end # raised by office_executable when no office install is found
165 |   end
166 | end
167 | 


--------------------------------------------------------------------------------
/test/fixtures/with_pdf_extension/actually_an_rtf.pdf:
--------------------------------------------------------------------------------
 1 | {\rtf1\ansi\ansicpg1252\uc1 \deff0\deflang1033\deflangfe1033{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
 2 | {\f357\froman\fcharset238\fprq2 Times New Roman CE;}{\f358\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f360\froman\fcharset161\fprq2 Times New Roman Greek;}{\f361\froman\fcharset162\fprq2 Times New Roman Tur;}
 3 | {\f362\froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f363\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f364\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f365\fswiss\fcharset238\fprq2 Arial CE;}
 4 | {\f366\fswiss\fcharset204\fprq2 Arial Cyr;}{\f368\fswiss\fcharset161\fprq2 Arial Greek;}{\f369\fswiss\fcharset162\fprq2 Arial Tur;}{\f370\fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f371\fswiss\fcharset178\fprq2 Arial (Arabic);}
 5 | {\f372\fswiss\fcharset186\fprq2 Arial Baltic;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;
 6 | \red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
 7 | \f1\fs28\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \snext0 Normal;}{\*\cs10 \additive Default Paragraph Font;}{\s15\ql \li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
 8 | \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext15 Normal (Web);}{\*\cs16 \additive \i \sbasedon10 Emphasis;}{\*\cs17 \additive \ul\cf2 \sbasedon10 Hyperlink;}{\*\cs18 \additive \b \sbasedon10 Strong;}}{\info
 9 | {\title The USA -  how\'92s Obama doing}{\author foster}{\operator foster}{\creatim\yr2009\mo10\dy7\hr9\min50}{\revtim\yr2009\mo10\dy7\hr9\min52}{\version1}{\edmins1}{\nofpages1}{\nofwords0}{\nofchars0}{\*\company  home}{\nofcharsws0}{\vern8247}}
10 | \paperw11906\paperh16838 \widowctrl\ftnbj\aenddoc\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1800\dgvorigin1440\dghshow1\dgvshow1
11 | \jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule \fet0\sectd \linex0\headery708\footery708\colsx708\endnhere\sectlinegrid360\sectdefaultcl 
12 | {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta )}}{\*\pnseclvl5
13 | \pndec\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang
14 | {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}\pard\plain \s15\qc \li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
15 | \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\cs16\f1\fs20 The USA -\~ how\rquote s Obama doing?}{\i\f1\fs20 \line }{\cs16\f1\fs20 ODD Circle\~\~ Tue\~ 28th Apl 09\~\~ at The Blue Mugge Pub}{\i\f1\fs20 
16 | \par }\pard \s15\ql \li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\cs16\f1\fs20 Notes using Obama\rquote s book \~The Audacity of Hope\~ and articles from }{\field\fldedit{\*\fldinst {\cs16\f1\fs20  
17 | HYPERLINK "http://www.opendemocracy.net/" }}{\fldrslt {\cs17\i\f1\fs20\ul\cf2 www.opendemocracy.net}}}{\cs16\f1\fs20  }{\i\f1\fs20 \line \line }{\cs16\f1\fs20 1.\~ Hope\rquote  is the word, from his book
18 |  and from his speeches which carries infectious resonance.\~\~ Yet, already\~ Naomi Klein writing in The Nation this month has sign-posted \lquote hopebroken and hopesick\rquote \'85\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~
19 |  What are our hopes, reservations and fears?}{\i\f1\fs20 
20 | \par }{\cs16\f1\fs20 2.\~ Geoffrey Hodgson,\~ director of Reuters\rquote  Foundation Programme, Oxford}{\i\f1\fs20 \line }{\cs16\f1\fs20 \'93
21 | No American president has started with more personal ability or more sheer good-will from around the world than Barack Obama\'85\'94\~\~ }{\i\f1\fs20 \line }{\cs16\f1\fs20 At the recent G20\~ \'93Obama spoke of a new, more subtle, m
22 | ore truthful style of leadership\'94.\~\~ }{\i\f1\fs20 
23 | \par }{\cs16\f1\fs20 3.\~ We\rquote ll seek to list actions, plans and policies outlined since inauguration and assess merits.}{\i\f1\fs20 
24 | \par }{\cs16\f1\fs20 4.\~\~ Seeing Obama from another nation\rquote s perspective:\~\~\~ Daniel Lichanian on\~ France\rquote s Obama fixation:\~ \'93During the past eight years the French thought of their homeland as far superior \'85.\~
25 |  Now they celebrate the USA for\'85 an elevated politics that many fear is unattainable in France\'94.\~\~\~ }{\i\f1\fs20 \line \line }{\cs16\f1\fs20 5.\~ OpenDemocracy\~ sought views from their authors from around the world on these thre
26 | e questions regarding Obama:\~ a)\~\~ one thing you hope for\~\~\~\~ b)\~ one thing you fear\~\~ c) one piece of advice.\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~ Here\rquote s a summary:\~\~\~ }{\i\f1\fs20 
27 | \par }{\cs16\f1\fs20 From a }{\cs18\b\i\f1\fs20 British}{\cs16\f1\fs20  professor:\~\~\~ hope;\~\~ \lquote take immediate and sustained action on climate change\rquote ;\~\~ fear\'85\~\~\~ \lquote unable to break free of past policy on Israel and Afghanistan
28 | \rquote ;\~\~ advice\'85 \lquote play it long\rquote }{\i\f1\fs20 
29 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 African}{\cs16\f1\fs20  Foundation for Development:\~\~\~ hope\'85\~ \lquote at last, a formal apology for slavery and dispossession of native Americans \endash  the two original sins of the Republic\rquote ;\~\~\~
30 |  fear\'85\~ \lquote business as usual\rquote ;\~\~\~ advice\'85\~ \lquote Trust your instincts\'85your appeal for change\rquote .}{\i\f1\fs20 
31 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Germany}{\cs16\f1\fs20 :\~\~ hope\'85\~ \lquote the USA actively engages in conflict resolution, starting in the middle-east.\rquote \~\~\~\~\~ fear\'85\~ \lquote protectionist tendencies\'85 US spe
32 | cial interest groups and other international players\'85\rquote  deflecting, de-railing Obama.\~\~ advice:\~\~ \lquote Pursue global policies in the most inclusive way\rquote .}{\i\f1\fs20 
33 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Egyp}{\cs16\f1\fs20 t:\~\~ hope\~ \lquote steer a course\'85 energetic and ambitious \'85 but not aggressive;\~\~ fear:\~ \lquote could yield to the \lquote wounded lion\rquote  impulse in US politics;\~ advice:\~ 
34 | \lquote be yourself\rquote .}{\i\f1\fs20 
35 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Japan}{\cs16\f1\fs20 :\~\~ hope:\~\~ \lquote the world is in fact round\'85 there are people living beyond America\rquote s horizons\rquote ;\~\~ fear:\~ \lquote Obama\rquote 
36 | s America regains confidence in the wrong way\rquote  ;\~ advice:}{\i\f1\fs20 \line }{\cs16\f1\fs20 \lquote he remains true to his acceptance speech\~\~ \'93I\rquote ll always be honest with you\'94.}{\i\f1\fs20 
37 | \par }{\cs16\f1\fs20 5.\~ A quote from The Audacity of Hope:\~ at the end of the }{\cs16\f1\fs20\ul Politics}{\cs16\f1\fs20  chapter:}{\i\f1\fs20 
38 | \par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f1\fs28\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\cs16\fs20 \~\~ \'93\'85 in a democracy, the most important office is the office of citizen\'94.}{
39 | \par }}


--------------------------------------------------------------------------------
/test/fixtures/obama_hopes.rtf:
--------------------------------------------------------------------------------
 1 | {\rtf1\ansi\ansicpg1252\uc1 \deff0\deflang1033\deflangfe1033{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
 2 | {\f357\froman\fcharset238\fprq2 Times New Roman CE;}{\f358\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f360\froman\fcharset161\fprq2 Times New Roman Greek;}{\f361\froman\fcharset162\fprq2 Times New Roman Tur;}
 3 | {\f362\froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f363\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f364\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f365\fswiss\fcharset238\fprq2 Arial CE;}
 4 | {\f366\fswiss\fcharset204\fprq2 Arial Cyr;}{\f368\fswiss\fcharset161\fprq2 Arial Greek;}{\f369\fswiss\fcharset162\fprq2 Arial Tur;}{\f370\fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f371\fswiss\fcharset178\fprq2 Arial (Arabic);}
 5 | {\f372\fswiss\fcharset186\fprq2 Arial Baltic;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;
 6 | \red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
 7 | \f1\fs28\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \snext0 Normal;}{\*\cs10 \additive Default Paragraph Font;}{\s15\ql \li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
 8 | \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext15 Normal (Web);}{\*\cs16 \additive \i \sbasedon10 Emphasis;}{\*\cs17 \additive \ul\cf2 \sbasedon10 Hyperlink;}{\*\cs18 \additive \b \sbasedon10 Strong;}}{\info
 9 | {\title The USA -  how\'92s Obama doing}{\author foster}{\operator foster}{\creatim\yr2009\mo10\dy7\hr9\min50}{\revtim\yr2009\mo10\dy7\hr9\min52}{\version1}{\edmins1}{\nofpages1}{\nofwords0}{\nofchars0}{\*\company  home}{\nofcharsws0}{\vern8247}}
10 | \paperw11906\paperh16838 \widowctrl\ftnbj\aenddoc\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1800\dgvorigin1440\dghshow1\dgvshow1
11 | \jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule \fet0\sectd \linex0\headery708\footery708\colsx708\endnhere\sectlinegrid360\sectdefaultcl 
12 | {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta )}}{\*\pnseclvl5
13 | \pndec\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang
14 | {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}\pard\plain \s15\qc \li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
15 | \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\cs16\f1\fs20 The USA -\~ how\rquote s Obama doing?}{\i\f1\fs20 \line }{\cs16\f1\fs20 ODD Circle\~\~ Tue\~ 28th Apl 09\~\~ at The Blue Mugge Pub}{\i\f1\fs20 
16 | \par }\pard \s15\ql \li0\ri0\sb100\sa100\sbauto1\saauto1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\cs16\f1\fs20 Notes using Obama\rquote s book \~The Audacity of Hope\~ and articles from }{\field\fldedit{\*\fldinst {\cs16\f1\fs20  
17 | HYPERLINK "http://www.opendemocracy.net/" }}{\fldrslt {\cs17\i\f1\fs20\ul\cf2 www.opendemocracy.net}}}{\cs16\f1\fs20  }{\i\f1\fs20 \line \line }{\cs16\f1\fs20 1.\~ Hope\rquote  is the word, from his book
18 |  and from his speeches which carries infectious resonance.\~\~ Yet, already\~ Naomi Klein writing in The Nation this month has sign-posted \lquote hopebroken and hopesick\rquote \'85\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~
19 |  What are our hopes, reservations and fears?}{\i\f1\fs20 
20 | \par }{\cs16\f1\fs20 2.\~ Geoffrey Hodgson,\~ director of Reuters\rquote  Foundation Programme, Oxford}{\i\f1\fs20 \line }{\cs16\f1\fs20 \'93
21 | No American president has started with more personal ability or more sheer good-will from around the world than Barack Obama\'85\'94\~\~ }{\i\f1\fs20 \line }{\cs16\f1\fs20 At the recent G20\~ \'93Obama spoke of a new, more subtle, m
22 | ore truthful style of leadership\'94.\~\~ }{\i\f1\fs20 
23 | \par }{\cs16\f1\fs20 3.\~ We\rquote ll seek to list actions, plans and policies outlined since inauguration and assess merits.}{\i\f1\fs20 
24 | \par }{\cs16\f1\fs20 4.\~\~ Seeing Obama from another nation\rquote s perspective:\~\~\~ Daniel Lichanian on\~ France\rquote s Obama fixation:\~ \'93During the past eight years the French thought of their homeland as far superior \'85.\~
25 |  Now they celebrate the USA for\'85 an elevated politics that many fear is unattainable in France\'94.\~\~\~ }{\i\f1\fs20 \line \line }{\cs16\f1\fs20 5.\~ OpenDemocracy\~ sought views from their authors from around the world on these thre
26 | e questions regarding Obama:\~ a)\~\~ one thing you hope for\~\~\~\~ b)\~ one thing you fear\~\~ c) one piece of advice.\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~\~ Here\rquote s a summary:\~\~\~ }{\i\f1\fs20 
27 | \par }{\cs16\f1\fs20 From a }{\cs18\b\i\f1\fs20 British}{\cs16\f1\fs20  professor:\~\~\~ hope;\~\~ \lquote take immediate and sustained action on climate change\rquote ;\~\~ fear\'85\~\~\~ \lquote unable to break free of past policy on Israel and Afghanistan
28 | \rquote ;\~\~ advice\'85 \lquote play it long\rquote }{\i\f1\fs20 
29 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 African}{\cs16\f1\fs20  Foundation for Development:\~\~\~ hope\'85\~ \lquote at last, a formal apology for slavery and dispossession of native Americans \endash  the two original sins of the Republic\rquote ;\~\~\~
30 |  fear\'85\~ \lquote business as usual\rquote ;\~\~\~ advice\'85\~ \lquote Trust your instincts\'85your appeal for change\rquote .}{\i\f1\fs20 
31 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Germany}{\cs16\f1\fs20 :\~\~ hope\'85\~ \lquote the USA actively engages in conflict resolution, starting in the middle-east.\rquote \~\~\~\~\~ fear\'85\~ \lquote protectionist tendencies\'85 US spe
32 | cial interest groups and other international players\'85\rquote  deflecting, de-railing Obama.\~\~ advice:\~\~ \lquote Pursue global policies in the most inclusive way\rquote .}{\i\f1\fs20 
33 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Egyp}{\cs16\f1\fs20 t:\~\~ hope\~ \lquote steer a course\'85 energetic and ambitious \'85 but not aggressive;\~\~ fear:\~ \lquote could yield to the \lquote wounded lion\rquote  impulse in US politics;\~ advice:\~ 
34 | \lquote be yourself\rquote .}{\i\f1\fs20 
35 | \par }{\cs16\f1\fs20 From }{\cs18\b\i\f1\fs20 Japan}{\cs16\f1\fs20 :\~\~ hope:\~\~ \lquote the world is in fact round\'85 there are people living beyond America\rquote s horizons\rquote ;\~\~ fear:\~ \lquote Obama\rquote 
36 | s America regains confidence in the wrong way\rquote  ;\~ advice:}{\i\f1\fs20 \line }{\cs16\f1\fs20 \lquote he remains true to his acceptance speech\~\~ \'93I\rquote ll always be honest with you\'94.}{\i\f1\fs20 
37 | \par }{\cs16\f1\fs20 5.\~ A quote from The Audacity of Hope:\~ at the end of the }{\cs16\f1\fs20\ul Politics}{\cs16\f1\fs20  chapter:}{\i\f1\fs20 
38 | \par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f1\fs28\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\cs16\fs20 \~\~ \'93\'85 in a democracy, the most important office is the office of citizen\'94.}{
39 | \par }}


--------------------------------------------------------------------------------
/vendor/conf/document-formats.js:
--------------------------------------------------------------------------------
  1 | [
  2 |   {
  3 |     "name": "Portable Document Format",
  4 |     "extension": "pdf",
  5 |     "mediaType": "application/pdf",
  6 |     "storePropertiesByFamily": {
  7 |       "DRAWING": {"FilterName": "draw_pdf_Export"},
  8 |       "SPREADSHEET": {"FilterName": "calc_pdf_Export"},
  9 |       "PRESENTATION": {"FilterName": "impress_pdf_Export"},
 10 |       "TEXT": {"FilterName": "writer_pdf_Export"}
 11 |     }
 12 |   },
 13 |   {
 14 |     "name": "Macromedia Flash",
 15 |     "extension": "swf",
 16 |     "mediaType": "application/x-shockwave-flash",
 17 |     "storePropertiesByFamily": {
 18 |       "DRAWING": {"FilterName": "draw_flash_Export"},
 19 |       "PRESENTATION": {"FilterName": "impress_flash_Export"}
 20 |     }
 21 |   },
 22 |   {
 23 |     "name": "HTML",
 24 |     "extension": "html",
 25 |     "mediaType": "text/html",
 26 |     "inputFamily": "TEXT",
 27 |     "storePropertiesByFamily": {
 28 |       "SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
 29 |       "PRESENTATION": {"FilterName": "impress_html_Export"},
 30 |       "TEXT": {"FilterName": "HTML (StarWriter)"}
 31 |     }
 32 |   },
 33 |   {
 34 |     "name": "OpenDocument Text",
 35 |     "extension": "odt",
 36 |     "mediaType": "application/vnd.oasis.opendocument.text",
 37 |     "inputFamily": "TEXT",
 38 |     "storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
 39 |   },
 40 |   {
 41 |     "name": "OpenOffice.org 1.0 Text Document",
 42 |     "extension": "sxw",
 43 |     "mediaType": "application/vnd.sun.xml.writer",
 44 |     "inputFamily": "TEXT",
 45 |     "storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
 46 |   },
 47 |   {
 48 |     "name": "Microsoft Word",
 49 |     "extension": "doc",
 50 |     "mediaType": "application/msword",
 51 |     "inputFamily": "TEXT",
 52 |     "storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
 53 |   },
 54 |   {
 55 |     "name": "Microsoft Word 2007 XML",
 56 |     "extension": "docx",
 57 |     "mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 58 |     "inputFamily": "TEXT"
 59 |   },
 60 |   {
 61 |     "name": "Rich Text Format",
 62 |     "extension": "rtf",
 63 |     "mediaType": "text/rtf",
 64 |     "inputFamily": "TEXT",
 65 |     "storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
 66 |   },
 67 |   {
 68 |     "name": "WordPerfect",
 69 |     "extension": "wpd",
 70 |     "mediaType": "application/wordperfect",
 71 |     "inputFamily": "TEXT"
 72 |   },
 73 |   {
 74 |     "name": "Plain Text",
 75 |     "extension": "txt",
 76 |     "mediaType": "text/plain",
 77 |     "inputFamily": "TEXT",
 78 |     "loadProperties": {
 79 |       "FilterName": "Text (encoded)",
 80 |       "FilterOptions": "utf8"
 81 |     },
 82 |     "storePropertiesByFamily": {"TEXT": {
 83 |       "FilterName": "Text (encoded)",
 84 |       "FilterOptions": "utf8"
 85 |     }}
 86 |   },
 87 |   {
 88 |     "name": "MediaWiki wikitext",
 89 |     "extension": "wiki",
 90 |     "mediaType": "text/x-wiki",
 91 |     "storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
 92 |   },
 93 |   {
 94 |     "name": "OpenDocument Spreadsheet",
 95 |     "extension": "ods",
 96 |     "mediaType": "application/vnd.oasis.opendocument.spreadsheet",
 97 |     "inputFamily": "SPREADSHEET",
 98 |     "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
 99 |   },
100 |   {
101 |     "name": "OpenOffice.org 1.0 Spreadsheet",
102 |     "extension": "sxc",
103 |     "mediaType": "application/vnd.sun.xml.calc",
104 |     "inputFamily": "SPREADSHEET",
105 |     "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
106 |   },
107 |   {
108 |     "name": "Microsoft Excel",
109 |     "extension": "xls",
110 |     "mediaType": "application/vnd.ms-excel",
111 |     "inputFamily": "SPREADSHEET",
112 |     "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
113 |   },
114 |   {
115 |     "name": "Microsoft Excel 2007 XML",
116 |     "extension": "xlsx",
117 |     "mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
118 |     "inputFamily": "SPREADSHEET"
119 |   },
120 |   {
121 |     "name": "Comma Separated Values",
122 |     "extension": "csv",
123 |     "mediaType": "text/csv",
124 |     "inputFamily": "SPREADSHEET",
125 |     "loadProperties": {
126 |       "FilterName": "Text - txt - csv (StarCalc)",
127 |       "FilterOptions": "44,34,0"
128 |     },
129 |     "storePropertiesByFamily": {"SPREADSHEET": {
130 |       "FilterName": "Text - txt - csv (StarCalc)",
131 |       "FilterOptions": "44,34,0"
132 |     }}
133 |   },
134 |   {
135 |     "name": "Tab Separated Values",
136 |     "extension": "tsv",
137 |     "mediaType": "text/tab-separated-values",
138 |     "inputFamily": "SPREADSHEET",
139 |     "loadProperties": {
140 |       "FilterName": "Text - txt - csv (StarCalc)",
141 |       "FilterOptions": "9,34,0"
142 |     },
143 |     "storePropertiesByFamily": {"SPREADSHEET": {
144 |       "FilterName": "Text - txt - csv (StarCalc)",
145 |       "FilterOptions": "9,34,0"
146 |     }}
147 |   },
148 |   {
149 |     "name": "OpenDocument Presentation",
150 |     "extension": "odp",
151 |     "mediaType": "application/vnd.oasis.opendocument.presentation",
152 |     "inputFamily": "PRESENTATION",
153 |     "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
154 |   },
155 |   {
156 |     "name": "OpenOffice.org 1.0 Presentation",
157 |     "extension": "sxi",
158 |     "mediaType": "application/vnd.sun.xml.impress",
159 |     "inputFamily": "PRESENTATION",
160 |     "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
161 |   },
162 |   {
163 |     "name": "Microsoft PowerPoint",
164 |     "extension": "ppt",
165 |     "mediaType": "application/vnd.ms-powerpoint",
166 |     "inputFamily": "PRESENTATION",
167 |     "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
168 |   },
169 |   {
170 |     "name": "Microsoft PowerPoint 2007 XML",
171 |     "extension": "pptx",
172 |     "mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173 |     "inputFamily": "PRESENTATION"
174 |   },
175 |   {
176 |     "name": "OpenDocument Drawing",
177 |     "extension": "odg",
178 |     "mediaType": "application/vnd.oasis.opendocument.graphics",
179 |     "inputFamily": "DRAWING",
180 |     "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
181 |   },
182 |   {
183 |     "name": "Scalable Vector Graphics",
184 |     "extension": "svg",
185 |     "mediaType": "image/svg+xml",
186 |     "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
187 |   },
188 |   {
189 |     "name": "Portable Network Graphic",
190 |     "extension": "png",
191 |     "mediaType": "image/png",
192 |     "storePropertiesByFamily": {
193 |       "DRAWING": {"FilterName": "draw_png_Export"},
194 |       "PRESENTATION": {"FilterName": "impress_png_Export"}
195 |     }
196 |   },
197 |   {
198 |     "name": "Graphics Interchange Format",
199 |     "extension": "gif",
200 |     "mediaType": "image/gif",
201 |     "storePropertiesByFamily": {
202 |       "DRAWING": {"FilterName": "draw_gif_Export"},
203 |       "PRESENTATION": {"FilterName": "impress_gif_Export"}
204 |     }
205 |   },
206 |   {
207 |     "name": "Joint Photographic Experts Group",
208 |     "extension": "jpg",
209 |     "mediaType": "image/jpeg",
210 |     "storePropertiesByFamily": {
211 |       "DRAWING": {"FilterName": "draw_jpg_Export"},
212 |       "PRESENTATION": {"FilterName": "impress_jpg_Export"}
213 |     }
214 |   },
215 |   {
216 |     "name": "Windows Bitmap",
217 |     "extension": "bmp",
218 |     "mediaType": "image/bmp",
219 |     "storePropertiesByFamily": {
220 |       "DRAWING": {"FilterName": "draw_bmp_Export"},
221 |       "PRESENTATION": {"FilterName": "impress_bmp_Export"}
222 |     }
223 |   },
224 |   {
225 |     "name": "Tagged Image File Format",
226 |     "extension": "tif",
227 |     "mediaType": "image/tiff",
228 |     "storePropertiesByFamily": {
229 |       "DRAWING": {"FilterName": "draw_tif_Export"},
230 |       "PRESENTATION": {"FilterName": "impress_tif_Export"}
231 |     }
232 |   }
233 | ]
234 | 


--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE HTML>
  2 | <html>
  3 | <head>
  4 |   <meta http-equiv="content-type" content="text/html;charset=UTF-8" />
  5 |   <meta http-equiv="X-UA-Compatible" content="chrome=1">
  6 |   <title>Doc&#9889;split</title>
  7 |   <style>
  8 |     body {
  9 |       font-size: 16px;
 10 |       line-height: 24px;
 11 |       background: #fffff5;
 12 |       color: #333300;
 13 |       font-family: Arial;
 14 |       font-family: "Palatino Linotype", "Book Antiqua", Palatino, FreeSerif, serif;
 15 |     }
 16 |     div.container {
 17 |       width: 720px;
 18 |       margin: 50px 0 50px 50px;
 19 |     }
 20 |     p, li {
 21 |       margin: 16px 0 16px 0;
 22 |       width: 550px;
 23 |     }
 24 |       p.break {
 25 |         margin-top: 35px;
 26 |       }
 27 |     a, a:visited {
 28 |       padding: 0 2px;
 29 |       text-decoration: none;
 30 |       background: #f7f7bb;
 31 |       color: #333300;
 32 |     }
 33 |     a:active, a:hover {
 34 |       color: #000;
 35 |       background: #ffff88;
 36 |     }
 37 |     h1, h2, h3, h4, h5, h6 {
 38 |       margin-top: 40px;
 39 |     }
 40 |     b.header {
 41 |       font-size: 18px;
 42 |     }
 43 |     span.alias {
 44 |       font-size: 14px;
 45 |       font-style: italic;
 46 |       margin-left: 20px;
 47 |     }
 48 |     table {
 49 |       margin: 16px 0; padding: 0;
 50 |     }
 51 |       tr, td {
 52 |         margin: 0; padding: 0;
 53 |       }
 54 |         td {
 55 |           padding: 9px 15px 9px 0;
 56 |         }
 57 |           td.definition {
 58 |             line-height: 18px;
 59 |             font-size: 14px;
 60 |           }
 61 |     code, pre, tt {
 62 |       font-family: Monaco, Consolas, "Lucida Console", monospace;
 63 |       font-size: 12px;
 64 |       line-height: 18px;
 65 |       color: #444;
 66 |     }
 67 |       code {
 68 |         margin-left: 20px;
 69 |       }
 70 |       pre {
 71 |         font-size: 12px;
 72 |         padding: 2px 0 2px 12px;
 73 |         border-left: 6px solid #da304d;
 74 |         margin: 0px 0 10px;
 75 |       }
 76 |         li pre {
 77 |           padding: 0;
 78 |           border-left: 0;
 79 |           margin: 6px 0 6px 0;
 80 |         }
 81 |     #diagram {
 82 |       margin: 20px 0 0 0;
 83 |     }
 84 |   </style>
 85 | </head>
 86 | <body>
 87 | 
 88 |   <div class="container">
 89 | 
 90 |     <h1>Doc<sub style=""><img style="width:24pt" src="noto_bolt.svg"></sub>split</h1>
 91 | 
 92 |     <p>
 93 |       <a href="http://github.com/documentcloud/docsplit/">Docsplit</a>
 94 |       is a command-line utility and Ruby library for splitting apart
 95 |       documents into their component parts: searchable UTF-8 <b>plain text</b>
 96 |       via OCR if necessary, page <b>images</b> or thumbnails in any format,
 97 |       <b>PDFs</b>, single <b>pages</b>, and document <b>metadata</b>
 98 |       (title, author, number of pages...)
 99 |     </p>
100 | 
101 |     <p>Docsplit is currently at <a href="http://rubygems.org/gems/docsplit">version 0.7.6</a>.</p>
102 | 
103 |     <p>
104 |       <i>Docsplit is an open-source component of <a href="http://documentcloud.org/">DocumentCloud</a>.</i>
105 |     </p>
106 | 
107 |     <p>
108 |       <a href="#installation">Installation &amp; Dependencies</a> |
109 |       <a href="#usage">Usage</a> |
110 |       <a href="#internals">Internals</a> |
111 |       <a href="#changes">Change Log</a>
112 |     </p>
113 | 
114 |     <h2 id="installation">Installation &amp; Dependencies</h2>
115 | 
116 |     <ol>
117 |       <li>
118 |         Grab the gem:<br />
119 |         <tt>gem install docsplit</tt>
120 |       </li>
121 |       <li>
122 |         Install <a href="http://www.graphicsmagick.org/">GraphicsMagick</a>.
123 |         Its &lsquo;<b>gm</b>&rsquo; command is used to generate images.<br />
124 |         Either compile it from
125 |         <a href="http://sourceforge.net/projects/graphicsmagick/files/">source</a>,
126 |         or use a package manager:
127 | <pre>
128 | [aptitude | port | brew] install graphicsmagick</pre>
129 |       </li>
130 |       <li>
131 |         Install <a href="http://poppler.freedesktop.org/">Poppler</a>.
132 |         On Linux, use <b>aptitude</b>, <b>apt-get</b> or <b>yum</b>:<br />
133 |         <tt>aptitude install poppler-utils poppler-data</tt><br />
134 |         On the Mac, you can install from source or use <b>MacPorts</b> or <b>Homebrew</b>:<br />
135 |         <tt>sudo port install poppler | brew install poppler</tt><br />
136 |       </li>
137 |       <li>
138 |         (Optional) Install <a href="http://www.ghostscript.com/">Ghostscript</a>:<br />
139 |         <tt>[aptitude | port | brew] install ghostscript</tt><br />
140 |         Ghostscript is required to convert PDF and PostScript files.
141 |       </li>
142 |       <li>
143 |         (Optional) Install <a href="http://code.google.com/p/tesseract-ocr/">Tesseract</a>:<br />
144 |         <tt>[aptitude | port | brew] install [tesseract | tesseract-ocr]</tt><br />
145 |         Without Tesseract installed, you'll still be able to extract text from
146 |         documents, but you won't be able to automatically OCR them.
147 |       </li>
148 |       <li>
149 |         (Optional) Install <a href="http://www.accesspdf.com/pdftk/">pdftk</a>.
150 |         On Linux, use <b>aptitude</b>, <b>apt-get</b> or <b>yum</b>:<br />
151 |         <tt>aptitude install pdftk</tt><br />
152 |         On the Mac, you can <a href="https://www.pdflabs.com/tools/pdftk-server/">download a recent installer</a> for the binary.
153 |         Without <b>pdftk</b> installed, you can use Docsplit, but won't be able
154 |         to split apart a multi-page PDF into single-page PDFs.
155 |       </li>
156 |       <li>
157 |         (Optional) Install <a href="http://www.libreoffice.org/">LibreOffice</a>.
158 |         On Linux, use <b>aptitude</b>, <b>apt-get</b> or <b>yum</b>:<br />
159 |         <tt>aptitude install libreoffice</tt><br />
160 |         On the Mac, download and install <a href="http://www.libreoffice.org/download">the latest release</a>.
161 |       </li>
162 |       <li>
163 |         (Optional) Install fonts to process documents that use <a href="https://help.ubuntu.com/community/Fonts#Chinese.2C_Japanese.2C_and_Korean_Fonts">Chinese, Japanese, and Korean Fonts</a>.
164 |         On Linux, use <b>aptitude</b>, <b>apt-get</b> or <b>yum</b>:<br />
165 |         <tt>aptitude install ttf-wqy-microhei ttf-wqy-zenhei ttf-kochi-gothic ttf-kochi-mincho fonts-nanum</tt><br />
166 |         On the Mac, the fonts should already be present. However, you can always download the TTF files and install them using <a href="http://support.apple.com/en-us/HT201749">Font Book</a>.
167 |       </li>
168 |     </ol>
169 | 
170 |     <p><i>
171 |         Note: the gem will take a minute to download &mdash; the
172 |         JODConverter jar file tips the scales at 2MB.
173 |     </i></p>
174 | 
175 |     <h2 id="usage">Usage</h2>
176 | 
177 |     <p>
178 |       The Docsplit gem includes both the <tt>docsplit</tt> command-line utility
179 |       and a Ruby API. The available commands and options are identical in both.<br />
180 |       <tt>--output</tt> or <tt>-o</tt> can be passed to any command in order to
181 |       store the generated files in a directory of your choosing.
182 |     </p>
183 | 
184 |     <p>
185 |       <b class="header">images</b><code>--size --format --pages --density</code>
186 |       <span class="alias">Ruby: <b>extract_images</b></span>
187 |       <br />
188 |       Generates an image for each page in the document at the specified resolution
189 |       and format. Pass <tt>--pages</tt> or <tt>-p</tt> to choose the specific pages to
190 |       image. Passing<br /> <tt>--size</tt> or <tt>-s</tt> will specify the desired
191 |       image resolution, <tt>--density</tt> or <tt>-d</tt> will specify the DPI to rasterize the images
192 |       at during conversion by GraphicsMagick, and <tt>--format</tt> or <tt>-f</tt>
193 |       will select the format of the final images.
194 |     </p>
195 | <pre>
196 | docsplit images example.pdf
197 | docsplit images docs/*.pdf --size 700x,50x50 --format gif --pages 3,10-15,42</pre>
198 | <pre>
199 | Docsplit.extract_images('example.doc', :size => '1000x', :format => [:png, :jpg])</pre>
200 | 
201 |     <p class="break">
202 |       <b class="header">text</b><code>--pages --ocr --no-ocr --no-clean --language --no-orientation-detection</code>
203 |       <span class="alias">Ruby: <b>extract_text</b></span>
204 |       <br />
205 |       Extract the complete <b>UTF-8</b>-encoded plain text of a document to a
206 |       single file. If you'd like to extract the text for each page separately,
207 |       pass <tt>--pages all</tt>. You can use the <tt>--ocr</tt> and <tt>--no-ocr</tt>
208 |       flags to force OCR, or disable it, respectively. By default (if Tesseract is installed)
209 |       Docsplit will OCR the text of each page for which it fails to extract text
210 |       directly from the document. Docsplit will also attempt to clean up garbage
211 |       characters in the OCR'd text &mdash; to disable this, pass the
212 |       <tt>--no-clean</tt> flag.
213 |     </p>
214 |     <p>
215 |       By default, Tesseract ships only with English extraction data.
216 |       If <a href="https://code.google.com/p/tesseract-ocr/downloads/list">
217 |       any additional language models</a> are installed you can select one using
218 |       the <tt>--language</tt> flag.
219 |       
220 |       If <a href="https://code.google.com/p/tesseract-ocr/downloads/detail?name=tesseract-ocr-3.01.osd.tar.gz&amp;can=2&amp;q=">
221 |       Tesseract's orientation detection model</a> is installed, Docsplit will automatically use it
222 |       unless you specify not to with the <tt>--no-orientation-detection</tt> flag.
223 |     </p>
224 | <pre>
225 | docsplit text path/to/doc.pdf --pages all --language deu</pre>
226 | <pre>
227 | docs = Dir['storage/originals/*.doc']
228 | Docsplit.extract_text(docs, :ocr => false, :output => 'storage/text')</pre>
229 | 
230 |     <p class="break">
231 |       <b class="header">pages</b><code>--pages</code>
232 |       <span class="alias">Ruby: <b>extract_pages</b></span>
233 |       <br />
234 |       Burst apart a document into single-page PDFs. Use <tt>--pages</tt> to
235 |       specify the individual pages (or ranges of pages) you'd like to generate.
236 |     </p>
237 | <pre>
238 | docsplit pages path/to/doc.pdf --pages 1-10</pre>
239 | <pre>
240 | Docsplit.extract_pages('path/to/presentation.ppt')
241 | Docsplit.extract_pages('doc.pdf', :pages => 1..10)</pre>
242 | 
243 |     <p class="break">
244 |       <b class="header">pdf</b>
245 |       <span class="alias">Ruby: <b>extract_pdf</b></span>
246 |       <br />
247 |       Convert documents into PDFs. Any type of document that LibreOffice can read
248 |       may be converted. These include the Microsoft Office formats: <b>doc</b>, <b>docx</b>, <b>ppt</b>,
249 |       <b>xls</b> and so on, as well as <b>html</b>, <b>odf</b>, <b>rtf</b>, <b>swf</b>, <b>svg</b>, and <b>wpd</b>.
250 |       The first time that you convert a new file type, LibreOffice will lazy-load
251 |       the code that processes it &mdash; subsequent conversions will be much faster.
252 |     </p>
253 | <pre>
254 | docsplit pdf documentation/*.html</pre>
255 | <pre>
256 | Docsplit.extract_pdf('expense_report.xls')</pre>
257 | 
258 |     <p class="break">
259 |       <b class="header">author, date, creator, keywords, producer, subject, title, length</b><br />
260 |       <small><i>Ruby: <b>extract_...</b></i></small>
261 |       <br />
262 |       Retrieve a piece of metadata about the document. The <tt>docsplit</tt>
263 |       utility will print to <b>stdout</b>, the Ruby API will return the value.
264 |     </p>
265 | <pre>
266 | docsplit title path/to/stooges.pdf
267 | =&gt; Disorder in the Court</pre>
268 | <pre>
269 | Docsplit.extract_length('path/to/stooges.pdf')
270 | =&gt; 36</pre>
271 | 
272 | 
273 |     <h2 id="internals">Internals</h2>
274 | 
275 |     <p>
276 |       Under the hood, Docsplit is a thin wrapper around the excellent
277 |       <a href="http://www.graphicsmagick.org/">GraphicsMagick</a>,
278 |       <a href="http://poppler.freedesktop.org/">Poppler</a>,
279 |       <a href="http://www.accesspdf.com/pdftk/">PDFTK</a>,
280 |       <a href="http://code.google.com/p/tesseract-ocr/">Tesseract</a>, and
281 |       <a href="http://www.libreoffice.org/">LibreOffice</a> libraries.
282 |       Poppler is used to extract text and metadata from PDF documents,
283 |       PDFTK is used to split them apart into pages, and GraphicsMagick is used to generate
284 |       the page images (internally, it's rendering them with
285 |       <a href="http://pages.cs.wisc.edu/~ghost/doc/GPL/index.htm">Ghostscript</a>).
286 |       LibreOffice and GraphicsMagick convert documents and images to PDF.
287 |       Tesseract provides the transparent OCR fallback support, if the document
288 |       is a simple scan, and the file doesn't contain any embedded text.
289 |     </p>
290 | 
291 |     <p>
292 |       Because documents need to be in PDF format before any metadata, text,
293 |       or images are extracted, it's faster to use <tt>docsplit pdf</tt>
294 |       to convert them up front, if you're planning to run more than one extraction.
295 |       Otherwise Docsplit will write out the PDF version to a temporary file before
296 |       proceeding with each command.
297 |     </p>
298 | 
299 |     <h2 id="changes">Change Log</h2>
300 | 
301 |     <p>
302 |       <b class="header">0.7.6</b><small> &ndash; Nov. 16, 2014</small><br />
303 |       Docsplit will now automatically use Tesseract's orientation detection model
304 |       if it is installed.
305 |     </p>
306 | 
307 |     <p>
308 |       <b class="header">0.7.5</b><small> &ndash; May 28, 2014</small><br />
309 |       Docsplit will detect PDFs regardless of extension using magic number-based
310 |       detection.
311 |     </p>
312 | 
313 |     <p>
314 |       <b class="header">0.7.2</b><small> &ndash; Feb. 23, 2013</small><br />
315 |       Bug fixes for LibreOffice support.
316 |     </p>
317 | 
318 |     <p>
319 |       <b class="header">0.7.0</b><small> &ndash; Feb. 23, 2013</small><br />
320 |       Docsplit now expresses a preference for LibreOffice over OpenOffice, with
321 |       an eye to removing JODConverter and OpenOffice support in future versions
322 |       (direct LibreOffice support is substantially faster than JODConverter).
323 |       Improved unicode support now correctly collects non-ascii characters from
324 |       pdfinfo.
325 |     </p>
326 | 
327 |     <p>
328 |       <b class="header">0.6.4</b><small> &ndash; Nov. 12, 2012</small><br />
329 |       Added a language flag for the Docsplit command line, fixed several bugs,
330 |       and began preparations for the deprecation of pdftk.
331 |     </p>
332 | 
333 |     <p>
334 |       <b class="header">0.6.2</b><small> &ndash; Nov. 22, 2011</small><br />
335 |       Bugfix to escape document names during file type detection.
336 |     </p>
337 | 
338 |     <p>
339 |       <b class="header">0.6.1</b><small> &ndash; Nov. 18, 2011</small><br />
340 |       Docsplit now supports converting documents using LibreOffice
341 |       as well as OpenOffice, through JODConverter 3.0 beta4.
342 |     </p>
343 | 
344 |     <p>
345 |       <b class="header">0.6.0</b><small> &ndash; Sept. 13, 2011</small><br />
346 |       Docsplit should now handle shelling out for documents with arbitrary
347 |       characters in their filenames correctly, thanks to a series of
348 |       epic patches from Vladimir Rybas.
349 |       A <tt>--density</tt> option was added for specifying the resolution of
350 |       rasterization when generating images from documents.
351 |       The image resolution for OCR has been doubled from 200 to 400 DPI &mdash;
352 |       this shouldn't make a noticeable difference for normal docs, but will make
353 |       a world of difference for the fine print.
354 |       Docsplit now uses GraphicsMagick's <tt>--despeckle</tt> before OCR.
355 |     </p>
356 | 
357 |     <p>
358 |       <b class="header">0.5.2</b><small> &ndash; May 13, 2011</small><br />
359 |       For transparent conversion to PDF, made Docsplit prefer GraphicsMagick
360 |       over OpenOffice, when the file format is one that GraphicsMagick is able
361 |       to read: (png, gif, jpg, jpeg, tif, tiff, bmp, pnm, ppm, svg, eps).
362 |     </p>
363 | 
364 |     <p>
365 |       <b class="header">0.5.1</b><small> &ndash; April 26, 2011</small><br />
366 |       Minor tweaks to the <tt>TextCleaner</tt> to be more lenient about acronyms
367 |       with hyphens, and words with four vowels in a row.
368 |     </p>
369 | 
370 |     <p>
371 |       <b class="header">0.5.0</b><br />
372 |       Added a <tt>Docsplit::TextCleaner</tt> class which is used to post-process
373 |       OCR'd text, and remove garbage characters that are created when Tesseract
374 |       encounters non-English text. To disable the cleanup, pass <tt>--no-clean</tt>.
375 |     </p>
376 | 
377 |     <p>
378 |       <b class="header">0.4.1</b><br />
379 |       Upgraded the JODConverter dependency for PDF conversion via OpenOffice to
380 |       3.0 beta. Added PNG, GIF, TIF, JPG, and BMP to the list of supported
381 |       formats.
382 |     </p>
383 | 
384 |     <p>
385 |       <b class="header">0.3.4</b><br />
386 |       Adding a suggested optimization from the GraphicsMagick list -- only ever
387 |       generate one page image per GraphicsMagick call. Saves large amounts of
388 |       disk space for tempfiles on long documents.
389 |     </p>
390 | 
391 |     <p>
392 |       <b class="header">0.3.3</b><br />
393 |       Start using the MAGICK_TMPDIR environment variable to prevent parallel
394 |       Docsplit runs from having the potential to clobber each other's temporary
395 |       image files.
396 |     </p>
397 | 
398 |     <p>
399 |       <b class="header">0.3.1</b><br />
400 |       Added a memory limit to GraphicsMagick while generating the TIFFs for
401 |       Tesseract OCR -- prevents <tt>gm</tt> from gobbling up all available memory
402 |       on large files.
403 |     </p>
404 | 
405 |     <p>
406 |       <b class="header">0.3.0</b><br />
407 |       OCR support added via Tesseract, and the <tt>--ocr</tt> and <tt>--no-ocr</tt>
408 |       flags. PDFBox is no longer a dependency, and the gem is many megabytes
409 |       lighter for it.
410 |     </p>
411 | 
412 |     <p>
413 |       <b class="header">0.2.0</b><br />
414 |       Moving to Poppler's <tt>pdftotext</tt>. PDFBox had issues with Unicode in PDFs
415 |       and incorrectly split individual pages of text.
416 |     </p>
417 | 
418 |     <p>
419 |       <b class="header">0.1.3</b><br />
420 |       Fixing a bug with specifying explicit page ranges for image extraction.
421 |     </p>
422 | 
423 |     <p>
424 |       <b class="header">0.1.2</b><br />
425 |       Limiting the memory usage of GraphicsMagick to avoid out of memory errors
426 |       on very large PDFs.
427 |     </p>
428 | 
429 |     <p>
430 |       <b class="header">0.1.1</b><br />
431 |       Upgraded for compatibility with GraphicsMagick 1.3.11.
432 |     </p>
433 | 
434 |     <p>
435 |       <b class="header">0.1.0</b><br />
436 |       Initial Docsplit release.
437 |     </p>
438 | 
439 |     <p>
440 |       <br />
441 |       <a href="http://documentcloud.org/" title="A DocumentCloud Project" style="background:none;">
442 |         <img src="http://jashkenas.s3.amazonaws.com/images/a_documentcloud_project.png" alt="A DocumentCloud Project" style="position:relative;left:-10px;" />
443 |       </a>
444 |     </p>
445 | 
446 | 
447 | 
448 |   </div>
449 | 
450 | </body>
451 | </html>
452 | 


--------------------------------------------------------------------------------