├── Gemfile ├── LICENSE.md ├── README.md ├── config.ru ├── db ├── migrations │ ├── 1_initial.rb │ ├── 2_add_size_column.rb │ └── 3_add_document_name_page_count.rb └── tabula-api.db ├── lib ├── tabula_api.rb └── tabula_api │ ├── api.rb │ ├── models.rb │ └── settings.rb ├── tabula-api.gemspec ├── test.db └── test ├── fixtures ├── document_pages.yml ├── documents.yml └── sample.pdf ├── test.db └── test.rb /Gemfile: -------------------------------------------------------------------------------- 1 | source "http://rubygems.org" 2 | gemspec 3 | gem "rake" 4 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (C) 2012-2014 Manuel Aristarán 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Tabula API 2 | ========== 3 | 4 | **Important**: `tabula-api` is not yet functional. 5 | 6 | ## Methods 7 | 8 | ``` 9 | GET /documents 10 | Returns all the documents stored in Tabula 11 | parameters: 12 | POST /documents 13 | Upload a PDF 14 | parameters: 15 | * file: (required) 16 | GET /documents/:uuid 17 | An uploaded document 18 | parameters: 19 | * uuid: 20 | GET /documents/:uuid/document 21 | Download the original PDF 22 | parameters: 23 | * uuid: 24 | DELETE /documents/:uuid 25 | Delete an uploaded document 26 | parameters: 27 | * uuid: 28 | POST /documents/:uuid/tables 29 | Extract tables 30 | parameters: 31 | * uuid: 32 | * coords: (required) 33 | * extraction_method: 34 | DELETE /documents/:uuid/pages/:number 35 | Delete a page from a document 36 | parameters: 37 | * uuid: 38 | * number: (required) 39 | 40 | ``` 41 | 42 | ## Installation 43 | 44 | sqlite3 ../../.tabula/tabula_api.db 45 | bundle exec sequel -m db/migrations/ jdbc:sqlite:../../.tabula/tabula_api.db 46 | 47 | ### Run dev server 48 | 49 | ``` 50 | rackup 51 | ``` 52 | -------------------------------------------------------------------------------- /config.ru: -------------------------------------------------------------------------------- 1 | require 'lib/tabula_api' 2 | use Rack::Sendfile 3 | run TabulaApi::REST 4 | -------------------------------------------------------------------------------- /db/migrations/1_initial.rb: -------------------------------------------------------------------------------- 1 | Sequel.migration do 2 | change do 3 | create_table(:documents) do 4 | String :uuid, :length => 36, :null => false 5 | String :path, :text => true, :null => false 6 | DateTime :created_at, :default => Sequel::CURRENT_TIMESTAMP, :null => false 7 | primary_key :id 8 | end 9 | 10 | create_table(:document_pages) do 11 | primary_key :id 12 | Float :width, :null => false 13 | Float :height, :null => false 14 | Integer :number, :null => false 15 | Integer :rotation, :null => false, :default => 0 16 | 17 | foreign_key :document_id, :documents, :key => :id, :on_delete => :cascade 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /db/migrations/2_add_size_column.rb: -------------------------------------------------------------------------------- 1 | Sequel.migration do 2 | up do 3 | add_column :documents, :size, Integer 4 | end 5 | 6 | down do 7 | drop_column :documents, :size 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /db/migrations/3_add_document_name_page_count.rb: -------------------------------------------------------------------------------- 1 | Sequel.migration do 2 | up do 3 | # String :uuid, :length => 36, :null => false 4 | # String :path, :text => true, :null => false 5 | # DateTime :created_at, :default => Sequel::CURRENT_TIMESTAMP, :null => false 6 | # primary_key :id 7 | add_column :documents, :original_name, String 8 | add_column :documents, :page_count, Integer 9 | end 10 | 11 | down do 12 | drop_column :documents, :original_name 13 | drop_column :documents, :page_count 14 | end 15 | end -------------------------------------------------------------------------------- /db/tabula-api.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-api/133562e31210ae426da64191020d57f8386baf2d/db/tabula-api.db -------------------------------------------------------------------------------- /lib/tabula_api.rb: -------------------------------------------------------------------------------- 1 | require 'tabula' 2 | require 'grape' 3 | require 'sequel' 4 | 5 | require_relative './tabula_api/settings' 6 | require_relative './tabula_api/models' 7 | require_relative './tabula_api/api' 8 | -------------------------------------------------------------------------------- /lib/tabula_api/api.rb: -------------------------------------------------------------------------------- 1 | module TabulaApi 2 | class REST < Grape::API 3 | version 'v1', using: :header, vendor: 'tabula' 4 | format :json 5 | 6 | content_type :csv, 'text/csv' 7 | formatter :csv, lambda { |tables, env| tables.inject('') { |o, table| o += table.to_csv } } 8 | 9 | helpers do 10 | def job_executor 11 | end 12 | 13 | def is_valid_pdf?(path) 14 | File.open(path, 'r') { |f| f.read(4) } == '%PDF' 15 | end 16 | 17 | def get_document(uuid) 18 | doc = Models::Document.eager(:pages).first(uuid: uuid) 19 | error!('Not found', 404) if doc.nil? 20 | doc 21 | end 22 | 23 | def doc_to_h(doc) 24 | doc.values.merge(:pages => doc.pages.map(&:values)) 25 | end 26 | 27 | ## 28 | # page: a Tabula::Page instance 29 | # coords: coordinates specification [{'top' => .., 'left' => ..., 'bottom' => ..., 'right' => ... }, ...] 30 | # extraction_method: original|spreadsheet|guess 31 | def extract_tables_from_page(page, coords, extraction_method) 32 | coords.map do |coord| 33 | area = page.get_area([coord['top'], 34 | coord['left'], 35 | coord['bottom'], 36 | coord['right']]) 37 | 38 | if extraction_method == 'spreadsheet' \ 39 | || (extraction_method == 'guess' && area.is_tabular?) 40 | logger.info "Using extraction method: spreadsheet" 41 | (spreadsheets = area.spreadsheets).empty? ? Spreadsheet.empty(page) : spreadsheets.inject(&:+) 42 | else 43 | logger.info "Using extraction method: original" 44 | area.get_table 45 | end 46 | end 47 | end 48 | 49 | def autodetect_tables_from_page(page) 50 | 51 | end 52 | 53 | def logger 54 | REST.logger 55 | end 56 | end 57 | 58 | resource :documents do 59 | 60 | desc "Returns all the documents stored in Tabula" 61 | get do 62 | Models::Document.all 63 | end 64 | 65 | desc "Upload a PDF" 66 | params do 67 | requires :file, 68 | type: Rack::Multipart::UploadedFile, 69 | desc: 'PDF Document' 70 | optional :autodetect, type: Boolean, desc: "Should we attempt to auto-detect tables on the page?" #N.B. was "autodetect-tables in Tabula(-web)" 71 | end 72 | post do 73 | error!('Unsupported media type', 415) unless is_valid_pdf?(params[:file][:tempfile].path) 74 | 75 | doc = nil 76 | DB.transaction do 77 | doc = Models::Document.new_from_upload(params[:file]) 78 | end 79 | doc 80 | end 81 | 82 | route_param :uuid, requirements: { uuid: /[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}/ } do 83 | desc "An uploaded document" 84 | get do 85 | doc_to_h(get_document(params[:uuid])) 86 | end 87 | 88 | desc "Download the original PDF" 89 | get 'document' do 90 | doc = get_document(params[:uuid]) 91 | content_type 'application/pdf' 92 | env['api.format'] = :binary 93 | File.open(doc.document_path).read 94 | end 95 | 96 | desc "Delete an uploaded document" 97 | delete do 98 | doc = get_document(params[:uuid]) 99 | doc.destroy 100 | end 101 | 102 | desc "Autodetect tables in this document." 103 | get '/tables' do 104 | doc = get_document(params[:uuid]) 105 | extractor = Tabula::Extraction::ObjectExtractor.new(doc.document_path) 106 | doc.pages_dataset.map do |p| 107 | if p.nil? 108 | [] 109 | else 110 | page = extractor.extract_page(p.number) 111 | page.spreadsheet_areas.map{|rect| rect.dims(:left, :top, :width, :height)} 112 | end 113 | end 114 | end 115 | 116 | desc "Extract tables" 117 | params do 118 | requires :coords, type: Array 119 | optional :extraction_method, type: String, regexp: /^(original|spreadsheet|guess)$/ 120 | end 121 | post 'tables' do 122 | doc = get_document(params[:uuid]) 123 | extractor = Tabula::Extraction::ObjectExtractor.new(doc.document_path) 124 | extraction_method = params[:extraction_method] || 'guess' 125 | 126 | logger.info "Requested extraction method: #{extraction_method}" 127 | 128 | params[:coords] 129 | .sort_by { |c| c[:page] } 130 | .group_by { |c| c[:page] } 131 | .flat_map do |page_number, coords| 132 | 133 | page = extractor.extract_page(page_number) 134 | 135 | extract_tables_from_page(page, coords, extraction_method) 136 | 137 | end.flatten(1) 138 | end 139 | 140 | resource :pages do 141 | desc "Delete a page from a document" 142 | params do 143 | requires :number, type: Integer, desc: 'Page Number' 144 | end 145 | delete ':number', requirements: { number: /\d+/ } do 146 | doc = get_document(params[:uuid]) 147 | page = doc.pages_dataset.where(number: params[:number]).first 148 | page.destroy 149 | end 150 | 151 | desc "Autodetect tables on this page" 152 | params do 153 | requires :number, type: Integer, desc: 'Page Number' 154 | end 155 | get ':number/tables' do 156 | doc = get_document(params[:uuid]) 157 | p = doc.pages_dataset.where(number: params[:number]).first 158 | error!('Not found', 404) if p.nil? 159 | 160 | extractor = Tabula::Extraction::ObjectExtractor.new(doc.document_path) 161 | page = extractor.extract_page(p.number) 162 | 163 | page.spreadsheet_areas.map{|rect| rect.dims(:left, :top, :width, :height)} 164 | end 165 | 166 | desc 'Extract tables from this page' 167 | params do 168 | requires :coords, type: Array 169 | requires :number, type: Integer, desc: 'Page Number' 170 | optional :extraction_method, type: String, regexp: /^(original|spreadsheet|guess)$/ 171 | end 172 | post ':number/tables' do 173 | doc = get_document(params[:uuid]) 174 | p = doc.pages_dataset.where(number: params[:number]).first 175 | error!('Not found', 404) if p.nil? 176 | 177 | extractor = Tabula::Extraction::ObjectExtractor.new(doc.document_path) 178 | extraction_method = params[:extraction_method] || 'guess' 179 | 180 | page = extractor.extract_page(p.number) 181 | 182 | extract_tables_from_page(page, params[:coords], extraction_method) 183 | end 184 | end 185 | end 186 | end 187 | end 188 | end 189 | -------------------------------------------------------------------------------- /lib/tabula_api/models.rb: -------------------------------------------------------------------------------- 1 | require 'securerandom' 2 | 3 | module TabulaApi 4 | DB = Sequel.connect(ENV['TABULA_API_DATABASE_URL'] || ("jdbc:sqlite://" + File.join(Settings.getDataDir, 'tabula_api.db'))) 5 | Sequel::Model.plugin :json_serializer 6 | 7 | module Models 8 | class Document < Sequel::Model 9 | one_to_many :pages, :key => :document_id 10 | attr_accessor :uploaded_file 11 | 12 | def after_create 13 | return if self.uploaded_file.nil? 14 | FileUtils.mkdir_p(File.dirname(self.document_path)) 15 | begin 16 | FileUtils.mv(self.uploaded_file, 17 | self.document_path) 18 | rescue Errno::EACCES # mv fails on Windows sometimes 19 | FileUtils.cp_r(self.uploaded_file, 20 | self.document_path) 21 | FileUtils.rm_rf(self.uploaded_file) 22 | end 23 | end 24 | 25 | def before_destroy 26 | FileUtils.rm_rf(File.dirname(self.document_path)) 27 | end 28 | 29 | def document_path 30 | File.join(Settings.getDataDir, 'pdfs', self.uuid, 'document.pdf') 31 | end 32 | 33 | class << self 34 | def new_from_upload(uploaded_file) 35 | doc = self.create(:uuid => SecureRandom.uuid, 36 | :path => uploaded_file[:filename], 37 | :uploaded_file => uploaded_file[:tempfile].path, 38 | :size => uploaded_file[:tempfile].size) 39 | Tabula::Extraction::PagesInfoExtractor.new(doc.document_path).pages.each do |p| 40 | doc.add_page(Page.new(:width => p.width, 41 | :height => p.height, 42 | :rotation => p.rotation, 43 | :number => p.number)) 44 | end 45 | doc 46 | end 47 | end 48 | end 49 | 50 | class Page < Sequel::Model(:document_pages) 51 | many_to_one :document, :key => :document_id 52 | end 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /lib/tabula_api/settings.rb: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | require 'fileutils' 3 | 4 | module TabulaApi 5 | module Settings 6 | 7 | def self.getDataDir 8 | # OS X: ~/Library/Application Support/Tabula 9 | # Win: %APPDATA%/Tabula 10 | # Linux: ~/.tabula 11 | 12 | # when invoking as "java -Dtabula.data_dir=/foo/bar ... -jar tabula.war" 13 | data_dir = java.lang.System.getProperty('tabula.data_dir') 14 | unless data_dir.nil? 15 | return java.io.File.new(data_dir).getPath 16 | end 17 | 18 | # when invoking with env var 19 | data_dir = ENV['TABULA_DATA_DIR'] 20 | unless data_dir.nil? 21 | return java.io.File.new(data_dir).getPath 22 | end 23 | 24 | # use the usual directory in (system-dependent) user home dir 25 | data_dir = nil 26 | case java.lang.System.getProperty('os.name') 27 | when /Windows/ 28 | # APPDATA is in a different place (under user.home) depending on 29 | # Windows OS version. so use that env var directly, basically 30 | appdata = ENV['APPDATA'] 31 | if appdata.nil? 32 | home = java.lang.System.getProperty('user.home') 33 | end 34 | data_dir = java.io.File.new(appdata, '/Tabula').getPath 35 | 36 | when /Mac/ 37 | home = java.lang.System.getProperty('user.home') 38 | data_dir = File.join(home, '/Library/Application Support/Tabula') 39 | 40 | 41 | else 42 | # probably *NIX 43 | home = java.lang.System.getenv('XDG_DATA_HOME') 44 | if !home.nil? 45 | # XDG 46 | data_dir = File.join(data_home, '/tabula') 47 | else 48 | # other, normal *NIX systems 49 | home = java.lang.System.getProperty('user.home') 50 | home = '.' if home.nil? 51 | data_dir = File.join(home, '/.tabula') 52 | end 53 | end # /case 54 | 55 | data_dir 56 | end 57 | ########## Initialize environment, using helpers ########## 58 | FileUtils.mkdir_p(File.join(self.getDataDir, 'pdfs')) 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /tabula-api.gemspec: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | $:.push File.expand_path("../lib", __FILE__) 3 | 4 | Gem::Specification.new do |s| 5 | s.name = "tabula-api" 6 | s.version = "0.0.1" 7 | s.authors = ["Manuel Aristarán"] 8 | s.email = ["manuel@jazzido.com"] 9 | s.homepage = "https://github.com/jazzido/tabula-api" 10 | s.summary = %q{a REST endpoint for tabula-extractor} 11 | s.description = %q{a REST endpoint for tabula-extractor} 12 | s.license = 'MIT' 13 | 14 | s.platform = 'java' 15 | 16 | s.files = `git ls-files`.split("\n") 17 | s.test_files = `git ls-files -- {test,features}/*`.split("\n") 18 | s.require_paths = ["lib"] 19 | 20 | s.add_development_dependency 'rack-test' 21 | s.add_development_dependency 'minitest' 22 | s.add_development_dependency 'fixture_dependencies' 23 | 24 | s.add_runtime_dependency "grape" 25 | s.add_runtime_dependency "sequel" 26 | s.add_runtime_dependency "jdbc-sqlite3" 27 | s.add_runtime_dependency "tabula-extractor" 28 | end 29 | -------------------------------------------------------------------------------- /test.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-api/133562e31210ae426da64191020d57f8386baf2d/test.db -------------------------------------------------------------------------------- /test/fixtures/document_pages.yml: -------------------------------------------------------------------------------- 1 | page_1: 2 | width: 42 3 | height: 42 4 | number: 1 5 | rotation: 0 6 | -------------------------------------------------------------------------------- /test/fixtures/documents.yml: -------------------------------------------------------------------------------- 1 | document1: 2 | uuid: 8cf52024-1ab8-4ec2-8fb2-c7605417e564 3 | path: /tmp/foo 4 | pages: [page_1] 5 | -------------------------------------------------------------------------------- /test/fixtures/sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-api/133562e31210ae426da64191020d57f8386baf2d/test/fixtures/sample.pdf -------------------------------------------------------------------------------- /test/test.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tabulapdf/tabula-api/133562e31210ae426da64191020d57f8386baf2d/test/test.db -------------------------------------------------------------------------------- /test/test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env jruby -J-Djava.awt.headless=true 2 | require 'tmpdir' 3 | 4 | ENV['TABULA_API_DATABASE_URL'] = "jdbc:sqlite:#{File.expand_path('test.db', File.dirname(__FILE__))}" 5 | ENV['TABULA_DATA_DIR'] = Dir.mktmpdir 6 | 7 | require_relative '../lib/tabula_api' 8 | # need to bring the models to the top level namespace 9 | # fixture_dependencies isn't smart enough to resolve 10 | # the constant by itself 11 | Document = TabulaApi::Models::Document 12 | Page = TabulaApi::Models::Page 13 | 14 | require 'minitest' 15 | require 'minitest/autorun' 16 | require 'sequel' 17 | require 'rack/test' 18 | require 'fixture_dependencies' 19 | 20 | FixtureDependencies.fixture_path = File.expand_path('fixtures', 21 | File.dirname(__FILE__)) 22 | 23 | # TabulaApi::DB.loggers << Logger.new($stderr) 24 | 25 | class TabulaApiTestCase < MiniTest::Test 26 | include Rack::Test::Methods 27 | 28 | def run(*args, &block) 29 | result = nil 30 | Sequel::Model.db.transaction(:rollback=>:always) { result = super } 31 | result 32 | end 33 | 34 | def app 35 | TabulaApi::REST 36 | end 37 | 38 | def setup 39 | Document.truncate 40 | end 41 | 42 | end 43 | 44 | 45 | class TabulaApiTests < TabulaApiTestCase 46 | 47 | def test_documents_collection 48 | FixtureDependencies.load(:document__document1) 49 | get '/documents' 50 | assert_equal 200, last_response.status 51 | resp = JSON.parse(last_response.body) 52 | assert_equal 1, resp.size 53 | assert_equal '8cf52024-1ab8-4ec2-8fb2-c7605417e564', resp.first['uuid'] 54 | end 55 | 56 | def test_upload_document_wrong_media_type 57 | file = Rack::Test::UploadedFile.new(File.expand_path('fixtures/documents.yml', 58 | File.dirname(__FILE__)), 59 | 'application/pdf') 60 | 61 | post '/documents', :file => file 62 | assert_equal 415, last_response.status 63 | end 64 | 65 | def test_get_document_wrong_pattern 66 | FixtureDependencies.load(:document__document1) 67 | get '/documents/foobarquuxor' 68 | assert_equal 404, last_response.status 69 | end 70 | 71 | def test_get_document_404 72 | FixtureDependencies.load(:document__document1) 73 | get '/documents/deadbeef-1ab8-4ec2-8fb2-c7605417e564' 74 | assert_equal 404, last_response.status 75 | end 76 | 77 | def test_download_original_document 78 | upload_file_path = File.expand_path('fixtures/sample.pdf', 79 | File.dirname(__FILE__)) 80 | file = Rack::Test::UploadedFile.new(upload_file_path, 81 | 'application/pdf') 82 | post '/documents', :file => file 83 | doc = JSON.parse(last_response.body) 84 | 85 | # retrieve uploaded document 86 | get "/documents/#{doc['uuid']}/document" 87 | 88 | assert_equal File.size(upload_file_path), last_response.headers['Content-Length'].to_i 89 | assert_equal 'application/pdf', last_response.headers['Content-Type'] 90 | end 91 | 92 | def test_delete_document 93 | upload_file_path = File.expand_path('fixtures/sample.pdf', 94 | File.dirname(__FILE__)) 95 | file = Rack::Test::UploadedFile.new(upload_file_path, 96 | 'application/pdf') 97 | post '/documents', :file => file 98 | doc = JSON.parse(last_response.body) 99 | 100 | # get doc object from DB before deleting 101 | doc_model = TabulaApi::Models::Document.first(uuid: doc['uuid']) 102 | 103 | delete "/documents/#{doc['uuid']}" 104 | 105 | assert !File.exists?(doc_model.document_path) 106 | assert_equal 0, TabulaApi::Models::Page.where(document_id: doc_model.id).count 107 | end 108 | 109 | def test_upload_pdf 110 | upload_file_path = File.expand_path('fixtures/sample.pdf', 111 | File.dirname(__FILE__)) 112 | file = Rack::Test::UploadedFile.new(upload_file_path, 113 | 'application/pdf') 114 | post '/documents', :file => file 115 | doc = JSON.parse(last_response.body) 116 | 117 | # retrieve uploaded document 118 | get "/documents/#{doc['uuid']}" 119 | doc = JSON.parse(last_response.body) 120 | 121 | assert_equal 5, doc['pages'].size 122 | end 123 | 124 | def test_delete_page 125 | FixtureDependencies.load(:document__document1) 126 | delete '/documents/8cf52024-1ab8-4ec2-8fb2-c7605417e564/pages/1' 127 | assert_equal 0, TabulaApi::Models::Page.where(document: TabulaApi::Models::Document.first(uuid: '8cf52024-1ab8-4ec2-8fb2-c7605417e564')).count 128 | end 129 | 130 | def test_extract_tables_from_document 131 | upload_file_path = File.expand_path('fixtures/sample.pdf', 132 | File.dirname(__FILE__)) 133 | file = Rack::Test::UploadedFile.new(upload_file_path, 134 | 'application/pdf') 135 | post '/documents', :file => file 136 | doc = JSON.parse(last_response.body) 137 | 138 | coords = { 'coords' => [ {"left" => 16.97142857142857, 139 | "right" => 762.3000000000001, 140 | "top" => 53.74285714285715, 141 | "bottom" => 548.7428571428571, 142 | "page" => 1}, 143 | {"left" => 16.97142857142857, 144 | "right" => 762.3000000000001, 145 | "top" => 53.74285714285715, 146 | "bottom" => 548.7428571428571, 147 | "page" => 2}] 148 | } 149 | 150 | post "/documents/#{doc['uuid']}/tables.json", 151 | JSON.dump(coords), 152 | "CONTENT_TYPE" => 'application/json' 153 | 154 | # TODO add assertions 155 | 156 | #puts JSON.parse(last_response.body).inspect 157 | end 158 | 159 | def test_extract_tables_from_document_page 160 | upload_file_path = File.expand_path('fixtures/sample.pdf', 161 | File.dirname(__FILE__)) 162 | file = Rack::Test::UploadedFile.new(upload_file_path, 163 | 'application/pdf') 164 | post '/documents', :file => file 165 | doc = JSON.parse(last_response.body) 166 | 167 | coords = { 'coords' => [ {"left" => 16.97142857142857, 168 | "right" => 762.3000000000001, 169 | "top" => 53.74285714285715, 170 | "bottom" => 548.7428571428571}] 171 | } 172 | 173 | post "/documents/#{doc['uuid']}/pages/1/tables.json", 174 | JSON.dump(coords), 175 | "CONTENT_TYPE" => 'application/json' 176 | 177 | puts JSON.parse(last_response.body).inspect 178 | # TODO add assertions 179 | end 180 | 181 | def test_get_table_structure_from_document_page 182 | upload_file_path = File.expand_path('fixtures/sample.pdf', 183 | File.dirname(__FILE__)) 184 | file = Rack::Test::UploadedFile.new(upload_file_path, 185 | 'application/pdf') 186 | post '/documents', :file => file 187 | doc = JSON.parse(last_response.body) 188 | 189 | coords = { 190 | 'coords' => [ {"left" => 16.97142857142857, 191 | "right" => 762.3000000000001, 192 | "top" => 53.74285714285715, 193 | "bottom" => 548.7428571428571}] 194 | } 195 | 196 | post "/documents/#{doc['uuid']}/pages/1/structure.json", 197 | JSON.dump(coords), 198 | "CONTENT_TYPE" => 'application/json' 199 | 200 | end 201 | 202 | def test_autodetected_tables 203 | upload_file_path = File.expand_path('fixtures/sample.pdf', 204 | File.dirname(__FILE__)) 205 | file = Rack::Test::UploadedFile.new(upload_file_path, 206 | 'application/pdf') 207 | post '/documents', :file => file 208 | doc = JSON.parse(last_response.body) 209 | 210 | get "/documents/#{doc['uuid']}/tables.json", 211 | "CONTENT_TYPE" => 'application/json' 212 | expected = [ 213 | [[18.0, 54.0, 744, 495]], 214 | [[18.0, 54.0, 744, 495]], 215 | [[18.0, 54.0, 744, 495]], 216 | [[18.0, 54.0, 744, 495]], 217 | [[18.0, 54.0, 744, 449]]] 218 | assert_equal expected, JSON.parse(last_response.body) 219 | end 220 | 221 | def test_autodetected_tables_from_page 222 | upload_file_path = File.expand_path('fixtures/sample.pdf', 223 | File.dirname(__FILE__)) 224 | file = Rack::Test::UploadedFile.new(upload_file_path, 225 | 'application/pdf') 226 | post '/documents', :file => file 227 | doc = JSON.parse(last_response.body) 228 | 229 | get "/documents/#{doc['uuid']}/pages/1/tables.json", 230 | "CONTENT_TYPE" => 'application/json' 231 | 232 | expected = [[18.0, 54.0, 744, 495]] 233 | 234 | assert_equal expected, JSON.parse(last_response.body) 235 | end 236 | 237 | end 238 | --------------------------------------------------------------------------------