├── .gitignore ├── Gemfile ├── Gemfile.lock ├── LICENSE ├── Rakefile ├── TODO.org ├── bin └── pdf-extract ├── catalog.json ├── data ├── familynames.db ├── reference.model └── stopwords.txt ├── lib ├── pdf-extract.rb └── pdf │ ├── extract.rb │ └── extract │ ├── analysis │ ├── columns.rb │ ├── margins.rb │ ├── sections.rb │ ├── titles.rb │ └── zones.rb │ ├── font_metrics.rb │ ├── kmeans.rb │ ├── language.rb │ ├── model │ ├── characters.rb │ ├── chunks.rb │ └── regions.rb │ ├── multi_range.rb │ ├── names.rb │ ├── pdf.rb │ ├── references │ ├── references.rb │ ├── resolve.rb │ ├── resolved_references.rb │ └── score.rb │ ├── spatial.rb │ └── view │ ├── abstract_view.rb │ ├── bib_view.rb │ ├── pdf_view.rb │ └── xml_view.rb ├── pdf-extract.gemspec ├── readme.md ├── tasks ├── assign.rb └── train.rb └── test └── catalog /.gitignore: -------------------------------------------------------------------------------- 1 | bin/*.pdf 2 | 3 | *.sw* 4 | .DS_Store 5 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/Gemfile -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/Gemfile.lock -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/LICENSE -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/Rakefile -------------------------------------------------------------------------------- /TODO.org: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/TODO.org -------------------------------------------------------------------------------- /bin/pdf-extract: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/bin/pdf-extract -------------------------------------------------------------------------------- /catalog.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /data/familynames.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/data/familynames.db -------------------------------------------------------------------------------- /data/reference.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/data/reference.model -------------------------------------------------------------------------------- /data/stopwords.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/data/stopwords.txt -------------------------------------------------------------------------------- /lib/pdf-extract.rb: -------------------------------------------------------------------------------- 1 | require_relative 'pdf/extract.rb' 2 | -------------------------------------------------------------------------------- /lib/pdf/extract.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract.rb -------------------------------------------------------------------------------- /lib/pdf/extract/analysis/columns.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/analysis/columns.rb -------------------------------------------------------------------------------- /lib/pdf/extract/analysis/margins.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/analysis/margins.rb -------------------------------------------------------------------------------- /lib/pdf/extract/analysis/sections.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/analysis/sections.rb -------------------------------------------------------------------------------- /lib/pdf/extract/analysis/titles.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/analysis/titles.rb -------------------------------------------------------------------------------- /lib/pdf/extract/analysis/zones.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/analysis/zones.rb -------------------------------------------------------------------------------- /lib/pdf/extract/font_metrics.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/font_metrics.rb -------------------------------------------------------------------------------- /lib/pdf/extract/kmeans.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/kmeans.rb -------------------------------------------------------------------------------- /lib/pdf/extract/language.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/language.rb -------------------------------------------------------------------------------- /lib/pdf/extract/model/characters.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/model/characters.rb -------------------------------------------------------------------------------- /lib/pdf/extract/model/chunks.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/model/chunks.rb -------------------------------------------------------------------------------- /lib/pdf/extract/model/regions.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/model/regions.rb -------------------------------------------------------------------------------- /lib/pdf/extract/multi_range.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/multi_range.rb -------------------------------------------------------------------------------- /lib/pdf/extract/names.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/names.rb -------------------------------------------------------------------------------- /lib/pdf/extract/pdf.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/pdf.rb -------------------------------------------------------------------------------- /lib/pdf/extract/references/references.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/references/references.rb -------------------------------------------------------------------------------- /lib/pdf/extract/references/resolve.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/references/resolve.rb -------------------------------------------------------------------------------- /lib/pdf/extract/references/resolved_references.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/references/resolved_references.rb -------------------------------------------------------------------------------- /lib/pdf/extract/references/score.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/references/score.rb -------------------------------------------------------------------------------- /lib/pdf/extract/spatial.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/spatial.rb -------------------------------------------------------------------------------- /lib/pdf/extract/view/abstract_view.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/view/abstract_view.rb -------------------------------------------------------------------------------- /lib/pdf/extract/view/bib_view.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/view/bib_view.rb -------------------------------------------------------------------------------- /lib/pdf/extract/view/pdf_view.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/view/pdf_view.rb -------------------------------------------------------------------------------- /lib/pdf/extract/view/xml_view.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/lib/pdf/extract/view/xml_view.rb -------------------------------------------------------------------------------- /pdf-extract.gemspec: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/pdf-extract.gemspec -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/readme.md -------------------------------------------------------------------------------- /tasks/assign.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/tasks/assign.rb -------------------------------------------------------------------------------- /tasks/train.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/tasks/train.rb -------------------------------------------------------------------------------- /test/catalog: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CrossRef/pdfextract/HEAD/test/catalog --------------------------------------------------------------------------------