├── .gitignore ├── Gemfile ├── LICENSE ├── README.md ├── anystyle-cli.gemspec ├── bin └── anystyle └── lib └── anystyle ├── cli.rb └── cli ├── commands ├── base.rb ├── check.rb ├── find.rb ├── parse.rb └── train.rb └── version.rb /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | /.bundle/ 3 | Gemfile.lock 4 | .ruby-version 5 | .byebug_history 6 | res 7 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | gemspec 3 | 4 | group :development, :test do 5 | gem 'anystyle', github: 'inukshuk/anystyle' 6 | end 7 | 8 | group :debug do 9 | gem 'debug', require: false 10 | end 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2018, Sylvester Keil 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | AnyStyle Command Line Interface 2 | =============================== 3 | 4 | anystyle --help 5 | --------------- 6 | NAME 7 | anystyle - Finds and parses bibliographic references 8 | 9 | SYNOPSIS 10 | anystyle [global options] command [command options] [arguments...] 11 | 12 | VERSION 13 | 1.1.0 (cli 1.0.2, data 1.2.0) 14 | 15 | GLOBAL OPTIONS 16 | -F, --finder-model=file - Set the finder model file (default: none) 17 | -P, --parser-model=file - Set the parser model file (default: none) 18 | --adapter=name - Set the dictionary adapter (default: ruby) 19 | -f, --format=name - Set the output format (default: ["json"]) 20 | --pdfinfo=path - Set the path for pdfinfo (default: none) 21 | --pdftotext=path - Set the path for pdftotext (default: none) 22 | --help - Show this message 23 | --[no-]stdout - Print results directly to stdout 24 | --[no-]verbose - Print status messages to stderr 25 | --version - Display the program version 26 | -w, --[no-]overwrite - Allow overwriting existing files 27 | 28 | COMMANDS 29 | check - Check tagged documents or references 30 | find - Find and extract references from text documents 31 | help - Shows a list of commands or help for one command 32 | license - Print license information 33 | parse - Parse and convert references 34 | train - Create a new finder or parser model 35 | 36 | anystyle help find 37 | ------------------ 38 | NAME 39 | find - Find and extract references from text documents 40 | 41 | SYNOPSIS 42 | anystyle [global options] find [command options] input [output] 43 | 44 | DESCRIPTION 45 | This manual page documents the AnyStyle `find' command. AnyStyle `find' 46 | analyzes PDF or text documents and extracts all references it finds. 47 | 48 | The input argument can be a single PDF or text document, or a folder 49 | containing multiple documents. The (optional) output argument specifies 50 | the folder where the results shall be saved; if no output folder is 51 | specified, results will be saved in the folder containing the input. 52 | 53 | AnyStyle `find' supports the following formats: 54 | bib BibTeX (references only); 55 | csl CSL/JSON (references only); 56 | ris RIS (references only); 57 | json AnyStyle JSON (references only); 58 | ref One reference per line, suitable for parser input; 59 | txt Plain text document; 60 | ttx Tagged document format, used for training the finder model; 61 | xml References only, XML, suitable for training the parser model. 62 | 63 | You can specify multiple output formats, separated by commas. 64 | 65 | Anlyzing PDF documents currently depends on `pdftotext' which must be 66 | installed separately. 67 | EXAMPLES 68 | anystyle -f csl,xml find thesis.pdf 69 | 70 | Extract references from `thesis.pdf' and save them in `thesis.csl' and 71 | `thesis.xml'. 72 | 73 | anystyle -f bib find --no-layout thesis.pdf bib 74 | 75 | Extract references from `thesis.pdf' in `no-layout' mode (e.g., use this 76 | if your document uses a multi-column layout) and save them in BibTeX in 77 | `./bib/thesis.bib'. 78 | 79 | anystyle find --crop 72 thesis.pdf - 80 | 81 | Extract references from `thesis.pdf' cropping away one inch (72pt) from 82 | each page border and print the results to STDOUT. 83 | 84 | anystyle find --crop 72,28 thesis.pdf - 85 | 86 | Extract references from `thesis.pdf' cropping away one inch (72pt) from 87 | each page's left and right border, approx. 1cm (28pt) from the top 88 | and bottom. 89 | 90 | 91 | COMMAND OPTIONS 92 | -C, --crop=pt - Set cropping boundary for text extraction (default: none) 93 | --[no-]layout - Use layout mode for PDF text extraction (default: enabled) 94 | --[no-]solo - Include references outside of reference sections 95 | 96 | 97 | anystyle help parse 98 | ------------------- 99 | COMMAND OPTIONS 100 | --[no-]layout - Use layout mode for PDF text extraction (default: enabled) 101 | NAME 102 | parse - Parse and convert references 103 | 104 | SYNOPSIS 105 | anystyle [global options] parse input [output] 106 | 107 | DESCRIPTION 108 | This manual page documents the AnyStyle `parse' command. AnyStyle `parse' 109 | segments references (one per line) and converts them into structured 110 | formats. 111 | 112 | The input argument can be a single text document containing one full 113 | reference per line (blank lines will be ignored), or a folder containing 114 | multiple documents. The (optional) output argument specifies 115 | the folder where the results shall be saved; if no output folder is 116 | specified, results will be saved in the folder containing the input. 117 | 118 | AnyStyle `parse' supports the following formats: 119 | bib BibTeX (normalized); 120 | csl CSL/JSON (normalized); 121 | ris ris (normalized); 122 | json AnyStyle JSON (normalized); 123 | ref One reference per line, suitable for parser input; 124 | txt Same as `ref'; 125 | xml XML, suitable for training the parser model. 126 | 127 | You can specify multiple output formats, separated by commas. 128 | 129 | EXAMPLES 130 | anystyle -f json,xml parse biblio.txt 131 | 132 | Extract references from `biblio.txt' and save them in `biblio.json' and 133 | `biblio.xml'. 134 | 135 | anystyle --stdout -f csl parse input.txt 136 | 137 | Extract references from `input.txt' and print them to STDOUT in CSL/JSON. 138 | 139 | 140 | anystyle help check 141 | ------------------- 142 | NAME 143 | check - Check tagged documents or references 144 | 145 | SYNOPSIS 146 | anystyle [global options] check input 147 | 148 | DESCRIPTION 149 | This manual page documents the AnyStyle `check' command. AnyStyle `check' 150 | analyzes tagged text documents or references. 151 | 152 | The input argument can be a single TTX or XML document, or a folder 153 | containing multiple documents. 154 | 155 | AnyStyle `check' supports the following input formats: 156 | ttx Tagged document format, used for training the finder model; 157 | xml References only, XML, suitable for training the parser model. 158 | 159 | EXAMPLES 160 | anystyle check training-data.xml 161 | 162 | Checks all references in the XML file and prints a report to STDOUT. 163 | 164 | 165 | anystyle help train 166 | ------------------- 167 | NAME 168 | train - Create a new finder or parser model 169 | 170 | SYNOPSIS 171 | anystyle [global options] train input [output] 172 | 173 | DESCRIPTION 174 | This manual page documents the AnyStyle `train' command. AnyStyle `train' 175 | creates a new finder or parser model based on the supplied training sets. 176 | 177 | The input argument can be a XML document, or a folder containing multiple 178 | TTX documents. 179 | 180 | EXAMPLES 181 | anystyle train data.xml my-model.mod 182 | 183 | Creates a new parser model based on the XML training set and saves it 184 | as `my-model.mod'. To use your model use the global `--finder-model' 185 | or `--parser-model' flags. 186 | 187 | 188 | License 189 | ------- 190 | Copyright 2011-2018 Sylvester Keil. All rights reserved. 191 | 192 | AnyStyle is distributed under a BSD-style license. 193 | See LICENSE for details. 194 | -------------------------------------------------------------------------------- /anystyle-cli.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | lib = File.expand_path('../lib/', __FILE__) 3 | $:.unshift lib unless $:.include?(lib) 4 | 5 | require 'anystyle/cli/version' 6 | 7 | Gem::Specification.new do |s| 8 | s.name = 'anystyle-cli' 9 | s.version = AnyStyle::CLI::VERSION.dup 10 | s.platform = Gem::Platform::RUBY 11 | s.authors = ['Sylvester Keil'] 12 | s.email = ['http://sylvester.keil.or.at'] 13 | s.homepage = 'http://anystyle.io' 14 | s.summary = 'AnyStyle CLI' 15 | s.description = 'A command line interface to the AnyStyle Parser and Finder.' 16 | s.license = 'BSD-2-Clause' 17 | s.require_path = 'lib' 18 | s.bindir = 'bin' 19 | s.executables = ['anystyle'] 20 | s.required_ruby_version = '>= 2.3' 21 | 22 | s.add_runtime_dependency('anystyle', '~>1.6') 23 | s.add_runtime_dependency('gli', '~>2.17') 24 | 25 | s.files = `git ls-files`.split("\n") - %w{ 26 | .gitignore 27 | Gemfile 28 | anystyle-cli.gemspec 29 | } 30 | end 31 | 32 | # vim: syntax=ruby 33 | -------------------------------------------------------------------------------- /bin/anystyle: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'gli' 4 | require 'anystyle/cli' 5 | 6 | include GLI::App 7 | include AnyStyle::CLI 8 | 9 | program_desc 'Finds and parses bibliographic references' 10 | 11 | version '%s (cli %s, data %s)' % [ 12 | AnyStyle::VERSION, VERSION, AnyStyle::Data::VERSION 13 | ] 14 | 15 | subcommand_option_handling :normal 16 | arguments :strict 17 | 18 | wrap_help_text :verbatim 19 | 20 | accept(Array) { |value| value.split(',') } 21 | 22 | #config_file '.anystyle' 23 | 24 | switch 'verbose', 25 | desc: 'Print status messages to stderr' 26 | 27 | switch ['w', 'overwrite'], 28 | desc: 'Allow overwriting existing files' 29 | 30 | switch 'stdout', 31 | desc: 'Print results directly to stdout' 32 | 33 | flag ['F', 'finder-model'], 34 | arg_name: 'file', 35 | desc: 'Set the finder model file' 36 | 37 | flag ['P', 'parser-model'], 38 | arg_name: 'file', 39 | desc: 'Set the parser model file' 40 | 41 | flag 'pdftotext', 42 | arg_name: 'path', 43 | desc: 'Set the path for pdftotext' 44 | 45 | flag 'pdfinfo', 46 | arg_name: 'path', 47 | desc: 'Set the path for pdfinfo' 48 | 49 | 50 | flag 'adapter', 51 | default_value: 'ruby', 52 | arg_name: 'name', 53 | must_match: %w{ ruby memory gdbm }, 54 | desc: 'Set the dictionary adapter' 55 | 56 | flag ['f', 'format'], 57 | default_value: ['json'], 58 | arg_name: 'name', 59 | type: Array, 60 | must_match: /(bib|csl|ris|json|ref|ttx|txt|xml)(,(bib|csl|ris|json|ref|ttx|txt|xml))*/, 61 | desc: 'Set the output format' 62 | 63 | 64 | pre do |opts| 65 | AnyStyle::Dictionary.defaults[:adapter] = opts[:adapter] 66 | 67 | unless opts[:'finder-model'].nil? 68 | AnyStyle::Finder.defaults[:model] = 69 | File.expand_path(opts[:'finder-model']) 70 | end 71 | 72 | unless opts[:'parser-model'].nil? 73 | AnyStyle::Parser.defaults[:model] = 74 | File.expand_path(opts[:'parser-model']) 75 | end 76 | 77 | unless opts[:pdftotext].nil? 78 | AnyStyle::Finder.defaults[:pdftotext] = 79 | opts[:pdftotext] 80 | end 81 | 82 | unless opts[:pdfinfo].nil? 83 | AnyStyle::Finder.defaults[:pdfinfo] = 84 | opts[:pdfinfo] 85 | end 86 | 87 | AnyStyle 88 | end 89 | 90 | 91 | desc 'Find and extract references from text documents' 92 | long_desc %{ 93 | This manual page documents the AnyStyle `find' command. AnyStyle `find' 94 | analyzes PDF or text documents and extracts all references it finds. 95 | 96 | The input argument can be a single PDF or text document, or a folder 97 | containing multiple documents. The (optional) output argument specifies 98 | the folder where the results shall be saved; if no output folder is 99 | specified, results will be saved in the folder containing the input. 100 | 101 | AnyStyle `find' supports the following formats: 102 | bib BibTeX (references only); 103 | csl CSL/JSON (references only); 104 | ris RIS (references only); 105 | json AnyStyle JSON (references only); 106 | ref One reference per line, suitable for parser input; 107 | txt Plain text document; 108 | ttx Tagged document format, used for training the finder model; 109 | xml References only, XML, suitable for training the parser model. 110 | 111 | You can specify multiple output formats, separated by commas. 112 | 113 | Anlyzing PDF documents currently depends on `pdftotext' which must be 114 | installed separately. 115 | 116 | EXAMPLES 117 | anystyle -f csl,xml find thesis.pdf 118 | 119 | Extract references from `thesis.pdf' and save them in `thesis.csl' and 120 | `thesis.xml'. 121 | 122 | anystyle -f bib find --no-layout thesis.pdf bib 123 | 124 | Extract references from `thesis.pdf' in `no-layout' mode (e.g., use this 125 | if your document uses a multi-column layout) and save them in BibTeX in 126 | `./bib/thesis.bib'. 127 | 128 | anystyle find --crop 72 thesis.pdf - 129 | 130 | Extract references from `thesis.pdf' cropping away one inch (72pt) from 131 | each page border and print the results to STDOUT. 132 | 133 | anystyle find --crop 72,28 thesis.pdf - 134 | 135 | Extract references from `thesis.pdf' cropping away one inch (72pt) from 136 | each page's left and right border, approx. 1cm (28pt) from the top 137 | and bottom. 138 | }.lstrip 139 | 140 | arg :input 141 | arg :output, :optional 142 | command :find do |cmd| 143 | cmd.switch 'layout', 144 | default_value: true, 145 | desc: 'Use layout mode for PDF text extraction' 146 | 147 | cmd.switch 'solo', 148 | default_value: false, 149 | desc: 'Include references outside of reference sections' 150 | 151 | cmd.flag ['C', 'crop'], 152 | arg_name: 'pt', 153 | type: Array, 154 | must_match: /\d+(,\d+)?|\d+,\d+(,-?\d+){2}/, 155 | desc: 'Set cropping boundary for text extraction' 156 | 157 | cmd.action do |opts, params, args| 158 | Commands::Find.new(opts).run(args, params) 159 | end 160 | end 161 | 162 | 163 | desc 'Parse and convert references' 164 | long_desc %{ 165 | This manual page documents the AnyStyle `parse' command. AnyStyle `parse' 166 | segments references (one per line) and converts them into structured 167 | formats. 168 | 169 | The input argument can be a single text document containing one full 170 | reference per line (blank lines will be ignored), or a folder containing 171 | multiple documents. The (optional) output argument specifies 172 | the folder where the results shall be saved; if no output folder is 173 | specified, results will be saved in the folder containing the input. 174 | 175 | AnyStyle `parse' supports the following formats: 176 | bib BibTeX (normalized); 177 | csl CSL/JSON (normalized); 178 | json AnyStyle JSON (normalized); 179 | ref One reference per line, suitable for parser input; 180 | txt Same as `ref'; 181 | xml XML, suitable for training the parser model. 182 | 183 | You can specify multiple output formats, separated by commas. 184 | 185 | EXAMPLES 186 | anystyle -f json,xml parse biblio.txt 187 | 188 | Extract references from `biblio.txt' and save them in `biblio.json' and 189 | `biblio.xml'. 190 | 191 | anystyle --stdout -f csl parse input.txt 192 | 193 | Extract references from `input.txt' and print them to STDOUT in CSL/JSON. 194 | }.lstrip 195 | 196 | arg :input 197 | arg :output, :optional 198 | command :parse do |cmd| 199 | cmd.action do |opts, params, args| 200 | Commands::Parse.new(opts).run(args, params) 201 | end 202 | end 203 | 204 | desc 'Check tagged documents or references' 205 | long_desc %{ 206 | This manual page documents the AnyStyle `check' command. AnyStyle `check' 207 | analyzes tagged text documents or references. 208 | 209 | The input argument can be a single TTX or XML document, or a folder 210 | containing multiple documents. 211 | 212 | AnyStyle `check' supports the following input formats: 213 | ttx Tagged document format, used for training the finder model; 214 | xml References only, XML, suitable for training the parser model. 215 | 216 | EXAMPLES 217 | anystyle check training-data.xml 218 | 219 | Checks all references in the XML file and prints a report to STDOUT. 220 | }.lstrip 221 | 222 | arg :input 223 | command :check do |cmd| 224 | cmd.action do |opts, params, args| 225 | Commands::Check.new(opts).run(args, params) 226 | end 227 | end 228 | 229 | 230 | desc 'Create a new finder or parser model' 231 | long_desc %{ 232 | This manual page documents the AnyStyle `train' command. AnyStyle `train' 233 | creates a new finder or parser model based on the supplied training sets. 234 | 235 | The input argument can be a XML document, or a folder containing multiple 236 | TTX documents. 237 | 238 | EXAMPLES 239 | anystyle train data.xml my-model.mod 240 | 241 | Creates a new parser model based on the XML training set and saves it 242 | as `my-model.mod'. To use your model use the global `--finder-model' 243 | or `--parser-model' flags. 244 | }.lstrip 245 | 246 | arg :input 247 | arg :output, :optional 248 | command :train do |cmd| 249 | cmd.action do |opts, params, args| 250 | Commands::Train.new(opts).run(args, params) 251 | end 252 | end 253 | 254 | desc 'Print license information' 255 | command :license do |cmd| 256 | cmd.action do 257 | puts 'AnyStyle.' 258 | puts 'Copyright (C) 2011-%d Sylvester Keil.' % Time.now.year 259 | puts <<~EOL 260 | 261 | Wapiti. 262 | Copyright (C) 2009-2013 CNRS. 263 | 264 | All rights reserved. 265 | 266 | Redistribution and use in source and binary forms, with or without 267 | modification, are permitted provided that the following conditions are met: 268 | 269 | * Redistributions of source code must retain the above copyright notice, this 270 | list of conditions and the following disclaimer. 271 | 272 | * Redistributions in binary form must reproduce the above copyright notice, 273 | this list of conditions and the following disclaimer in the documentation 274 | and/or other materials provided with the distribution. 275 | 276 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR 277 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 278 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 279 | EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 280 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 281 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 282 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 283 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 284 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 285 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 286 | 287 | EOL 288 | end 289 | end 290 | 291 | exit run(ARGV) 292 | 293 | # vim: syntax=ruby 294 | -------------------------------------------------------------------------------- /lib/anystyle/cli.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | require 'pathname' 3 | require 'anystyle' 4 | 5 | require 'anystyle/cli/version' 6 | require 'anystyle/cli/commands/base' 7 | require 'anystyle/cli/commands/check' 8 | require 'anystyle/cli/commands/find' 9 | require 'anystyle/cli/commands/parse' 10 | require 'anystyle/cli/commands/train' 11 | -------------------------------------------------------------------------------- /lib/anystyle/cli/commands/base.rb: -------------------------------------------------------------------------------- 1 | module AnyStyle 2 | module CLI 3 | module Commands 4 | class Base 5 | attr_reader :options, :output_folder 6 | 7 | def initialize(options) 8 | @options = options 9 | end 10 | 11 | def run(args, params) 12 | raise NotImplementedYet 13 | end 14 | 15 | def verbose? 16 | !!options[:verbose] 17 | end 18 | 19 | def stdout? 20 | !!options[:stdout] 21 | end 22 | 23 | def overwrite? 24 | !!options[:overwrite] 25 | end 26 | 27 | def each_format(&block) 28 | options[:format].each(&block) 29 | end 30 | 31 | def find(input, opts = {}) 32 | AnyStyle.find(input, 33 | format: :wapiti, 34 | layout: opts[:layout], 35 | crop: opts[:crop].nil? ? nil : opts[:crop].map(&:to_i)) 36 | end 37 | 38 | def parse_file(file) 39 | parse(Wapiti::Dataset.open(file, **AnyStyle::Parser.defaults)) 40 | end 41 | 42 | def parse(input) 43 | AnyStyle.parse(input, format: :wapiti) 44 | end 45 | 46 | def format(dataset, fmt) 47 | case fmt 48 | when 'bib' 49 | AnyStyle.parser.format_bibtex(dataset).to_s 50 | when 'csl' 51 | JSON.pretty_generate AnyStyle.parser.format_csl(dataset) 52 | when 'ris' 53 | AnyStyle.parser.format_ris(dataset).to_s 54 | when 'json' 55 | JSON.pretty_generate AnyStyle.parser.format_hash(dataset) 56 | when 'ref', 'txt' 57 | dataset.to_txt 58 | when 'xml' 59 | dataset.to_xml(indent: 2).to_s 60 | else 61 | raise ArgumentError, "format not supported: #{fmt}" 62 | end 63 | end 64 | 65 | def extsub(path, new_extname) 66 | basename = path.basename(path.extname) 67 | path.dirname.join("#{basename}#{new_extname}") 68 | end 69 | 70 | def transpose(path, base_path) 71 | if output_folder.nil? 72 | path 73 | else 74 | output_folder.join(path.relative_path_from(base_path)) 75 | end 76 | end 77 | 78 | def set_output_folder(path) 79 | case path 80 | when nil, '-' 81 | options[:stdout] = true 82 | else 83 | @output_folder = Pathname.new(path).expand_path 84 | end 85 | ensure 86 | unless @output_folder.nil? 87 | if @output_folder.exist? 88 | raise ArgumentError, 89 | "not a directory: #{path}" unless @output_folder.directory? 90 | else 91 | @output_folder.mkdir 92 | end 93 | end 94 | end 95 | 96 | def say(*args) 97 | STDERR.puts(*args) if verbose? 98 | end 99 | 100 | def report(error, file) 101 | STDERR.puts "Error processing `#{file}'" 102 | STDERR.puts " #{error.message}" 103 | STDERR.puts " #{error.backtrace[0]}" 104 | STDERR.puts " #{error.backtrace[1]}" 105 | STDERR.puts " ..." 106 | end 107 | 108 | def walk(input) 109 | path = Pathname(input).expand_path 110 | raise ArgumentError, "path does not exist: #{input}" unless path.exist? 111 | 112 | if path.directory? 113 | path.each_child do |file| 114 | begin 115 | yield file, path unless file.directory? 116 | rescue => e 117 | report e, file.relative_path_from(path) 118 | end 119 | end 120 | else 121 | begin 122 | yield path, path.dirname 123 | rescue => e 124 | report e, path.basename 125 | end 126 | end 127 | end 128 | 129 | def write(content, path, base_path) 130 | if stdout? 131 | STDOUT.puts(content) 132 | else 133 | path = transpose(path, base_path) 134 | if !overwrite? && path.exist? 135 | raise RuntimeError, 136 | "file exists, use --overwrite to force saving: #{path}" 137 | end 138 | File.write path, content 139 | end 140 | end 141 | end 142 | end 143 | end 144 | end 145 | -------------------------------------------------------------------------------- /lib/anystyle/cli/commands/check.rb: -------------------------------------------------------------------------------- 1 | module AnyStyle 2 | module CLI 3 | module Commands 4 | class Check < Base 5 | def run(args, params) 6 | walk args[0] do |path| 7 | print 'Checking %.25s' % "#{path.basename}....................." 8 | start = Time.now 9 | stats = check path 10 | report stats, Time.now - start 11 | end 12 | end 13 | 14 | def check(path) 15 | case path.extname 16 | when '.ttx' 17 | AnyStyle.finder.check Wapiti::Dataset.open(path.to_s) 18 | when '.xml' 19 | AnyStyle.parser.check Wapiti::Dataset.open(path.to_s) 20 | else 21 | raise ArgumentError, "cannot check untagged input: #{path}" 22 | end 23 | end 24 | 25 | def report(stats, time) 26 | if stats[:token][:errors] == 0 27 | puts ' ✓ %2ds' % time 28 | else 29 | puts '%4d seq %6.2f%% %6d tok %5.2f%% %2ds' % [ 30 | stats[:sequence][:errors], 31 | stats[:sequence][:rate], 32 | stats[:token][:errors], 33 | stats[:token][:rate], 34 | time 35 | ] 36 | end 37 | end 38 | end 39 | end 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /lib/anystyle/cli/commands/find.rb: -------------------------------------------------------------------------------- 1 | module AnyStyle 2 | module CLI 3 | module Commands 4 | class Find < Base 5 | def run(args, params) 6 | set_output_folder args[1] 7 | walk args[0] do |path, base_path| 8 | say "Analyzing #{path.relative_path_from(base_path)} ..." 9 | doc = find(path.to_s, params) 10 | ref = doc[0].references(normalize_blocks: !params[:solo]) 11 | 12 | if ref.length == 0 13 | say "no references found." 14 | else 15 | say "#{ref.length} references found." 16 | dst = nil 17 | each_format do |fmt| 18 | case fmt 19 | when 'ttx' 20 | res = doc.to_s tagged: true 21 | when 'txt' 22 | res = doc.to_s tagged: false 23 | when 'ref' 24 | res = ref.join("\n") 25 | else 26 | dst ||= parse(ref.join("\n")) 27 | res = format(dst, fmt) 28 | end 29 | 30 | out = extsub(path, ".#{fmt}") 31 | say "Writing #{out.relative_path_from(base_path)} ..." 32 | write res, out, base_path 33 | end 34 | end 35 | end 36 | end 37 | end 38 | end 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /lib/anystyle/cli/commands/parse.rb: -------------------------------------------------------------------------------- 1 | module AnyStyle 2 | module CLI 3 | module Commands 4 | class Parse < Base 5 | def run(args, params) 6 | set_output_folder args[1] 7 | walk args[0] do |path, base_path| 8 | say "Parsing #{path.relative_path_from(base_path)} ..." 9 | dataset = parse_file(path.to_s) 10 | say "#{dataset.length} references found." 11 | each_format do |fmt| 12 | res = format(dataset, fmt) 13 | out = extsub(path, ".#{fmt}") 14 | say "Writing #{out.relative_path_from(base_path)} ..." 15 | write res, out, base_path 16 | end 17 | end 18 | end 19 | end 20 | end 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /lib/anystyle/cli/commands/train.rb: -------------------------------------------------------------------------------- 1 | module AnyStyle 2 | module CLI 3 | module Commands 4 | class Train < Base 5 | def run(args, params) 6 | check_no_overwrite! args[1] 7 | 8 | Wapiti.debug! 9 | model = train(args[0]) 10 | 11 | if args[1].nil? 12 | model.save 13 | else 14 | model.save File.expand_path(args[1]) 15 | end 16 | end 17 | 18 | def train(path) 19 | case 20 | when File.extname(path) == '.xml' 21 | AnyStyle.parser.train Wapiti::Dataset.open(path.to_s) 22 | AnyStyle.parser.model 23 | when File.directory?(path) 24 | AnyStyle.finder.train Dir[File.join(path, '*.ttx')] 25 | AnyStyle.finder.model 26 | else 27 | raise ArgumentError, "cannot train input: #{path}" 28 | end 29 | end 30 | 31 | def check_no_overwrite!(path) 32 | if !overwrite? && (path.nil? || File.exist?(path)) 33 | raise RuntimeError, 34 | "file exists, use --overwrite to force saving: #{path}" 35 | end 36 | end 37 | end 38 | end 39 | end 40 | end 41 | 42 | -------------------------------------------------------------------------------- /lib/anystyle/cli/version.rb: -------------------------------------------------------------------------------- 1 | module AnyStyle 2 | module CLI 3 | VERSION = '1.5.0'.freeze 4 | end 5 | end 6 | --------------------------------------------------------------------------------