├── .gitignore
├── 1.1
    ├── spec.md
    └── spec_zh_CN.md
├── 1.2
    ├── defs.yml
    ├── index.html
    ├── spec.after.html
    ├── spec.before.html
    ├── spec.md
    └── templates
    │   ├── element
    │   └── property
├── Makefile
├── README.md
├── biblio.json
├── gen-defs.py
├── hocr-spec.md
├── images
    ├── baseline.png
    ├── bbox-crop.png
    └── bbox.odg
├── index.html
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | 1.2/include/defs
2 | 


--------------------------------------------------------------------------------
/1.1/spec.md:
--------------------------------------------------------------------------------
   1 | # The hOCR Embedded OCR Workflow and Output Format, version 1.1
   2 | 
   3 | **OBSOLETE**: [Version 1.2](http://kba.github.io/hocr-spec/1.2/) supersedes this document.
   4 | 
   5 | The purpose of this document is to define an open standard for representing OCR
   6 | results. The goal is to reuse as much existing technology as possible, and to
   7 | arrive at a representation that makes it easy to reuse OCR results.
   8 | 
   9 | This is the English translation; a [Chinese translation is available as well](./spec_zh_CN.md).
  10 | 
  11 | 
  12 | ## Table of Contents
  13 | 
  14 | <!-- BEGIN-MARKDOWN-TOC -->
  15 | * [Table of Contents](#table-of-contents)
  16 | * [Revision History](#revision-history)
  17 | * [1 Rationale](#1-rationale)
  18 | * [2 Getting Started](#2-getting-started)
  19 | * [3 Terminology and Representation](#3-terminology-and-representation)
  20 | 	* [General Properties](#general-properties)
  21 | 		* [`bbox`](#bbox)
  22 | 		* [`textangle`](#textangle)
  23 | 	* [Non-recommended general properties](#non-recommended-general-properties)
  24 | 		* [`poly`](#poly)
  25 | 		* [`order`](#order)
  26 | 		* [`presence`](#presence)
  27 | 		* [`cflow`](#cflow)
  28 | 		* [`baseline`](#baseline)
  29 | * [4 Logical Structuring Elements](#4-logical-structuring-elements)
  30 | 	* [`ocr_document`](#ocr_document)
  31 | 	* [`ocr_title`](#ocr_title)
  32 | 	* [`ocr_author`](#ocr_author)
  33 | 	* [`ocr_abstract`](#ocr_abstract)
  34 | 	* [`ocr_part`](#ocr_part)
  35 | 	* [`ocr_chapter`](#ocr_chapter)
  36 | 	* [`ocr_section`](#ocr_section)
  37 | 	* [`ocr_subsubsection`](#ocr_subsubsection)
  38 | 	* [`ocr_display`](#ocr_display)
  39 | 	* [`ocr_blockquote`](#ocr_blockquote)
  40 | 	* [`ocr_par`](#ocr_par)
  41 | 	* [`ocr_linear`](#ocr_linear)
  42 | 	* [`ocr_caption`](#ocr_caption)
  43 | * [5 Typesetting Related Elements](#5-typesetting-related-elements)
  44 | 	* [Classes for typesetting elements](#classes-for-typesetting-elements)
  45 | 		* [`ocr_page`](#ocr_page)
  46 | 		* [`ocr_column`](#ocr_column)
  47 | 		* [`ocr_carea`](#ocr_carea)
  48 | 		* [`ocr_line`](#ocr_line)
  49 | 		* [`ocr_separator`](#ocr_separator)
  50 | 		* [`ocr_noise`](#ocr_noise)
  51 | 	* [Recommended Properties for typesetting elements](#recommended-properties-for-typesetting-elements)
  52 | 		* [`bbox (typesetting)`](#bbox-typesetting)
  53 | 		* [`image`](#image)
  54 | 		* [`imagemd5`](#imagemd5)
  55 | 		* [`ppageno`](#ppageno)
  56 | 		* [`lpageno`](#lpageno)
  57 | 	* [Optional Properties for typesetting elements](#optional-properties-for-typesetting-elements)
  58 | 		* [`scan_res`](#scan_res)
  59 | 		* [`x_scanner`](#x_scanner)
  60 | 		* [`x_source`](#x_source)
  61 | 		* [`hardbreak`](#hardbreak)
  62 | 	* [Classes for floats](#classes-for-floats)
  63 | 		* [`ocr_float`](#ocr_float)
  64 | 		* [`ocr_separator`](#ocr_separator-1)
  65 | 		* [`ocr_textfloat`](#ocr_textfloat)
  66 | 		* [`ocr_textimage`](#ocr_textimage)
  67 | 		* [`ocr_image`](#ocr_image)
  68 | 		* [`ocr_linedrawing`](#ocr_linedrawing)
  69 | 		* [`ocr_photo`](#ocr_photo)
  70 | 		* [`ocr_header`](#ocr_header)
  71 | 		* [`ocr_footer`](#ocr_footer)
  72 | 		* [`ocr_pageno`](#ocr_pageno)
  73 | 		* [`ocr_table`](#ocr_table)
  74 | * [6 Inline Representations](#6-inline-representations)
  75 | 	* [Classes for Inline Representation](#classes-for-inline-representation)
  76 | 		* [`ocr_glyph`](#ocr_glyph)
  77 | 		* [`ocr_glyphs`](#ocr_glyphs)
  78 | 		* [`ocr_dropcap`](#ocr_dropcap)
  79 | 		* [`ocr_chem`](#ocr_chem)
  80 | 		* [`ocr_math`](#ocr_math)
  81 | 		* [Non-breaking space](#non-breaking-space)
  82 | 		* [Non-default spaces](#non-default-spaces)
  83 | 		* [Hyphenation](#hyphenation)
  84 | 		* [Superscript and Subscript](#superscript-and-subscript)
  85 | 		* [Ruby characters](#ruby-characters)
  86 | * [7 Character Information](#7-character-information)
  87 | 	* [Classes for Character Information](#classes-for-character-information)
  88 | 		* [`ocr_cinfo`](#ocr_cinfo)
  89 | 	* [Properties for Character Information](#properties-for-character-information)
  90 | 		* [`cuts`](#cuts)
  91 | 		* [`nlp`](#nlp)
  92 | * [8 OCR Engine-Specific Markup](#8-ocr-engine-specific-markup)
  93 | 	* [Classes for engine specific markup](#classes-for-engine-specific-markup)
  94 | 		* [`ocrx_block`](#ocrx_block)
  95 | 		* [`ocrx_line`](#ocrx_line)
  96 | 		* [`ocrx_word`](#ocrx_word)
  97 | 	* [Properties for engine-specific markup](#properties-for-engine-specific-markup)
  98 | 		* [`x_font`](#x_font)
  99 | 		* [`x_fsize`](#x_fsize)
 100 | 		* [`x_boxes`](#x_boxes)
 101 | 		* [`x_confs`](#x_confs)
 102 | 		* [`x_wconf`](#x_wconf)
 103 | * [9 Font, Text Color, Language, Direction](#9-font-text-color-language-direction)
 104 | * [10 Alternative Segmentations / Readings](#10-alternative-segmentations--readings)
 105 | * [11 Grouped Elements and Multiple Hierarchies](#11-grouped-elements-and-multiple-hierarchies)
 106 | * [12 Capabilities](#12-capabilities)
 107 | 	* [`ocrp_lang`](#ocrp_lang)
 108 | 	* [`ocrp_dir`](#ocrp_dir)
 109 | 	* [`ocrp_poly`](#ocrp_poly)
 110 | 	* [`ocrp_font`](#ocrp_font)
 111 | 	* [`ocrp_nlp`](#ocrp_nlp)
 112 | 	* [`ocr_embeddedformat_<formatname>`](#ocr_embeddedformat_)
 113 | 	* [`ocr_<tag>_unordered`](#ocr__unordered)
 114 | * [13 Profiles](#13-profiles)
 115 | * [14 Required Meta Information](#14-required-meta-information)
 116 | * [15 HTML Markup](#15-html-markup)
 117 | 	* [`html_none`](#html_none)
 118 | 	* [`html_ocr`](#html_ocr)
 119 | 	* [`html_absolute`](#html_absolute)
 120 | 	* [`html_xytable`](#html_xytable)
 121 | 	* [`html_simpl`](#html_simpl)
 122 | 	* [15.1 Restrictions on HTML Content](#151-restrictions-on-html-content)
 123 | 	* [15.2 Recommendations for Mappings](#152-recommendations-for-mappings)
 124 | 		* [15.2.1 html_none](#1521-html_none)
 125 | 		* [15.2.2 html_simple](#1522-html_simple)
 126 | 		* [15.2.3 html_ocr_<engine>](#1523-html_ocr_)
 127 | 		* [15.2.4 html_absolute_<element>](#1524-html_absolute_)
 128 | 		* [15.2.5 html_xytable_absolute](#1525-html_xytable_absolute)
 129 | 		* [15.2.6 html_xytable_relative](#1526-html_xytable_relative)
 130 | 		* [15.2.7 html_<processor>](#1527-html_)
 131 | 			* [`html_latex2html`](#html_latex2html)
 132 | 			* [`html_msword`](#html_msword)
 133 | 			* [`html_ooffice`](#html_ooffice)
 134 | 			* [`html_docbook_xsl`](#html_docbook_xsl)
 135 | * [16 Document Meta Information](#16-document-meta-information)
 136 | * [17 Sample Usage](#17-sample-usage)
 137 | 
 138 | <!-- END-MARKDOWN-TOC -->
 139 | 
 140 | ## Revision History
 141 | 
 142 | hOCR was originally developed by Thomas Breuel.
 143 | 
 144 | See the [releases](https://github.com/kba/hocr-spec/releases/) and full [commit
 145 | history](https://github.com/kba/hocr-spec/commits/) for a revision history.
 146 | 
 147 | ## 1 Rationale
 148 | 
 149 | The purpose of this document is to define an open standard for representing OCR
 150 | results. The goal is to reuse as much existing technology as possible, and to
 151 | arrive at a representation that makes it easy to reuse OCR results.
 152 | 
 153 | 
 154 | ## 2 Getting Started
 155 | 
 156 | This document describes many tags and a lot of information that can be output.
 157 | However, getting started with hOCR is easy: you only need to output the tags
 158 | and information you actually want to.  For example, just outputting `ocr_line`
 159 | tags with bounding boxes is already very useful for many applications.  Just
 160 | start simple and add more output information as the need arises.
 161 | 
 162 | 
 163 | ## 3 Terminology and Representation
 164 | 
 165 | This document describes a representation of various aspects of OCR output in an
 166 | XML-like format. That is, we define a set of tags containing text and other
 167 | tags, together with attributes of those tags.
 168 | 
 169 | However, since the content we are representing is formatted text, we are not
 170 | actually using a new XML format for the representation; instead, we embed the
 171 | representation in XHTML (or HTML) because XHTML and XHTML processing
 172 | already define many aspects of OCR output representation that would otherwise
 173 | need additional, separate and ad-hoc definitions. These aspects include:
 174 | 
 175 | * standard representations for common logical structuring elements, including
 176 |   section headings, citations, tables, emphasis, line breaks, quotations,
 177 |   and preformatted text
 178 | * standard representations for fonts, embedded images, embedded vector
 179 |   graphics, tables, languages, writing direction, colors
 180 | * standard representations for geometric layout and positioning
 181 | * output files that are understood without any further modification by widely
 182 |   used viewers (browsers), editors, conversion tools, and indexing tools
 183 | * libraries for parsing and generating the content
 184 | * support for document metadata
 185 | 
 186 | We are embedding this information inside HTML by encoding it within valid tags
 187 | and attributes inside HTML. We are going to use the terms "elements" and
 188 | "properties" for referring to embedded markup.
 189 | 
 190 | Elements are defined by the class= attribute on an arbitrary HTML tag. All
 191 | elements in this format have a class name of the form `ocr..._...`.
 192 | 
 193 | Properties are defined by putting information into the `title=` attribute of an
 194 | HTML tag. Properties in title attributes are of the form “name values...”, and
 195 | multiple properties are separated by semicolons.
 196 | 
 197 | Here is an example:
 198 | 
 199 | ```html
 200 | <div class="ocr_page" id="page_1">
 201 |   <div class="ocr_carea" id="column_2" title="bbox 313 324 733 1922">
 202 |     <div class="ocr_par" id="par_7"> ... </div>
 203 |     <div class="ocr_par" id="par_19"> ... </div>
 204 |   </div>
 205 | </div>
 206 | ```
 207 | 
 208 | ### General Properties
 209 | 
 210 | The following properties can apply to most elements (where it makes sense):
 211 | 
 212 | #### `bbox`
 213 | 
 214 | * `bbox x0 y0 x1 y1` – the bounding box of the element relative to the
 215 |   binarized document image
 216 |   * use `x_bboxes` below for character bounding boxes
 217 |   * do not use `bbox` unless the bounding box of the layout component is, in
 218 |     fact, rectangular
 219 |   * some non-rectangular layout components may have rectangular bounding boxes
 220 |     if the non-rectangularity is caused by floating elements around which text flows
 221 | 
 222 | See also the section [`bbox (typesetting)`](#bbox-typesetting).
 223 | 
 224 | #### `textangle`
 225 | 
 226 | * `textangle alpha` - the angle in degrees by which textual content has been
 227 |   rotated relative to the rest of the page (if not present, the angle is assumed
 228 |   to be zero); rotations are counter-clockwise, so an angle of 90 degrees is
 229 |   vertical text running from bottom to top in Latin script; note that this is
 230 |   different from reading order, which should be indicated using standard HTML
 231 |   properties
 232 | 
 233 | ### Non-recommended general properties
 234 | 
 235 | The following properties can apply to most elements but should not be used
 236 | unless there is no alternative:
 237 | 
 238 | #### `poly`
 239 | 
 240 | * `poly x0 y0 x1 y1 ...` - a closed polygon for elements with non-rectangular bounds
 241 |   * this property must not be used unless there is no other way of
 242 |     representing the layout of the page using rectangular bounding boxes,
 243 |     since most tools will simply not have the capability of dealing with
 244 |     non-rectangular layouts
 245 |   * note that the natural and correct representation of many non-rectangular
 246 |     layouts is in terms of rectangular content areas and rectangular floats
 247 |   * documents using polygonal borders anywhere must indicate this in the
 248 |     metadata
 249 |   * documents should attempt to provide a reasonable bbox equivalent as well
 250 | 
 251 | #### `order`
 252 | 
 253 | * `order n` – the reading order of the element (an integer)
 254 |   * this property must not be used unless there is no other way of representing
 255 |     the reading order of the page by element ordering within the page, since
 256 |     many tools will not be able to deal with content that is not in reading order
 257 | 
 258 | #### `presence`
 259 | 
 260 | * `presence` presence must be declared in the document meta data
 261 | 
 262 | The following property relates the flow between multiple `ocr_carea` elements,
 263 | and between `ocr_carea` and `ocr_linear` elements.
 264 | 
 265 | #### `cflow`
 266 | 
 267 | * `cflow s` – the content flow on the page that this element is a part of
 268 |   * s must be a unique string for each content flow
 269 |   * must be present on ocr_carea and ocrx_block tags when reading order is
 270 |     attempted and multiple content flows are present
 271 |   * presence must be declared in the document meta data
 272 | 
 273 | This property applies primarily to textlines
 274 | 
 275 | #### `baseline`
 276 | 
 277 | * `baseline pn pn-1 ... p0` - a polynomial describing the baseline of a line of
 278 |   text
 279 |   * the polynomial is in the coordinate system of the line, with the bottom
 280 |     left of the bounding box as the origin
 281 | 
 282 | ## 4 Logical Structuring Elements
 283 | 
 284 | We recognize the following logical structuring elements:
 285 | 
 286 | * `ocr_document`
 287 |   * `ocr_linear`
 288 |     * `ocr_title`
 289 |     * `ocr_author`
 290 |     * `ocr_abstract`
 291 |     * `ocr_part` [`<h1>`]
 292 |       * `ocr_chapter` [`<h1>`]
 293 |         * `ocr_section` [`<h2>`]
 294 |           * `ocr_sub*section` [`<h3>`,`<h4>`]
 295 |             * `ocr_display` 
 296 |             * `ocr_blockquote` [`<blockquote>`]
 297 |             * `ocr_par` [`<p>`]
 298 | 
 299 | ### `ocr_document`
 300 | ### `ocr_title`
 301 | ### `ocr_author`
 302 | ### `ocr_abstract`
 303 | ### `ocr_part`
 304 | ### `ocr_chapter`
 305 | ### `ocr_section`
 306 | ### `ocr_subsubsection`
 307 | ### `ocr_display`
 308 | ### `ocr_blockquote`
 309 | ### `ocr_par`
 310 | 
 311 | These logical tags have their standard meaning as used in the publishing
 312 | industry and tools like LaTeX, MS Word, and others.
 313 | 
 314 | The standard HTML tags given in brackets specify the preferred HTML tags to use
 315 | with those logical structuring elements, but it may not be possible or
 316 | desirable to actually choose those tags (e.g., when adding hOCR information to
 317 | an existing HTML output routine).
 318 | 
 319 | ### `ocr_linear`
 320 | 
 321 | For all of these elements except `ocr_linear`, there exists a natural linear
 322 | ordering defined by reading order (`ocr_linear` indicates that the elements
 323 | contained in it have a linear ordering). At the level of `ocr_linear`, there
 324 | may not be a single distinguished order. A common example of `ocr_linear` is a
 325 | newspaper, in which a single newspaper may contain many linear streams, but there is no
 326 | unique reading order for the different linear streams. OCR evaluation tools should
 327 | therefore be sensitive to the order of all elements other than `ocr_linear`.
 328 | 
 329 | Tags must be nested as indicated by nesting above, but not all tags within the
 330 | hierarchy need to be present.
 331 | 
 332 | Textual information like section numbers and bullets must be represented as
 333 | text inside the containing element.
 334 | 
 335 | Documents whose logical structure does not map naturally onto these logical
 336 | structuring elements must not use them for other purposes.
 337 | 
 338 | ### `ocr_caption`
 339 | 
 340 | Image captions may be indicated using the `ocr_caption` element; such an
 341 | element refers to the image(s) contained within the same float, or the
 342 | immediately adjacent image if both the image and the `ocr_caption` element are
 343 | in running text.
 344 | 
 345 | 
 346 | ## 5 Typesetting Related Elements
 347 | 
 348 | The following typesetting related elements are based on a typesetting model as
 349 | found in most typesetting systems, including
 350 | [XSL:FO](https://www.w3.org/TR/xsl11/#fo-section),
 351 | [(La)TeX](https://latex-project.org/guides/usrguide.pdf),
 352 | [LibreOffice](https://wiki.documentfoundation.org/images/e/e6/WG42-WriterGuideLO.pdf),
 353 | and Microsoft Word.
 354 | 
 355 | In those systems, each page is divided into a number of areas. Each area can
 356 | either be a part of the body text (or multiple body texts, in the case of
 357 | newspaper layouts). The content of the areas derives from a linear stream of
 358 | textual content, which flows into the areas, filling them linewise in their
 359 | preferred directions.
 360 | 
 361 | Overlaid onto the page is a set of floating elements; floating elements exist
 362 | outside the normal reading order. Floating elements may be introduced by the
 363 | textual content, or they may be related to the page itself (anchoring is a
 364 | logical property). In typesetting systems, floating elements may be anchored to
 365 | the page, to paragraphs, or to the content stream. Floating elements can
 366 | overlap content areas and render on top of or under content, or they can force
 367 | content to flow around them. The default for floating elements in this spec is
 368 | that their anchor is undefined (it is a logical property, not a typesetting
 369 | property), and that text flows around them. Note that with rectangular content
 370 | areas and rectangular floats, already a wide variety of non-rectangular text
 371 | shapes can be realized.
 372 | 
 373 | **Issue: there is currently no way of indicating anchoring or flow-around
 374 | properties for floating elements; properties need to be defined for this.**
 375 | 
 376 | ### Classes for typesetting elements
 377 | 
 378 | The following classes, as well as [floats](#classes-for-floats) are used for type-setting
 379 | elements.
 380 | 
 381 | #### `ocr_page`
 382 | 
 383 | * `ocr_page`
 384 | 
 385 | The `ocr_page` element must be present in all hOCR documents.
 386 | 
 387 | #### `ocr_column`
 388 | 
 389 | **DEPRECATED**: Please use [`ocr_carea`](#ocr_carea) instead
 390 | 
 391 | #### `ocr_carea`
 392 | 
 393 | * `ocr_carea`
 394 | 
 395 | "ocr content area" or "body area"
 396 | 
 397 | Used to be called ~~ocr_column~~
 398 | 
 399 | #### `ocr_line`
 400 | 
 401 | Should be in a `<span>`
 402 | 
 403 | #### `ocr_separator`
 404 | 
 405 | * `ocr_separator` (any separator or similar element)
 406 | 
 407 | #### `ocr_noise`
 408 | 
 409 | * `ocr_noise` (any noise element that isn't part of typesetting)
 410 | 
 411 | ### Recommended Properties for typesetting elements
 412 | 
 413 | The following properties should be present:
 414 | 
 415 | #### `bbox (typesetting)`
 416 | 
 417 | * `bbox`
 418 |   * the bounding box of the page; for pages, the top left corner must be at
 419 |     `(0,0)`, so a typical page bounding box will look like `bbox 0 0 2300 3200`
 420 | 
 421 | #### `image`
 422 | 
 423 | * `image imagefile`
 424 |   * image file name used as input
 425 |   * syntactically, must be a UNIX-like pathname or http URL (no Windows pathnames)
 426 |   * may be relative
 427 |   * cannot be resolved to the actual file in general (e.g., if the hOCR file
 428 |     becomes separated from the image file)
 429 |   * if the hOCR file is present in a directory hierarchy or file archive, should
 430 |     resolve to the corresponding image file
 431 | 
 432 | #### `imagemd5`
 433 | 
 434 | * `imagemd5 checksum`
 435 |   * MD5 fingerprint of the image file that this page was derived from
 436 |   * allows re-associating pages with source images
 437 | 
 438 | #### `ppageno`
 439 | 
 440 | * `ppageno n`
 441 |   * the physical page number
 442 |   * the front cover is page number 0
 443 |   * should be unique
 444 |   * must not be present unless the pages in the document have a physical ordering
 445 |   * must not be present unless it is well defined and unique
 446 | 
 447 | #### `lpageno`
 448 | 
 449 | * `lpageno string`
 450 |   * the logical page number expressed on the page
 451 |   * may not be numerical (e.g., Roman numerals)
 452 |   * usually is unique
 453 |   * must not be present unless it has been recognized from the page and is unambiguous
 454 | 
 455 | ### Optional Properties for typesetting elements
 456 | 
 457 | The following properties MAY be present:
 458 | 
 459 | #### `scan_res`
 460 | 
 461 | * `scan_res x_res y_res`
 462 |   * scanning resolution in DPI
 463 | #### `x_scanner`
 464 | 
 465 | * `x_scanner string`
 466 |   * a representation of the scanner
 467 | 
 468 | #### `x_source`
 469 | 
 470 | * `x_source string`
 471 |   * an implementation-dependent representation of the document source
 472 |   * could be a URL or a /gfs/ path
 473 |   * offsets within a multipage format (e.g., TIFF) may be represented using
 474 |     additional strings or using URL parameters or fragments
 475 |   * examples
 476 |     * `x_source /gfs/cc/clean/012345678911 17`
 477 |     * `x_source http://pageserver/012345678911&page=17`
 478 | 
 479 | The `ocr_carea` elements should appear in reading order unless this is impossible
 480 | because of some other structuring requirement. If the document contains multiple
 481 | `ocr_linear` streams, then each `ocr_carea` must indicate which stream it belongs
 482 | to.
 483 | 
 484 | In typesetting systems, content areas are filled with “blocks”, but most of
 485 | those blocks are not recoverable or semantically meaningful. However, one type
 486 | of block is visible and very important for OCR engines: the line. Lines are
 487 | typesetting blocks that only contain glyphs (“inlines” in XSL terminology).
 488 | 
 489 | They are represented by the `ocr_line` area. In addition to the standard
 490 | properties, the `ocr_line` area supports the following additional properties:
 491 | 
 492 | #### `hardbreak`
 493 | 
 494 | * `hardbreak n`
 495 |   * a zero (default) indicates that the end of the line is not a hard
 496 |     (explicit) line break, but a break due to text flow
 497 |   * a one indicates that the end of the line is a hard (explicit) line break
 498 | 
 499 | Any special characters representing the desired end-of-line processing must be
 500 | present inside the `ocr_line` element. Examples of such special characters are a
 501 | soft hyphen ("­", `U+00AD`), a hard line break (`<br>`), or whitespace (` `) for soft
 502 | line breaks.
 503 | 
 504 | Note that for many documents, the actual ground truth careas are well-defined
 505 | by the document style of the original document before printing and scanning.
 506 | From a single page, the `careas` of the original document style cannot be
 507 | recovered exactly. However, the partition of a document by `ocr_carea` for an
 508 | individual page shall be considered correct relative to ground truth if
 509 | 
 510 | 1. all the text contained in a ground truth carea is fully contained within a
 511 |   single `ocr_carea`,
 512 | 2. no text outside a ground truth `carea` is contained within an
 513 |   `ocr_carea`, and 
 514 | 3. the `ocr_careas` appear in the same order as the text flow
 515 |   relationships between the ground truth careas.
 516 | 
 517 | ### Classes for floats
 518 | 
 519 | Floats should not be nested.
 520 | 
 521 | The following floats are defined:
 522 | 
 523 | #### `ocr_float`
 524 | 
 525 | * `ocr_float`
 526 | 
 527 | #### `ocr_separator`
 528 | 
 529 | * `ocr_separator`
 530 | 
 531 | #### `ocr_textfloat`
 532 | 
 533 | * `ocr_textfloat`
 534 | 
 535 | #### `ocr_textimage`
 536 | 
 537 | * `ocr_textimage`
 538 | 
 539 | #### `ocr_image`
 540 | 
 541 | * `ocr_image`
 542 | 
 543 | #### `ocr_linedrawing`
 544 | 
 545 | * `ocr_linedrawing` – something that could be represented well and naturally in
 546 |   a vector graphics format like SVG (even if it is actually represented as PNG)
 547 | 
 548 | #### `ocr_photo`
 549 | 
 550 | * `ocr_photo` – something that requires JPEG or PNG to be represented well
 551 | 
 552 | #### `ocr_header`
 553 | 
 554 | * `ocr_header`
 555 | 
 556 | #### `ocr_footer`
 557 | 
 558 | * `ocr_footer`
 559 | 
 560 | #### `ocr_pageno`
 561 | 
 562 | * `ocr_pageno`
 563 | 
 564 | #### `ocr_table`
 565 | 
 566 | * `ocr_table`
 567 | 
 568 | ## 6 Inline Representations
 569 | 
 570 | There is some content that should behave and flow like text
 571 | 
 572 | ### Classes for Inline Representation
 573 | 
 574 | #### `ocr_glyph`
 575 | 
 576 | * `ocr_glyph` – an individual glyph represented as an image (e.g., an unrecognized character)
 577 |   * must contain a single `<img>` tag, or be present on one
 578 | 
 579 | #### `ocr_glyphs`
 580 | 
 581 | * `ocr_glyphs` – multiple glyphs represented as an image (e.g., an unrecognized word)
 582 |   * must contain a single `<img>` tag, or be present on one
 583 | 
 584 | #### `ocr_dropcap`
 585 | 
 586 | * `ocr_dropcap` – an individual glyph representing a dropcap
 587 |   * may contain text or an `<img>` tag; the `alt` of the image tag should
 588 |     contain the corresponding text
 589 | 
 590 | #### `ocr_chem`
 591 | 
 592 | * `ocr_chem` – a chemical formula
 593 |   * must contain either a single `<img>` tag or
 594 |     [ChemML](http://www.xml-cml.org/) markup, or be present on one
 595 | 
 596 | #### `ocr_math`
 597 | 
 598 | * `ocr_math` – a mathematical formula
 599 |   * must contain either a single `<img>` tag or
 600 |     [MathML](https://www.w3.org/Math/) markup, or be present on one
 601 | 
 602 | Mathematical and chemical formulas that float must be put into an `ocr_float`
 603 | section.
 604 | 
 605 | Mathematical and chemical formulas that are “display” mode should be put into
 606 | an `ocr_display` section.
 607 | 
 608 | #### Non-breaking space
 609 | 
 610 | Non-breaking spaces must be represented using the HTML `&nbsp;` entity.
 611 | 
 612 | #### Non-default spaces
 613 | 
 614 | Different space widths should be indicated using HTML and `&ensp;`, `&emsp;`,
 615 | `&thinsp;`, `&zwnj;`, `&zwj;`.
 616 | 
 617 | #### Hyphenation
 618 | 
 619 | Soft hyphens must be represented using the HTML `&shy;` entity.
 620 | 
 621 | The HTML `&lrm;` and `&rlm;` entities (indicating writing direction) must not
 622 | be used; all writing direction changes must be indicated with tags.
 623 | 
 624 | #### Superscript and Subscript
 625 | 
 626 | Other superscripts and subscripts must be represented using the HTML `<sup>` and
 627 | `<sub>` tags, even if special Unicode characters are available.
 628 | 
 629 | #### Ruby characters
 630 | 
 631 | [Furigana and similar constructs](https://en.wikipedia.org/wiki/Ruby_character)
 632 | must be represented using their correct Unicode encoding.
 633 | 
 634 | ## 7 Character Information
 635 | 
 636 | ### Classes for Character Information
 637 | 
 638 | Character-level information may be put on any element that contains only a
 639 | single "line" of text.
 640 | 
 641 | #### `ocr_cinfo`
 642 | 
 643 | If no other layout element applies, the `ocr_cinfo` element may be used.
 644 | 
 645 | ### Properties for Character Information
 646 | 
 647 | #### `cuts`
 648 | 
 649 | * `cuts c1 c2 c3 ...`
 650 |   * character segmentation cuts (see below)
 651 |   * there must be a bbox property relative to which the cuts can be interpreted
 652 | 
 653 | #### `nlp`
 654 | 
 655 | * `nlp c1 c2 c3 ...`
 656 |   * estimate of the negative log probabilities of each character by the recognizer
 657 | 
 658 | For left-to-right writing directions, cuts are sequences of deltas in the x and
 659 | y direction; the first delta in each path is an offset in the x direction
 660 | relative to the last x position of the previous path. The subsequent deltas
 661 | alternate between up and right moves.
 662 | 
 663 | Assume a bounding box of `(0,0,300,100)`; then
 664 | 
 665 | ```python
 666 | cuts("10 11 7 19") =
 667 |     [ [(10,0),(10,100)], [(21,0),(21,100)], [(28,0),(28,100)], [(47,0),(47,100)] ]
 668 | cuts("10,50,3 11,30,-3") =
 669 |     [ [(10,0),(10,50),(13,50),(13,100)], [(21,0),(21,30),(18,30),(18,100)] ]
 670 | ```
 671 | 
 672 | Here is an example:
 673 | 
 674 | ```html
 675 | <span class="ocr_cinfo" title="bbox 0 0 300 100; nlp 1.7 2.3 3.9 2.7; cuts 9 11 7,8,-2 15 3">hello</span>
 676 | ```
 677 | 
 678 | 
 679 | Cuts are between all codepoints contained within the element, including any
 680 | whitespace and control characters.  Simply use a delta of 0 (zero) for
 681 | invisible codepoints.
 682 | 
 683 | Writing directions other than left-to-right specify cuts as if the bounding box
 684 | for the element had been rotated by a multiple of 90 degrees such that the
 685 | writing direction is left to right, then rotated back.
 686 | 
 687 | It is undefined what happens when cut paths intersect, with the exception that
 688 | a delta of 0 always corresponds to an invisible codepoint.
 689 | 
 690 | 
 691 | ## 8 OCR Engine-Specific Markup
 692 | 
 693 | A few abstractions are used as intermediate abstractions in OCR engines,
 694 | although they do not have a meaning that can be defined either in terms of
 695 | typesetting or logical function. Representing them may be useful to represent
 696 | existing OCR output, say for workflow abstractions.
 697 | 
 698 | Common suggested engine-specific markup are:
 699 | 
 700 | ### Classes for engine specific markup
 701 | 
 702 | #### `ocrx_block`
 703 | 
 704 | * `ocrx_block`
 705 |   * any kind of "block" returned by an OCR system
 706 |   * engine-specific because the definition of a "block" depends on the engine
 707 | 
 708 | #### `ocrx_line`
 709 | 
 710 | * `ocrx_line`
 711 |   * any kind of "line" returned by an OCR system that differs from the standard ocr_line above
 712 |   * might be some kind of "logical" line
 713 | 
 714 | #### `ocrx_word`
 715 | 
 716 | * `ocrx_word`
 717 |   * any kind of "word" returned by an OCR system
 718 |   * engine specific because the definition of a "word" depends on the engine
 719 | 
 720 | The meaning of these tags is OCR engine specific. However, generators should
 721 | attempt to ensure the following properties:
 722 | 
 723 | * an `ocrx_block` should not contain content from multiple ocr_careas
 724 | * the union of all `ocrx_blocks` should approximately cover all `ocr_careas`
 725 | * an `ocrx_block` should contain either a float or body text, but not both
 726 | * an `ocrx_block` should contain either an image or text, but not both
 727 | * an `ocrx_line` should correspond as closely as possible to an `ocr_line`
 728 | * `ocrx_cinfo` should nest inside `ocrx_line`
 729 | * `ocrx_cinfo` should contain only `x_confs`, `x_bboxes`, and `cuts` attributes
 730 | 
 731 | ### Properties for engine-specific markup
 732 | 
 733 | The following properties are defined:
 734 | 
 735 | #### `x_font`
 736 | 
 737 | * `x_font s`
 738 |   * OCR-engine specific font names
 739 | 
 740 | #### `x_fsize`
 741 | 
 742 | * `x_fsize n`
 743 |   * OCR-engine specific font size
 744 | 
 745 | #### `x_boxes`
 746 | 
 747 | * `x_bboxes b1x0 b1y0 b1x1 b1y1 b2x0 b2y0 b2x1 b2y1 ...`
 748 |   * OCR-engine specific boxes associated with each codepoint contained in the
 749 |     element
 750 |   * note that the bbox property is a property for the bounding box of a layout
 751 |     element, not of individual characters
 752 |   * in particular, use `<span class="ocr_cinfo" title="x_bboxes ....">`, not
 753 |     `<span class="ocr_cinfo" title="bbox ...">`
 754 | 
 755 | #### `x_confs`
 756 | 
 757 | * `x_confs c1 c2 c3 ...`
 758 |   * OCR-engine specific character confidences
 759 |   * `c1` etc. must be numbers
 760 |   * higher values should express higher confidences
 761 |   * if possible, convert character confidences to values between 0 and 100 and
 762 |     have them approximate posterior probabilities (expressed in %)
 763 | 
 764 | #### `x_wconf`
 765 | 
 766 | * `x_wconf n`
 767 |   * OCR-engine specific confidence for the entire contained substring
 768 |   * n must be a number
 769 |   * higher values should express higher confidences
 770 |   * if possible, convert word confidences to values between 0 and 100 and have
 771 |     them approximate posterior probabilities (expressed in %)
 772 | 
 773 | 
 774 | ## 9 Font, Text Color, Language, Direction
 775 | 
 776 | OCR-generated font and text color information is encoded using standard HTML
 777 | and CSS attributes on elements with a class of `ocr_...` or `ocrx_...`.
 778 | Language and writing direction should be indicated using the HTML standard
 779 | attributes `lang=` and `dir=`, or alternatively can be indicated as properties on
 780 | elements.
 781 | 
 782 | OCR information and presentation information can be separated by putting the
 783 | CSS info related to the OCR in an outer element with an `ocr_` or `ocrx_` class,
 784 | and then overriding it for the presentation by nesting another `<span>` with the
 785 | actual presentation information inside that:
 786 | 
 787 | ```
 788 | <span class="ocr_cinfo" style="ocr style"><span style="presentation style"> ... </span></span>
 789 | ```
 790 | 
 791 | The CSS3 text layout attributes can be used when necessary. For example, CSS
 792 | supports writing-mode, direction, glyph-orientation, [ISO-15924-based
 793 | script](http://www.unicode.org/iso15924/codelists.html), text-indent, etc.
 794 | 
 795 | 
 796 | ## 10 Alternative Segmentations / Readings
 797 | 
 798 | Alternative segmentations and readings are indicated by a `<span>` with
 799 | `class="alternatives"`. It must contain `<ins>` and `<del>` elements. The first
 800 | contained element should be `<ins>` and represent the most probable interpretation,
 801 | the subsequent ones `<del>`. Each `<ins>` and `<del>` element should have `class="alt"` and a
 802 | property of either `nlp` or `x_cost`. These `<span>`, `<ins>`, and `<del>` tags can nest
 803 | arbitrarily.
 804 | 
 805 | Example:
 806 | 
 807 | ```html
 808 | <span class="alternatives">
 809 |   <ins class="alt" title="nlp 0.3">hello</ins>
 810 |   <del class="alt" title="nlp 1.1">hallo</del>
 811 | </span>
 812 | ```
 813 | 
 814 | Whitespace within the `<span>` but outside the contained `<ins>`/`<del>`
 815 | elements is ignored and should be inserted to improve readability of the HTML
 816 | when viewed in a browser.
 817 | 
 818 | 
 819 | ## 11 Grouped Elements and Multiple Hierarchies
 820 | 
 821 | The different levels of layout information (logical, physical, engine-specific)
 822 | each form hierarchies, but those hierarchies may not be mutually compatible;
 823 | for example, a single `ocr_page` may contain information from multiple sections
 824 | or chapters. To represent both hierarchies within a single document, elements
 825 | may be grouped together.  That is, two elements with the same class may be
 826 | treated as one element by adding a "groupid identifier" property to them and
 827 | using the same identifier. 
 828 | 
 829 | Grouped elements should be logically consistent with the markup they represent;
 830 | for example, it is probably not sensible to use grouped elements to interleave
 831 | parts of two different chapters.  Therefore, grouped elements should usually be
 832 | adjacent in the markup.
 833 | 
 834 | Applications using hOCR may choose to manipulate grouped elements directly, but
 835 | the simplest way of dealing with them is to transform a document with grouped
 836 | elements into one without grouped elements prior to further processing by first
 837 | removing tags that are not of interest for the subsequent processing step, and
 838 | then collapsing grouped elements into single elements.  For example, output
 839 | that contains both logical and physical layout information, where the logical
 840 | layout information uses grouped elements, can be transformed by removing all
 841 | the physical layout information, and then collapsing all split `ocr_chapter`
 842 | elements into single `ocr_chapter` elements based on the groupid.  The result is
 843 | a simple DOM tree.  This transformation can be provided generically as a
 844 | pre-processor or Javascript.
 845 | 
 846 | The presence of grouped elements does not need to be indicated in the header;
 847 | when it affects their operations, hOCR processors should check for the presence
 848 | of grouped elements in the output and fail with an error message if they cannot
 849 | correctly process the hOCR information.
 850 | 
 851 | 
 852 | ## 12 Capabilities
 853 | 
 854 | Any program generating files in this output format must indicate in the
 855 | document metadata what kind of markup it is capable of generating. This
 856 | includes listing the exact set of markup sections that the system could have
 857 | generated, even if it did not actually generate them for the particular
 858 | document.
 859 | 
 860 | The capability to generate specific properties is given by the prefix `ocrp_...`;
 861 | the important properties are:
 862 | 
 863 | ### `ocrp_lang`
 864 | 
 865 | * `ocrp_lang` – capable of generating `lang=` attributes
 866 | 
 867 | ### `ocrp_dir`
 868 | 
 869 | * `ocrp_dir` – capable of generating `dir=` attributes
 870 | 
 871 | ### `ocrp_poly`
 872 | 
 873 | * `ocrp_poly` – capable of generating [polygonal bounds](#poly)
 874 | 
 875 | ### `ocrp_font`
 876 | 
 877 | * `ocrp_font` – capable of generating font information (standard font information)
 878 | 
 879 | ### `ocrp_nlp`
 880 | 
 881 | * `ocrp_nlp` – capable of generating [nlp confidences](#nlp)
 882 | 
 883 | ### `ocr_embeddedformat_<formatname>`
 884 | 
 885 | The capability to generate other specific embedded formats is given by the
 886 | prefix `ocr_embeddedformat_<formatname>`.
 887 | 
 888 | ### `ocr_<tag>_unordered`
 889 | 
 890 | If an OCR engine represents a particular tag but cannot determine reading order
 891 | for that tag, it must specify a capability of `ocr_<tag>_unordered`.
 892 | 
 893 | If a document lists a certain capability but no element or attribute is found
 894 | that corresponds to that capability, users of the document may infer that the
 895 | content is absent in the source document. If a capability is not listed, the
 896 | corresponding element or attribute must not be present in the document.
 897 | 
 898 | 
 899 | ## 13 Profiles
 900 | 
 901 | hOCR provides standard means of marking up information, but it does not mandate
 902 | the presence or absence of particular kinds of information.  For example, an
 903 | hOCR file may contain only logical markup, only physical markup, or only
 904 | engine-specific markup. As a result, merely knowing that OCR output is hOCR
 905 | compliant doesn't tell us whether that file is actually useful for subsequent
 906 | processing.
 907 | 
 908 | OCR systems can use hOCR in various different ways internally, but we will
 909 | eventually define some common profiles that mandate what kinds of information
 910 | need to be present in particular kinds of output.
 911 | 
 912 | Of particular importance are:
 913 | 
 914 | * physical layout profile: OCR output in XHTML format with a defined set of
 915 |   common physical layout markup capabilities (page, carea, floats, line).
 916 |   Logical layout may be present as well, but the document tree structure must
 917 |   represent the physical layout structure, with logical layout elements split
 918 |   and grouped as needed.
 919 | 
 920 | * logical layout profile: OCR output in XHTML format with a defined set of
 921 |   common logical layout markup capabilities (linear, chapter, section,
 922 |   subsection).  Physical layout may be present as well, but the document tree
 923 |   structure must represent the logical layout structure, with logical layout
 924 |   elements split and grouped as needed.
 925 | 
 926 | Other possible profiles might be defined for specific engines or specific document classes:
 927 | 
 928 | * common commercial OCR output (e.g., Abbyy)
 929 |   * ocr_page
 930 |   * ocrx_block, ocrx_line, ocrx_word
 931 |   * ocrp_lang
 932 |   * ocrp_font
 933 | * book target
 934 |   * all logical structuring elements (as applicable), except ocr_linear
 935 |   * ocr_page
 936 | * newspaper target
 937 |   * all logical structuring elements (as applicable)
 938 |   * articles map on ocr_linear
 939 |   * ocr_page
 940 | 
 941 | 
 942 | ## 14 Required Meta Information
 943 | 
 944 | The OCR system is required to indicate the following using meta tags in the header:
 945 | 
 946 | * `<meta name="ocr-system" content="name version"/>`
 947 | * `<meta name="ocr-capabilities" content="capabilities"/>`
 948 |   * see the capabilities defined above
 949 | 
 950 | The OCR system should indicate the following information:
 951 | 
 952 | * `<meta name="ocr-number-of-pages" content="number-of-pages"/>`
 953 | * `<meta name="ocr-langs" content="languages-considered-by-ocr"/>`
 954 |   * use [ISO 639-1](https://www.loc.gov/standards/iso639-2/php/code_list.php) codes
 955 |   * value may be `unknown`
 956 | * `<meta name="ocr-scripts" content="scripts-considered-by-ocr"/>`
 957 |   * use [ISO 15924](http://www.unicode.org/iso15924/codelists.html) letter codes
 958 |   * value may be `unknown`
 959 | 
 960 | 
 961 | ## 15 HTML Markup
 962 | 
 963 | The HTML-based markup is orthogonal to the hOCR-based markup; that is, both can
 964 | be chosen independent of one another. The only thing that needs to be
 965 | consistent between the two markups is the text contained within the tags. hOCR
 966 | and other embedded format tags can be put on HTML tags, or they can be put on
 967 | their own `<div>`/`<span>` tags.
 968 | 
 969 | There are many different choices possible and reasonable for the HTML markup,
 970 | depending on the use and further processing of the document. Each such choice
 971 | must be indicated in the meta data for the document.
 972 | 
 973 | Many mappings derived from existing tools are quite similar, and most follow
 974 | the restrictions and recommendations below already without further
 975 | modifications.
 976 | 
 977 | Depending on the particular HTML markup used in the document, the document is
 978 | suitable for different kinds of processing and use. The formats have the
 979 | following intents:
 980 | 
 981 | ### `html_none`
 982 | 
 983 | * `html_none`: straightforward equivalent of Goodoc or [XDOC](http://www.vividata.com/manuals/core12xdc.pdf)
 984 | 
 985 | ### `html_ocr`
 986 | 
 987 | * `html_ocr`: straightforward recording of commercial OCR system output
 988 | 
 989 | ### `html_absolute`
 990 | 
 991 | * `html_absolute`: target format for services like Google's View as HTML
 992 | 
 993 | ### `html_xytable`
 994 | 
 995 | * `html_xytable`: target format for layout-preserving on-screen document viewing
 996 | 
 997 | ### `html_simple`
 998 | 
 999 | * `html_simple`: target format for convenient on-line viewing and intermediate format for indexing
1000 | 
1001 | As long as a format contains the hOCR information, it can be reprocessed by
1002 | layout analysis software and converted into one of the other formats. In
1003 | particular, we envision layout analysis tools for converting any hOCR document
1004 | into `html_absolute`, `html_xytable`, and `html_simple`. Furthermore,
1005 | internally, a layout analysis system might use `html_xytable` as an
1006 | intermediate format for converting hOCR into `html_simple`.
1007 | 
1008 | 
1009 | ### 15.1 Restrictions on HTML Content
1010 | 
1011 | To avoid problems, any use of HTML markup must follow the following rules:
1012 | 
1013 | * HTML content must not use class names that conflict with any of those defined in this document (`ocr_*`)
1014 | * HTML content must not use the title= attribute on any element with an ocr_* class for any purposes other than encoding OCR-related properties as described in this document
1015 | 
1016 | 
1017 | ### 15.2 Recommendations for Mappings
1018 | 
1019 | When possible, any mapping of logical structure onto HTML should try to follow the following rules:
1020 | 
1021 | * the mapping should be "natural" -- similar to what an author of the document
1022 |   might have entered into a WYSIWYG content creation tool
1023 | * text should be in reading order
1024 | * all tags should be used for the intended purpose (and only for the intended
1025 |   purpose) as defined in the [HTML 4 spec](https://www.w3.org/TR/html4/).
1026 | * floats are contained in `<div>` elements with a `style` that includes a float attribute
1027 | * repeating floating page elements (header/footer) should be repeated and occur
1028 |   in their natural location in reading order (e.g., between pages)
1029 | * embedded images and SVG should be contained in files in the same directory
1030 |   (no `/` in the URL) and embedded with `<img>` and `<embed>` tags, respectively
1031 | 
1032 | Specifically
1033 | 
1034 | * `<em>` and `<strong>` should represent emphasis, and are preferred to `<b>`, `<i>`, and `<u>`
1035 | * `<b>`, `<i>`, and `<u>` should represent a change in the corresponding
1036 |   attribute for the current font (but an OCR font specification must still be
1037 |   given)
1038 | * `<p>` should represent paragraph breaks
1039 | * `<br>` should represent explicit line breaks (not line breaks that happen because of text flow)
1040 | * `<h1>`, ..., `<h6>` should represent the logical nesting structure (if any) of the document
1041 | * `<a>` should represent hyperlinks and references within the document
1042 | * `<blockquote>` should represent indented quotations, but not other uses of indented text.
1043 | * `<ul>`, `<ol>`, `<dl>` should represent lists
1044 | * `<table>` should represent tables, including correct use of the `<th>` tag
1045 | 
1046 | If necessary, the markup may use the following non-standard tags:
1047 | 
1048 | * `<nobr>` to indicate that line breaking is not permitted for the enclosed content
1049 | * `<wbr>` to indicate that line breaking is permitted at that location
1050 | 
1051 | 
1052 | #### 15.2.1 html_none
1053 | 
1054 | The simplest HTML markup for hOCR formats contains no logical markup at all; it
1055 | is simply a collection of `<div>` and `<span>` elements with associated hOCR
1056 | information. Note that such documents can still be rendered visually through
1057 | the use of CSS.
1058 | 
1059 | 
1060 | #### 15.2.2 html_simple
1061 | 
1062 | This is a format that follows the restrictions and recommendations above, and only uses the following tags:
1063 | 
1064 | * `<h1>` ...  `<h6>`
1065 | * `<p>`, `<br>`
1066 | * `<b>`, `<i>`, and `<u>` for appearance changes (bold, italic, underline)
1067 | * `<font>` for any other appearance changes
1068 | * `<a>`
1069 | * `<div>` with a float style for floats
1070 | * `<table>` for tables
1071 | * `<img>` for images
1072 | * all SVG must be externally embedded with the `<embed>` tag
1073 | * the use of other embedded formats is permitted
1074 | * all other uses of `<div>`, `<span>`, `<ins>`, and `<del>` only for hOCR tags or other embedded formats (hCard, …)
1075 | 
1076 | #### 15.2.3 html_ocr_<engine>
1077 | 
1078 | The HTML markup produced by default by the OCR engine for the given document.
1079 | Examples of possible values are:
1080 | 
1081 | * `html_ocr_finereader_8`
1082 | * `html_ocr_textbridge_11`
1083 | * `html_ocr_unknown` – the HTML was generated by some OCR engine, but it's unknown which one
1084 | 
1085 | 
1086 | #### 15.2.4 html_absolute_<element>
1087 | 
1088 | The HTML represents absolute positioning of elements on each page. The possible subformats are:
1089 | 
1090 | * `html_absolute_cols` – absolute positioning of cols
1091 | * `html_absolute_pars` – absolute positioning of paragraphs
1092 | * `html_absolute_lines` – absolute positioning of lines
1093 | * `html_absolute_words` – absolute positioning of words
1094 | * `html_absolute_chars` – absolute positioning of characters
1095 | 
1096 | The ["View as HTML" for PDF
1097 | files](https://googlewebmastercentral.blogspot.de/2011/09/pdfs-in-google-search-results.html)
1098 | feature of Google Search uses `html_absolute_lines`; this is probably the most
1099 | reasonable choice for approximating the appearance of the original document.
1100 | 
1101 | 
1102 | #### 15.2.5 html_xytable_absolute
1103 | 
1104 | The HTML is a table that gives the XY-cut layout segmentation structure of the
1105 | page in tabular form. Note that in this format, text order does not necessarily
1106 | correspond to reading order.
1107 | 
1108 | The format must contain one `<table>` of class ocr_xycut representing each page.
1109 | The `<table>` structure must represent the absolute size of the original page
1110 | element. The markup of the content of the table itself is as in html_simple.
1111 | 
1112 | 
1113 | #### 15.2.6 html_xytable_relative
1114 | 
1115 | The page representation is as in
1116 | [`html_xytable_absolute`](#1525-html_xytable_absolute), but table element sizes
1117 | are expressed in relative terms (percentages).
1118 | 
1119 | 
1120 | #### 15.2.7 html_<processor>
1121 | 
1122 | The HTML represents markup that follows the mappings of the given document
1123 | processor to HTML. Note that the document doesn't actually need to have been
1124 | constructed in the processor and that the processor doesn't need to have been
1125 | used to generate the HTML. For example, the `html_latex2html` tag merely
1126 | indicates that, say, a scanned and ocr'ed article uses the same conventions for
1127 | logical markup tags that an equivalent article actually written in LaTeX and
1128 | actually converted to HTML would have used. Possible subformats are:
1129 | 
1130 | 
1131 | ##### `html_latex2html`
1132 | 
1133 | * `html_latex2html`
1134 | 
1135 | ##### `html_msword`
1136 | 
1137 | * `html_msword` – HTML mapping generated by “Save As HTML”
1138 | 
1139 | ##### `html_ooffice`
1140 | 
1141 | * `html_ooffice` – HTML mapping generated by “Save As HTML”
1142 | 
1143 | ##### `html_docbook_xsl`
1144 | 
1145 | * `html_docbook_xsl` – HTML mapping generated by official XSL style sheets
1146 | 
1147 | 
1148 | ## 16 Document Meta Information
1149 | 
1150 | For document meta information, use the [Dublin Core Embedding into
1151 | HTML](http://dublincore.org/documents/dcq-html/). See also [Citation Guidelines
1152 | for Dublin Core](http://dublincore.org/documents/dc-citation-guidelines/).
1153 | 
1154 | ## 17 Sample Usage
1155 | 
1156 | See also the [hocr-tools](https://github.com/tmbdev/hocr-tools) for more samples.
1157 | 
1158 | The HTML format described here may seem fairly complicated and difficult to
1159 | parse, but because there are lots of tools for manipulating HTML documents,
1160 | they're actually pretty easy to manipulate. Here are some examples:
1161 | 
1162 | ```python
1163 | import libxml2,re,os,string
1164 | 
1165 | # convert the HTML to XHTML (if necessary)
1166 | os.system("tidy -q -asxhtml < page.html > page.xhtml 2> /dev/null")
1167 | 
1168 | # parse the XML
1169 | doc = libxml2.parseFile('page.xhtml')
1170 | 
1171 | # search all nodes having a class of ocr_line
1172 | lines = doc.xpathEval("//*[@class='ocr_line']")
1173 | 
1174 | # a function for extracting the text from a node
1175 | def get_text(node):
1176 |     textnodes = node.xpathEval(".//text()")
1177 |     s = string.join([node.getContent() for node in textnodes])
1178 |     return re.sub(r'\s+',' ',s)
1179 | 
1180 | # a function for extracting the bbox property from a node
1181 | # note that the title= attribute on a node with an ocr_ class must
1182 | # conform with the OCR spec
1183 | 
1184 | def get_bbox(node):
1185 |     data = node.prop('title')
1186 |     bboxre = re.compile(r'\bbbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)')
1187 |     return [int(x) for x in bboxre.search(data).groups()]
1188 | 
1189 | # this extracts all the bounding boxes and the text they contain
1190 | # it doesn't matter what other markup the line node may contain
1191 | for line in lines:
1192 |     print get_bbox(line),get_text(line)
1193 | ```
1194 | 
1195 | Note that the OCR markup, basic HTML markup, and semantic markup can co-exist
1196 | within the same HTML file without interfering with one another.
1197 | 


--------------------------------------------------------------------------------
/1.1/spec_zh_CN.md:
--------------------------------------------------------------------------------
   1 | # hOCR 内置 OCR 工作流程和输出格式, version 1.1
   2 | 
   3 | The purpose of this document is to define an open standard for representing OCR
   4 | results. The goal is to reuse as much existing technology as possible, and to
   5 | arrive at a representation that makes it easy to reuse OCR results.
   6 | 
   7 | 本文的目的是为表达ocr的结果定义一个开放的标准，目标是尽可能多的重现现有技术，使ocr的结果表达更容易的再现。
   8 | 
   9 | This is the Chinese translation, an [English version is available as well](./spec.md).
  10 | 
  11 | ## Table of Contents
  12 | 
  13 | <!-- BEGIN-MARKDOWN-TOC -->
  14 | * [Table of Contents](#table-of-contents)
  15 | * [修订记录 Revision History](#修订记录-revision-history)
  16 | * [1 基本原理 Rationale](#1-基本原理-rationale)
  17 | * [2 入门 Getting Started](#2-入门-getting-started)
  18 | * [3 术语表达 Terminology and Representation](#3-术语表达-terminology-and-representation)
  19 | * [4 逻辑结构元素 Logical Structuring Elements](#4-逻辑结构元素-logical-structuring-elements)
  20 | * [5 排版相关元素 Typesetting Related Elements](#5-排版相关元素-typesetting-related-elements)
  21 | * [6 Inline Representations](#6-inline-representations)
  22 | * [7 字符信息 Character Information](#7-字符信息-character-information)
  23 | * [8 OCR 引擎的特定标记 OCR Engine-Specific Markup](#8-ocr-引擎的特定标记-ocr-engine-specific-markup)
  24 | * [9 字体，文本颜色，语言，方向 Font, Text Color, Language, Direction](#9-字体-文本颜色-语言-方向-font-text-color-language-direction)
  25 | * [10 替代分割/读数 Alternative Segmentations / Readings](#10-替代分割读数-alternative-segmentations--readings)
  26 | * [11 组元素和多个层级 Grouped Elements and Multiple Hierarchies](#11-组元素和多个层级-grouped-elements-and-multiple-hierarchies)
  27 | * [12 功能 Capabilities](#12-功能-capabilities)
  28 | * [13 配置 Profiles](#13-配置-profiles)
  29 | * [14 必需的元信息 Required Meta Information](#14-必需的元信息-required-meta-information)
  30 | * [15 HTML Markup](#15-html-markup)
  31 | 	* [15.1 HTML内容的限制 Restrictions on HTML Content](#151-html内容的限制-restrictions-on-html-content)
  32 | 	* [15.2 映射的建议 Recommendations for Mappings](#152-映射的建议-recommendations-for-mappings)
  33 | 		* [15.2.1 html_none](#1521-html_none)
  34 | 		* [15.2.2 html_simple](#1522-html_simple)
  35 | 		* [15.2.3 html_ocr_<engine>](#1523-html_ocr_)
  36 | 		* [15.2.4 html_absolute_<element>](#1524-html_absolute_)
  37 | 		* [15.2.5 html_xytable_absolute](#1525-html_xytable_absolute)
  38 | 		* [15.2.6 html_xytable_relative](#1526-html_xytable_relative)
  39 | 		* [15.2.7 html_<processor>](#1527-html_)
  40 | * [16 文件元信息 Document Meta Information](#16-文件元信息-document-meta-information)
  41 | * [17 用法示例 Sample Usage](#17-用法示例-sample-usage)
  42 | 
  43 | <!-- END-MARKDOWN-TOC -->
  44 | 
  45 | ## 修订记录 Revision History
  46 | 
  47 | hOCR was originally developed by Thomas Breuel.
  48 | 
  49 | See the [releases](https://github.com/kba/hocr-spec/releases/) and full [commit
  50 | history](https://github.com/kba/hocr-spec/commits/) for a revision history.
  51 | 
  52 | 
  53 | ## 1 基本原理 Rationale
  54 | 
  55 | The purpose of this document is to define an open standard for representing OCR
  56 | results. The goal is to reuse as much existing technology as possible, and to
  57 | arrive at a representation that makes it easy to reuse OCR results.
  58 | 
  59 | 本文的目的是为表达ocr的结果定义一个开放的标准，目标是尽可能多的重现现有技术，使ocr的结果表达更容易的再现。
  60 | 
  61 | 
  62 | ## 2 入门 Getting Started
  63 | 
  64 | This document describes many tags and a lot of information that can be output.
  65 | However, getting started with hOCR is easy: you only need to output the tags
  66 | and information you actually want to.  For example, just outputting `ocr_line`
  67 | tags with bounding boxes is already very useful for many applications.  Just
  68 | start simple and add more output information as the need arises.
  69 | 
  70 | 本文档描述了最终输出的诸多标签和大量信息，hOCR 入门很简单，只要使用这些标签和信息来表示你要输出的内容。例如，只输出带边界框的ocr_line标签对于许多应用来说是非常有用处的。从简单开始，在需要的时候来增加更多的信息。
  71 | 
  72 | 
  73 | ## 3 术语表达 Terminology and Representation
  74 | 
  75 | This document describes a representation of various aspects of OCR output in an
  76 | XML-like format. That is, we define a set of tags containing text and other
  77 | tags, together with attributes of those tags. Note that the content we are
  78 | representing is formatted text.
  79 | 
  80 | For that reason, we are not actually using a new XML format for the representation;
  81 | instead, we embed the representation in XHTML (or HTML) because XHTML and XHTML processing
  82 | already define many aspects of OCR output representation that would otherwise
  83 | need additional, separate and ad-hoc definitions. These aspects include:
  84 | 
  85 | 文档主要描述类似XML格式的OCR输出的各个方面的表示，也就是说，我们定义一套包含文本和其他标签的标签集合，和这些标签的属性。然而，由于想表达的文本是格式化文本，实际上并没有用一个新的xml来表达，而是将表达嵌入到XHTMl(HTML)中，因为XHTMl或者HTML过程已经定义了许多OCR的输出，否则需要额外，单独的临时定义，这些方面包括：
  86 | 
  87 | 
  88 | * standard representations for common logical structuring elements, including
  89 |   section headings, citations, tables, emphasis, line breaks, quotations,
  90 |   and preformatted text
  91 | * standard representations for fonts, embedded images, embedded vector
  92 |   graphics, tables, languages, writing direction, colors
  93 | * standard representations for geometric layout and positioning
  94 | * output files that are understood without any further modification by widely
  95 |   used viewers (browsers), editors, conversion tools, and indexing tools
  96 | * libraries for parsing and generating the content
  97 | * support for document metadata
  98 | 
  99 | *  通用逻辑结构元素的标准表达，包括章节标题，引用，表格，强调，换行符，引文和预格式化文本
 100 | *  字体，嵌入式图像，嵌入式矢量图形，表格，语言，书写方向，颜色的标准表达
 101 | *  几何布局和定位的标准表达
 102 | *  广泛使用的查看器（浏览器），编辑，转换工具，和索引工具，没有使用以上这些工具来对输出文件进一步的修改来增加理解
 103 | *  解析和生成内容库
 104 | *  支持文档元数据
 105 | 
 106 | 
 107 | 
 108 | We are embedding this information inside HTML by encoding it within valid tags
 109 | and attributes inside HTML; we are going to use the terms "elements" and
 110 | "properties" for referring to embedded markup.
 111 | 
 112 | Elements are defined by the class= attribute on an arbitrary HTML tag. All
 113 | elements in this format have a class name of the form `ocr…_…`.
 114 | 
 115 | Properties are defined by putting information into the `title=` attribute of an
 116 | HTML tag. Properties in title attributes are of the form “name values…”, and
 117 | multiple properties are separated by semicolons.
 118 | 
 119 | 通过HTML内有效的标签和属性将上述信息编码嵌入到html中，我们使 用术语“元素”（elements）和“属性”（properties）来指代嵌入式标记，在任意的html标签中通过class=attribute来定义元素。这个格式的所有元素有一个以`ocr.._..` 形式的类名，将信息加入HTML标签属性title=中使用来定义属性（Properties），标题属性的属性以“name values..”形式存在，多属性用分号分隔。
 120 | 
 121 | Here is an example:
 122 | 
 123 | 下面是一个例子：
 124 | 
 125 | ```html
 126 | <div class="ocr_page" id="page_1">
 127 |   <div class="ocr_carea" id="column_2" title="bbox 313 324 733 1922">
 128 |     <div class="ocr_par" id="par_7"> ... </div>
 129 |     <div class="ocr_par" id="par_19"> ... </div>
 130 |   </div>
 131 | </div>
 132 | ```
 133 | 
 134 | The following properties can apply to most elements (where it makes sense):
 135 | 
 136 | * `bbox x0 y0 x1 y1` – the bounding box of the element relative to the
 137 |   binarized document image
 138 |   * use `x_bboxes` below for character bounding boxes
 139 |   * do not use `bbox` unless the bounding box of the layout component is, in
 140 |     fact, rectangular
 141 |   * some non-rectangular layout components may have rectangular bounding boxes
 142 |     if the non-rectangularity is caused by floating elements around which text flows
 143 | 
 144 | * `textangle alpha` - the angle in degrees by which textual content has been
 145 |   rotated relative to the rest of the page (if not present, the angle is assumed
 146 |   to be zero); rotations are counter-clockwise, so an angle of 90 degrees is
 147 |   vertical text running from bottom to top in Latin script; note that this is
 148 |   different from reading order, which should be indicated using standard HTML
 149 |   properties
 150 | 
 151 | 以下属性可以适应于大多数的元素：
 152 | 
 153 | *  bbox x0 y0 x1 y1 ----元素相对应的边框的二值化文档图像
 154 |   * x_bbox表示字符边框（bounding boxes）
 155 |   * 不要使用bbox，除非页面布局组成的边框是矩形
 156 |   * 一些非矩形页面布局组成可能会有矩形边框，如果非矩形是由文本流出周围的浮点元素造成。
 157 | *  textangle alpha 角度是通过文本内容相对于页面其他部分旋转的角度来度量（如果不存在，就为0）；旋转是逆时针的，所以90度是在拉丁脚本中垂直文本方向从底部变成顶部；注意的是这与阅读顺序是不同的，需要使用标准HTML属性来表达。
 158 | 
 159 | 
 160 | 
 161 | The following properties can apply to most elements but should not be used
 162 | unless there is no alternative:
 163 | 
 164 | * `poly x0 y0 x1 y1 ...` - a closed polygon for elements with non-rectangular bounds
 165 |   * this property must not be used unless there is no other way of
 166 |     representing the layout of the page using rectangular bounding boxes,
 167 |     since most tools will simply not have the capability of dealing with
 168 |     non-rectangular layouts
 169 |   * note that the natural and correct representation of many non-rectangular
 170 |     layouts is in terms of rectangular content areas and rectangular floats
 171 |   * documents using polygonal borders anywhere must indicate this in the
 172 |     metadata
 173 |   * documents should attempt to provide a reasonable bbox equivalent as well
 174 | * `order n` – the reading order of the element (an integer)
 175 |   * this property must not be used unless there is no other way of representing
 176 |     the reading order of the page by element ordering within the page, since
 177 |     many tools will not be able to deal with content that is not in reading order
 178 | * `presence` presence must be declared in the document meta data
 179 | 
 180 | 以下属性适用于大部分的内容，但是尽量不要使用，除非没有选择
 181 | 
 182 | * poly x0 y0 x1 y1 ...非矩形边界闭合多边行
 183 |   *  这个属性最好不要用，除非没有方法用矩阵边框来表达页面布局，因为大多数工具没有处理非矩形页面布局的能力
 184 |   *  注意许多非矩形页面布局的自然和正确的表示是在长矩阵内容领域和矩形浮动方面
 185 |   *  文件中使用多边形的边界的任何地方都必须在元数据中表明这一点
 186 |   *  文件同时还必须尝试提供合理的等价bbox
 187 | *  order n ---元素的阅读顺序(整数)
 188 |   *  这个属性最好不要使用，除非没有其他方法可以通过页面内的元素顺序来表达页面的阅读顺序，因为许多工具无法处理没有阅读顺序的内容
 189 | *  presence –若存在必须在文档元数据中表明
 190 | 
 191 | 
 192 | 
 193 | 
 194 | The following property relates the flow between multiple `ocr_carea` elements,
 195 | and between `ocr_carea` and `ocr_linear` elements.
 196 | 
 197 | * `cflow s` – the content flow on the page that this element is a part of
 198 |   * s must be a unique string for each content flow
 199 |   * must be present on ocr_carea and ocrx_block tags when reading order is
 200 |     attempted and multiple content flows are present
 201 |   * presence must be declared in the document meta data
 202 | 
 203 | 以下属性涉及多个ocr_carea元素的流动，以及ocr_carea和ocr_linear元素
 204 | 
 205 | * cflow s –页面上内容的流动，该元素的一部分
 206 |   * s 必须是一个对每个内容流唯一的字符串
 207 |   * 必须存在于ocr_carea和ocrx_block标签中，当尝试阅读顺序和存在多个内容流时
 208 |   * 若存在必须在文件元数据中说明
 209 | 
 210 | 
 211 | This property applies primarily to textlines
 212 | 
 213 | * `baseline pn pn-1 … p0` - a polynomial describing the baseline of a line of
 214 |   text
 215 |   * the polynomial is in the coordinate system of the line, with the bottom
 216 |     left of the bounding box as the origin
 217 | 
 218 | 此属性主要适应于文本行
 219 | 
 220 | * baseline pn pn-1 ..p0-描述文本行的基线多项式
 221 |   * 多项式位于该行的坐标系中，以边界框的左下角为原点
 222 | 
 223 | 
 224 | ## 4 逻辑结构元素 Logical Structuring Elements
 225 | 
 226 | 我们认为存在以下逻辑结构元素:
 227 | We recognize the following logical structuring elements:
 228 | * `ocr_document`
 229 |   * `ocr_linear`
 230 |     * `ocr_title`
 231 |     * `ocr_author`
 232 |     * `ocr_abstract`
 233 |     * `ocr_part` [`<H1>`]
 234 |       * `ocr_chapter` [`<H1>`]
 235 |         * `ocr_section` [`<H2>`]
 236 |           * `ocr_sub*section` [`<H3>`,`<H4>`]
 237 |             * `ocr_display` 
 238 |             * `ocr_blockquote` [`<BLOCKQUOTE>`]
 239 |             * `ocr_par` [`<P>`]
 240 | 
 241 | These logical tags have their standard meaning as used in the publishing
 242 | industry and tools like LaTeX, MS Word, and others.
 243 | 
 244 | 在已出版的产业和工具中比如LaTeX, MS Word和其他，这些逻辑标签具有标准的意义。
 245 | 
 246 | The standard HTML tags given in brackets specify the preferred HTML tags to use
 247 | with those logical structuring elements, but it may not be possible or
 248 | desirable to actually choose those tags (e.g., when adding hOCR information to
 249 | an existing HTML output routine).
 250 | 
 251 | 括号中给出的标准的HTML标签指定首选的HTML标签与逻辑结构元素使用，但它可能无法或者令人满意的实际选择这些标签（例如，当增加hOCR信息到这些现有的HTML输出程序）。
 252 | 
 253 | 
 254 | For all of these elements except `ocr_linear`, there exists a natural linear
 255 | ordering defined by reading order (`ocr_linear` indicates that the elements
 256 | contained in it have a linear ordering). At the level of `ocr_linear`, there
 257 | may not be a single distinguished order. A common example of `ocr_linear` is a
 258 | newspaper, in which a single newspaper may contain many linear, but there is no
 259 | unique reading order for the different linear. OCR evaluation tools should
 260 | therefore be sensitive to the order of all elements other than `ocr_linear`.
 261 | 
 262 | 除了 `ocr_linear` 之外的所有元素，都存在由阅读顺序限定的自然线性排序（`ocr_linear` 表示其包含的元素具有线性排序）。在 `ocr_linear` 的层面，可能不存在单一的特定顺序。`ocr_linear` 的一个常见例子是报纸，其中单份报纸可能包含许多linear，但对于不同的linear没有唯一的阅读顺序。因此OCR评价工具应该对除 `ocr_linear` 以外的所有元素的顺序敏感。
 263 | 
 264 | 
 265 | Tags must be nested as indicated by nesting above, but not all tags within the
 266 | hierarchy need to be present.
 267 | 
 268 | 由上述嵌套所指示的标签必须嵌套的，但分层结构内的所有标签不一定都需要存在。
 269 | 
 270 | 
 271 | Textual information like section numbers and bullets must be represented as
 272 | text inside the containing element.
 273 | 
 274 | 例如章节号和bullets原文信息必须表示为包含的元素中的文本。
 275 | 
 276 | 
 277 | Documents whose logical structure does not map naturally onto these logical
 278 | structuring elements must not use them for other purposes.
 279 | 
 280 | 逻辑结构没有很自然地映射到这些逻辑结构元素中的文件，不得使用它们用于其他目的。
 281 | 
 282 | Image captions may be indicated using the `ocr_caption` element; such an
 283 | element refers to the image(s) contained within the same float, or the
 284 | immediately adjacent image if both the image and the `ocr_caption` element are
 285 | in running text.
 286 | 
 287 | 图片说明可以用“ocr_caption”元素来表示;这样的元素是指包含在同一范围内浮动图像，或者直接相邻的图像如果这两个图像和“ocr_caption”元素是在运行的文本。
 288 | 
 289 | 
 290 | ## 5 排版相关元素 Typesetting Related Elements
 291 | 
 292 | The following typesetting related elements are based on a typesetting model as
 293 | found in most typesetting systems, including
 294 | [XSL:FO](https://www.w3.org/TR/xsl11/#fo-section),
 295 | [(La)TeX](https://latex-project.org/guides/usrguide.pdf),
 296 | [LibreOffice](https://wiki.documentfoundation.org/images/e/e6/WG42-WriterGuideLO.pdf),
 297 | and Microsoft Word.
 298 | 
 299 | 下面排版相关的元素是基于在大多数排版系统中找到的排版模型，包括XSL:FO, (La)TeX, LibreOffice, 和 Microsoft Word。在这些系统中，每个页面被划分成若干个区域。每个区域可以是正文的一部分（或多个正文部分，例如报纸的布局）。区域的内容来自文本内容的线性流，该线性流流入各区域，并沿其首选方向逐行填充它们。
 300 | 
 301 | 
 302 | In those systems, each page is divided into a number of areas. Each area can
 303 | either be a part of the body text (or multiple body texts, in the case of
 304 | newspaper layouts). The content of the areas derives from a linear stream of
 305 | textual content, which flows into the areas, filling them linewise in their
 306 | preferred directions.
 307 | 
 308 | 覆盖在该页面是一组浮动元素；浮动元素存在正常阅读顺序之外。浮动元素可以由文字内容引入，或者它们可以与网页本身有关（锚定anchoring是一个逻辑属性）。
 309 | 
 310 | Overlaid onto the page is a set of floating elements; floating elements exist
 311 | outside the normal reading order. Floating elements may be introduced by the
 312 | textual content, or they may be related to the page itself (anchoring is a
 313 | logical property). In typesetting systems, floating elements may be anchored to
 314 | the page, to paragraphs, or to the content stream. Floating elements can
 315 | overlap content areas and render on top of or under content, or they can force
 316 | content to flow around them. The default for floating elements in this spec is
 317 | that their anchor is undefined (it is a logical property, not a typesetting
 318 | property), and that text flows around them. Note that with rectangular content
 319 | areas and rectangular floats, already a wide variety of non-rectangular text
 320 | shapes can be realized.
 321 | 
 322 | 在排版系统，浮动元素可以锚定到该页面，段落，或向内容流。浮动元素可以重叠内容领域和渲染内容顶部或者底部，或者他们可以强制内容在周围的流动。浮动元素的默认说明是，他们锚是不确定的（这是一个逻辑属性，而不是排版属性），和周围的文本流。需要注意的是矩形内容区域和矩形浮动属性，已经可实现各种各样非矩形文本的形状。
 323 | 
 324 | **Issue: there is currently no way of indicating anchoring or flow-around
 325 | properties for floating elements; properties need to be defined for this.**
 326 | 
 327 | **Issue: 针对浮动元素，目前还没有定义好如何来表示锚点或浮动元素流动属性，需要为这个定义属性。**
 328 | 
 329 | The typesetting related elements therefore are:
 330 | 
 331 | 因此，排版相关的元素有：
 332 | 
 333 | * `ocr_page`
 334 | * `ocr_carea` ("ocr content area" or "body area"; used to be called ~~ocr_column~~)
 335 | * `ocr_line` [`<SPAN>`]
 336 | * (floats)
 337 | * `ocr_separator` (any separator or similar element)
 338 | * `ocr_noise` (any noise element that isn't part of typesetting)
 339 | 
 340 | The `ocr_page` element must be present in all hOCR documents.
 341 | 
 342 | 所有 hOCR 文件必须包含 `ocr_page` 元素。
 343 | 
 344 | The following properties should be present:
 345 | 
 346 | 以下属性应该存在：
 347 | 
 348 | * `bbox`
 349 |   * the bounding box of the page; for pages, the top left corner must be at
 350 |     `(0,0)`, so a typical page bounding box will look like `bbox 0 0 2300 3200`
 351 | * `image imagefile`
 352 |   * image file name used as input
 353 |   * syntactically, must be a UNIX-like pathname or http URL (no Windows pathnames)
 354 |   * may be relative
 355 |   * cannot be resolved to the actual file in general (e.g., if the hOCR file
 356 |     becomes separated from the image file)
 357 |   * if the hOCR file is present in a directory hierarchy or file archive, should
 358 |     resolve to the corresponding image file
 359 | * `imagemd5 checksum`
 360 |   * MD5 fingerprint of the image file that this page was derived from
 361 |   * allows re-associating pages with source images
 362 | * `ppageno n`
 363 |   * the physical page number
 364 |   * the front cover is page number 0
 365 |   * should be unique
 366 |   * must not be present unless the pages in the document have a physical ordering
 367 |   * must not be present unless it is well defined and unique
 368 | * `lpageno string`
 369 |   * the logical page number expressed on the page
 370 |   * may not be numerical (e.g., Roman numerals)
 371 |   * usually is unique
 372 |   * must not be present unless it has been recognized from the page and is unambiguous
 373 | 
 374 | *  bbox
 375 |   *  页面边框；对于页面，左上角必须位于 `(0,0)`，因此一个典型的页面边框看起来像 `bbox 0 0 2300 3200`
 376 | 
 377 | *  image imagefile图像镜像文件
 378 |   *  图片文件名做输入
 379 |   *  语法，必须是一个UNIX类似的路径或HTTP URL（没有Windows路径名）
 380 |   *  可以是相对路径
 381 |   *  一般不能被解析为实际文件（例如，如果该hOCR文件与图像文件分开）
 382 |   *  如果hOCR文件存在于目录层次或文件归档，应解析为相应的图像文件
 383 | *  imagemd5 checksum校验
 384 |   *  这一页的来源的图像文件的MD5指纹
 385 |   *  允许重新关联的页面与源图像
 386 | *  ppageno n
 387 |   *  物理页号
 388 |   *  封面页码0
 389 |   *  应该是唯一的
 390 |   *  一定不能出现，除非该文档的页面有一个物理顺序
 391 |   *  除非它被很好地定义和唯一的，否则不能出现
 392 | *  lpageno string 
 393 |   *  页上表达上的逻辑页号
 394 |   *  未必是数字（例如，罗马数字）
 395 |   *  通常是唯一的
 396 |   *  除非它已经从页面识别并且是不含糊，否则不能存在
 397 | 
 398 | 
 399 | 
 400 | The following properties MAY be present:
 401 | 
 402 | * `scan_res x_res y_res`
 403 |   * scanning resolution in DPI
 404 | * `x_scanner string`
 405 |   * a representation of the scanner
 406 | * `x_source string`
 407 |   * an implementation-dependent representation of the document source
 408 |   * could be a URL or a /gfs/ path
 409 |   * offsets within a multipage format (e.g., TIFF) may be represented using
 410 |     additional strings or using URL parameters or fragments
 411 |   * examples
 412 |     * `x_source /gfs/cc/clean/012345678911 17`
 413 |     * `x_source http://pageserver/012345678911&page=17`
 414 | 
 415 | 以下属性可能存在
 416 | * scan_res x_res y_res
 417 |   * 在DPI扫描分辨率
 418 | * x_scanner string
 419 |   * 扫描器的表示
 420 | * x_source string
 421 |   * 文档源的实现相关的表达式
 422 |   * 可以是一个URL或 /gfs/ 路径
 423 |   * 多页格式（例如，TIFF）内的偏移量可以使用额外的字符串或使用URL参数或片段来表示
 424 |   * 例子
 425 |     * `x_source /gfs/cc/clean/012345678911 17`
 426 |     * x_source http://pageserver/012345678911&page=17
 427 | 
 428 | 
 429 | The `ocr_carea` elements should appear in reading order unless this is impossible
 430 | because of some other structuring requirement. If the document contains multiple
 431 | `ocr_linear` streams, then each `ocr_carea` must indicate which stream it belongs
 432 | to.
 433 | 
 434 | `ocr_carea` 元素应该按阅读顺序出现，除非由于其他一些结构要求而无法做到。如果文档包含多个 `ocr_linear` 流，那么每个 `ocr_carea` 必须表明它属于哪个流。
 435 | 
 436 | In typesetting systems, content areas are filled with “blocks”, but most of
 437 | those blocks are not recoverable or semantically meaningful. However, one type
 438 | of block is visible and very important for OCR engines: the line. Lines are
 439 | typesetting blocks that only contain glyphs (“inlines” in XSL terminology).
 440 | 
 441 | 在排版系统中，内容区域都以“块”填充，但大部分块是无法恢复的，或者没有语义含义。然而，有一种块对OCR引擎是可见而且非常重要的：行。行是仅包含字形（XSL术语中的“内联”）的排版块。
 442 | 
 443 | They are represented by the `ocr_line` area. In addition to the standard
 444 | properties, the `ocr_line` area supports the following additional properties:
 445 | 
 446 | 它们由 `ocr_line` 区域表示。除了标准的属性，`ocr_line` 区域还支持以下附加属性：
 447 | 
 448 | * `hardbreak n`
 449 |   * a zero (default) indicates that the end of the line is not a hard
 450 |     (explicit) line break, but a break due to text flow
 451 |   * a one indicates that the end of the line is a hard (explicit) line break
 452 | 
 453 | * hardbreak n
 454 |   * 0（默认值）表示该行的结尾不是一个（hard）硬（明确的）换行，而是由于文本流中断
 455 |   * 1表示该行的结尾是一个硬（显式）换行
 456 | 
 457 | 
 458 | Any special characters representing the desired end-of-line processing must be
 459 | present inside the `ocr_line` element. Examples of such special characters are a
 460 | soft hyphen ("­", `U+00AD`), a hard line break (`<br>`), or whitespace (` `) for soft
 461 | line breaks.
 462 | 
 463 | 
 464 | 代表所需的最终的行处理的任何特殊字符必须存在 `ocr_line` 元件内部。这样的特殊字符的例子是软连字符（hyphen）（“”,U+00AD），硬换行符（“<br>”）或空格（ ）软换行符。
 465 | 
 466 | Note that for many documents, the actual ground truth careas are well-defined
 467 | by the document style of the original document before printing and scanning.
 468 | From a single page, the `careas` of the original document style cannot be
 469 | recovered exactly. However, the partition of a document by `ocr_carea` for an
 470 | individual page shall be considered correct relative to ground truth if
 471 | 
 472 | 注意，对于许多文档，打印和扫描之前，实际基准careas由原始文档的文档样式明确定义。从单一的页面，原来的文档样式的 `careas` 不能精确恢复。 然而，通过 `ocr_carea` 为单个页面文档的分区应被相对于基准来说被视为正确的，如果
 473 | 
 474 | 1. all the text contained in a ground truth carea is fully contained within a
 475 |   single `ocr_carea`,
 476 | 2. no text outside a ground truth `carea` is contained within an
 477 |   `ocr_carea`, and 
 478 | 3. the `ocr_careas` appear in the same order as the text flow
 479 |   relationships between the ground truth careas.
 480 | 
 481 | 1、单个ocr_carea完全包含基准中包含的所有文本
 482 | 2、ocr_carea不会包含基准carea以外的文本
 483 | 3、ocr_careas与基准careas之间的文本流的关系有相同的顺序。
 484 | 
 485 | 
 486 | The following floats are defined:
 487 | 
 488 | * `ocr_float`
 489 | * `ocr_separator`
 490 | * `ocr_textfloat`
 491 | * `ocr_textimage`
 492 | * `ocr_image`
 493 | * `ocr_linedrawing` – something that could be represented well and naturally in
 494 |   a vector graphics format like SVG (even if it is actually represented as PNG)
 495 | * `ocr_photo` – something that requires JPEG or PNG to be represented well
 496 | * `ocr_header`
 497 | * `ocr_footer`
 498 | * `ocr_pageno`
 499 | * `ocr_table`
 500 | 
 501 | Floats should not be nested.
 502 | 
 503 | 以下是定义好的浮动属性：
 504 | * ocr_float
 505 | * ocr_separator
 506 | * ocr_textfloat
 507 | * ocr_textimage
 508 | * ocr_image
 509 | * ocr_linedrawing -可以用如SVG的矢量图形格式很好且自然地表达的内容（即使它实际上表示为PNG）
 510 | * ocr_photo -这需要JPEG或PNG来表示良好
 511 | * ocr_header
 512 | * ocr_footer
 513 | * ocr_pageno
 514 | * ocr_table
 515 | 
 516 | 
 517 | 浮动元素不应被嵌套。
 518 | 
 519 | 
 520 | 
 521 | ## 6 Inline Representations
 522 | 
 523 | There is some content that should behave and flow like text
 524 | 
 525 | 其中有一些内容应该和文本类似
 526 | 
 527 | * `ocr_glyph` – an individual glyph represented as an image (e.g., an unrecognized character)
 528 |   * must contain a single `<IMG>` tag, or be present on one
 529 | * `ocr_glyphs` – multiple glyphs represented as an image (e.g., an unrecognized word)
 530 |   * must contain a single `<IMG>` tag, or be present on one
 531 | * `ocr_dropcap` – an individual glyph representing a dropcap
 532 |   * may contain text or an `<IMG>` tag; the `ALT` of the image tag should
 533 |     contain the corresponding text
 534 | * `ocr_glyphs` – a collection of glyphs represented as an image
 535 |   * must contain a single `<IMG>` tag, or be present on one
 536 | * `ocr_chem` – a chemical formula
 537 |   * must contain either a single `<IMG>` tag or ChemML markup, or be present on one
 538 | * `ocr_math` – a mathematical formula
 539 |   * must contain either a single `<IMG>` tag or MathML markup, or be present on one
 540 | 
 541 | 
 542 | * ocr_glyph –-单个字形表示为图像（例如，一个无法识别的字符）
 543 |   * 必须包含单个IMG标签，或者本身位于一个IMG标签上
 544 | * ocr_glyphs –多个字形表示一个图像（例如，一个无法识别的单词）
 545 |   * 必须包含单个IMG标签，或者可以单独表达为一个
 546 | * ocr_dropcap –单个字形表示一个dropcap
 547 |   * 可能包含文本或IMG标签;图像标记的ALT应包含相应的文本
 548 | * ocr_glyphs -字形的集合，表示为一幅图像
 549 |   * 必须包含单个IMG标签，或者可以存在于一个
 550 | * ocr_chem –一个化学式
 551 |   * 必须包含一个IMG标签或ChemML标记，或者本身位于其中之一上
 552 | * ocr_math -数学公式
 553 |   * 必须包含一个IMG标签或MathML标记，或者本身位于其中之一上
 554 | 
 555 | 
 556 | 
 557 | Mathematical and chemical formulas that float must be put into an `ocr_float`
 558 | section.
 559 | 
 560 | 
 561 | 数学和化学公式的浮动必须加入ocr_float部分。
 562 | 
 563 | Mathematical and chemical formulas that are “display” mode should be put into
 564 | an `ocr_display` section.
 565 | 
 566 | 数学和化学公式的“显示”模式应该加入ocr_display部分。
 567 | 
 568 | Non-breaking spaces must be represented using the HTML `&nbsp;` entity.
 569 | 
 570 | 非中断/换行空格必须使用HTML `&nbsp;` 实体来表示。
 571 | 
 572 | Soft hyphens must be represented using the HTML `&shy;` entity.
 573 | 
 574 | 软连字符必须使用HTML `&shy;` 实体表示。
 575 | 
 576 | Different space widths should be indicated using the HTML entities `&ensp;`, `&emsp;`, `&thinsp;`,
 577 | `&zwnj;`, `&zwj;`.
 578 | 
 579 | 不同空格宽度应该使用HTML实体 `&ensp;`、 `&emsp;`、 `&thinsp;`、 `&zwnj;`、 `&zwj;` 表示。
 580 | 
 581 | 
 582 | The HTML `&lrm;` and `&rlm;` entities (indicating writing direction) must not
 583 | be used; all writing direction changes must be indicated with tags.
 584 | 
 585 | 
 586 | 在HTML `&lrm;` 和 `&rlm;` 实体（指示书写方向）不能再使用; 所有的书写方向变化必须用标签来表示。
 587 | 
 588 | Other superscripts and subscripts must be represented using the HTML `<SUP>` and
 589 | `<SUB>` tags, even if special Unicode characters are available.
 590 | 
 591 | 其他上标和下标必须使用HTML `<SUP>` 和 `<SUB>` 标签，即使存在特殊的Unicode字符。
 592 | 
 593 | Furigana and similar constructs must be represented using their correct Unicode
 594 | encoding.
 595 | 
 596 | 假名和类似的结构，必须使用其正确的Unicode编码来表示。
 597 | 
 598 | 
 599 | ## 7 字符信息 Character Information
 600 | 
 601 | Character-level information may be put on any element that contains only a
 602 | single "line" of text; if no other layout element applies, the `ocr_cinfo`
 603 | element may be used.
 604 | 
 605 | 字符级信息可能放在任何一个只包含一个文本“线”的元素;如果没有其他的布局元件适用，`ocr_cinfo` 元件可以如下使用
 606 | 
 607 | 
 608 | * `cuts c1 c2 c3 …`
 609 |   * character segmentation cuts (see below)
 610 |   * there must be a bbox property relative to which the cuts can be interpreted
 611 | * `nlp c1 c2 c3 …`
 612 |   * estimate of the negative log probabilities of each character by the recognizer
 613 | 
 614 | * cuts C1 C2 C3 ...
 615 |   * 字符片段的分割（见下文）
 616 |   * 相对于哪个分割可以被解释必须有一个bbox属性
 617 | * nlp C1 C2 C3 ...
 618 |   * 由识别程序估算每个字符的负对数概率
 619 | 
 620 | 
 621 | For left-to-right writing directions, cuts are sequences of deltas in the x and
 622 | y direction; the first delta in each path is an offset in the x direction
 623 | relative to the last x position of the previous path. The subsequent deltas
 624 | alternate between up and right moves.
 625 | 
 626 | 对于从左到右的书写方向，分割是x和y方向的增量（delta）序列；在各路径中的首个增量是x方向上相对于前一路径的最后x位置的偏移。后续增量在向上和向右移动之间交替。
 627 | 
 628 | 
 629 | Assume a bounding box of `(0,0,300,100)`; then
 630 | 
 631 | 假设边界框为（0,0,300,100）;然后
 632 | 
 633 | 
 634 | ```
 635 | cuts("10 11 7 19") =
 636 |     [ [(10,0),(10,100)], [(21,0),(21,100)], [(28,0),(28,100)], [(47,0),(47,100)] ]
 637 | cuts("10,50,3 11,30,-3") =
 638 |     [ [(10,0),(10,50),(13,50),(13,100)], [(21,0),(21,30),(18,30),(18,100)] ]
 639 | ```
 640 | 
 641 | Here is an example:
 642 | 这有一个例子  
 643 | 
 644 | ```html
 645 | <span class="ocr_cinfo" title="bbox 0 0 300 100; nlp 1.7 2.3 3.9 2.7; cuts 9 11 7,8,-2 15 3">hello</span>
 646 | ```
 647 | 
 648 | 
 649 | Cuts are between all codepoints contained within the element, including any
 650 | whitespace and control characters.  Simply use a delta of 0 (zero) for
 651 | invisible codepoints.
 652 | 
 653 | cuts位于元素所包含的所有代码点之间，包括任何空白和控制字符。对于不可见的代码点，只需使用0（零）增量。
 654 | 
 655 | 
 656 | Writing directions other than left-to-right specify cuts as if the bounding box
 657 | for the element had been rotated by a multiple of 90 degrees such that the
 658 | writing direction is left to right, then rotated back.
 659 | 
 660 | 除了从左到右，其他书写方向指定分割，如果该元素的边界框已被旋转多个90度，使得书写方向是从左向右，需要转动回来。
 661 | 
 662 | It is undefined what happens when cut paths intersect, with the exception that
 663 | a delta of 0 always corresponds to an invisible codepoint.
 664 | 
 665 | 未定义当分割路径交错时会发生什么，特例是delta 0总是对应于一个无形的代码点。
 666 | 
 667 | ## 8 OCR 引擎的特定标记 OCR Engine-Specific Markup
 668 | 
 669 | A few abstractions are used as intermediate abstractions in OCR engines,
 670 | although they do not have a meaning that can be defined either in terms of
 671 | typesetting or logical function. Representing them may be useful to represent
 672 | existing OCR output, say for workflow abstractions.
 673 | 
 674 | 一些抽象的被作为OCR引擎中间抽象，尽管按排版或逻辑函数定义他们没有具体含义。表达他们可能对于表示现有的OCR输出是有用的，说说工作流程的抽象。
 675 | 
 676 | Common suggested engine-specific markup are:
 677 | 
 678 | 常见的引擎特定标记是：
 679 | 
 680 | * `ocrx_block`
 681 |   * any kind of "block" returned by an OCR system
 682 |   * engine-specific because the definition of a "block" depends on the engine
 683 | * `ocrx_line`
 684 |   * any kind of "line" returned by an OCR system that differs from the standard ocr_line above
 685 |   * might be some kind of "logical" line
 686 | * `ocrx_word`
 687 |   * any kind of "word" returned by an OCR system
 688 |   * engine specific because the definition of a "word" depends on the engine
 689 | 
 690 | * ocrx_block
 691 |   * 任何一种通过OCR系统返回的“块”
 692 |   * 引擎特有的，因为一个“块”的定义取决于引擎
 693 | * ocrx_line
 694 |   * OCR系统返回的任何形式的“line”，与上面的标准ocr_line不同
 695 |   * 可能是某种“逻辑”行
 696 | * ocrx_word
 697 |   * 任何形式的“字”通过OCR系统返回
 698 |   * 引擎特定的，因为一个“字”的定义取决于引擎
 699 | 
 700 | 
 701 | The meaning of these tags is OCR engine specific. However, generators should
 702 | attempt to ensure the following properties:
 703 | 
 704 | * an `ocrx_block` should not contain content from multiple ocr_careas
 705 | * the union of all `ocrx_blocks` should approximately cover all `ocr_careas`
 706 | * an `ocrx_block` should contain either a float or body text, but not both
 707 | * an `ocrx_block` should contain either an image or text, but not both
 708 | * an `ocrx_line` should correspond as closely as possible to an `ocr_line`
 709 | * `ocrx_cinfo` should nest inside `ocrx_line`
 710 | * `ocrx_cinfo` should contain only `x_conf`, `x_bboxes`, and `cuts` attributes
 711 | 
 712 | 
 713 | 这些标签的含义是OCR引擎特有的。然而，发生器应当尽可能的确保以下属性：
 714 | * 一个ocrx_block不应该包含来自多个ocr_careas内容
 715 | * 所有ocrx_blocks的联合应该近似覆盖所有ocr_careas
 716 | * 一个ocrx_block应该包含一个浮动float或正文文本，但不能同时包含
 717 | * 一个ocrx_block应包含的图像或文字，但不能同时包含
 718 | * 一个ocrx_line应尽可能地对应于ocr_line
 719 | * ocrx_cinfo应该嵌套在ocrx_line里面
 720 | * ocrx_cinfo应该只包含x_conf，x_bboxes和cuts属性
 721 | 
 722 | The following properties are defined:
 723 | 
 724 | * `x_font s`
 725 |   * OCR-engine specific font names
 726 | * `x_fsize n`
 727 |   * OCR-engine specific font size
 728 | * `x_bboxes b1x0 b1y0 b1x1 b1y1 b2x0 b2y0 b2x1 b2y1 …`
 729 |   * OCR-engine specific boxes associated with each codepoint contained in the
 730 |     element
 731 |   * note that the bbox property is a property for the bounding box of a layout
 732 |     element, not of individual characters
 733 |   * in particular, use `<span class="ocr_cinfo" title="x_bboxes ....">`, not
 734 |     `<span class="ocr_cinfo" title="bbox ...">`
 735 | * `x_confs c1 c2 c3 …`
 736 |   * OCR-engine specific character confidences
 737 |   * `c1` etc. must be numbers
 738 |   * higher values should express higher confidences
 739 |   * if possible, convert character confidences to values between 0 and 100 and
 740 |     have them approximate posterior probabilities (expressed in %)
 741 | * `x_wconf n`
 742 |   * OCR-engine specific confidence for the entire contained substring
 743 |   * n must be a number
 744 |   * higher values should express higher confidences
 745 |   * if possible, convert word confidences to values between 0 and 100 and have
 746 |     them approximate posterior probabilities (expressed in %)
 747 | 
 748 | 下列属性定义如下：
 749 | * x_font s
 750 |   * OCR引擎特定的字体名称
 751 | * x_fsize n
 752 |   * OCR引擎特定的字体大小
 753 | * x_bboxes b1x0 b1y0 b1x1 b1y1 b2x0 b2y0 b2x1 b2y1 ...
 754 |   * 与每个元素所包含的编码点相关联的OCR引擎特定方框
 755 |   * 注意，bbox属性是一个布局元素的边框属性，而不是单个字符
 756 |   * 尤其是，使用 `<span class="ocr_cinfo" title="x_bboxes ....">`，
 757 |      而不是 `<span class="ocr_cinfo" title="bbox ...">`
 758 | * x_confs C1 C2 C3 ...
 759 |   * OCR引擎特定的字符可信度
 760 |   * C1等，必须是数字
 761 |   * 较高的值应表达较高的可信度
 762 |   * 如果可能的话，转换的字符置信度到值0和100之间，并让它们近似后验概率（以％表示）
 763 | * x_wconf n
 764 |   * 包含子串的全部的OCR引擎特定置信度
 765 |   * n必须是一个数
 766 |   * 较高的值应表达较高的可信度
 767 |   * 如果可能的话，转换字置信度到值0和100之间，并让它们近似后验概率（以％表示）
 768 | 
 769 | ## 9 字体，文本颜色，语言，方向 Font, Text Color, Language, Direction
 770 | 
 771 | OCR-generated font and text color information is encoded using standard HTML
 772 | and CSS attributes on elements with a class of `ocr_...` or `ocrx_...`.
 773 | Language and writing direction should be indicated using the HTML standard
 774 | attributes `lang=` and `dir=`, or alternatively can be indicated as properties on
 775 | elements.
 776 | 
 777 | OCR生成的字体和文本颜色信息使用标准的HTML和CSS中带ocr _...或ocrx _类的元素属性。语言和书写方向应使用HTML标准属性lang= 和dir= 来表示，或也可以表示为元素的属性。
 778 | 
 779 | OCR information and presentation information can be separated by putting the
 780 | CSS info related to the OCR in an outer element with an `ocr_` or `ocrx_` class,
 781 | and then overriding it for the presentation by nesting another `<SPAN>` with the
 782 | actual presentation information inside that:
 783 | 
 784 | 通过将相关的CSS的信息加入ocr_或ocrx_类外部元件，然后通过嵌套另一个有内部的实际显示信息的<SPAN>来超越它以达到OCR信息和表达信息的分离：
 785 | 
 786 | ```
 787 | <span class="ocr_cinfo" style="ocr style"><span style="presentation style"> ... </span></span>
 788 | ```
 789 | 
 790 | The CSS3 text layout attributes can be used when necessary. For example, CSS
 791 | supports writing-mode, direction, glyph-orientation, [ISO-15924-based
 792 | script](http://www.unicode.org/iso15924/codelists.html), text-indent, etc.
 793 | 
 794 | CSS3的文本布局属性可在必要时使用。例如，CSS支持写入模式，方向，字形取向基于ISO-15924脚本，文本缩进，等等。
 795 | 
 796 | ## 10 替代分割/读数 Alternative Segmentations / Readings
 797 | 
 798 | Alternative segmentations and readings are indicated by a `<SPAN>` with
 799 | `class="alternatives"`. It must contain `<INS>` and `<DEL>` elements. The first
 800 | contained element should be `<INS>` and represent the most probable interpretation,
 801 | the subsequent ones `<DEL>`. Each `<INS>` and `<DEL>` element should have `class="alt"` and a
 802 | property of either `nlp` or `x_cost`. These `<SPAN>`, `<INS>`, and `<DEL>` tags can nest
 803 | arbitrarily.
 804 | 
 805 | 可选分割和读数通过一个带有 `class="alternatives"` 的 `<SPAN>` 表示。它必须包含 `<INS>` 和 `<DEL>` 元素。第一个包含的元素应该是 `<INS>`，代表最可能的解释，后续的是 `<DEL>`。每个 `<INS>` 和 `<DEL>` 元素应该有 `class="alt"` 以及属性 `nlp` 或 `x_cost`。这些 `<SPAN>`、`<INS>` 和 `<DEL>` 标签可以任意嵌套。
 806 | 
 807 | Example:
 808 | 
 809 | 例如：
 810 | 
 811 | ```html
 812 | <SPAN class="alternatives">
 813 | <INS class="alt" title="nlp 0.3">hello</INS>
 814 | <DEL class="alt" title="nlp 1.1">hallo</DEL>
 815 | </SPAN>
 816 | ```
 817 | 
 818 | Whitespace within the `<SPAN>` but outside the contained `<INS>`/`<DEL>`
 819 | elements is ignored and should be inserted to improve readability of the HTML
 820 | when viewed in a browser.
 821 | 
 822 | 在span里面但是在包含INS / DEL元素之外的空白将被忽略，但是应该被插入，以提高浏览器中查看时HTML的可读性。
 823 | 
 824 | ## 11 组元素和多个层级 Grouped Elements and Multiple Hierarchies
 825 | 
 826 | The different levels of layout information (logical, physical, engine-specific)
 827 | each form hierarchies, but those hierarchies may not be mutually compatible;
 828 | for example, a single ocr_page may contain information from multiple sections
 829 | or chapters. To represent both hierarchies within a single document, elements
 830 | may be grouped together.  That is, two elements with the same class may be
 831 | treated as one element by adding a "groupid identifier" property to them and
 832 | using the same identifier. 
 833 | 
 834 | 布局信息（逻辑，物理，引擎特有的）的不同层次每一个形成一个层次结构，但这些层次结构可能不相互兼容;例如，一个单一的ocr_page可含有多个部分或章节的信息。为了表示一个文档中两个层次结构，元素可以组合在一起。也就是说，通过增加一个“组识别符号”（grouped identifier）属性，并使用相同的标识符,使具有相同类中的两个元件被视为一个元素。
 835 | 
 836 | 
 837 | Grouped elements should be logically consistent with the markup they represent;
 838 | for example, it is probably not sensible to use grouped elements to interleave
 839 | parts of two different chapters.  Therefore, grouped elements should usually be
 840 | adjacent in the markup.
 841 | 
 842 | 分组元素应与他们所代表的标记逻辑一致的;例如，使用分组的元素来交错两个不同的章节部分是不明智的。因此，分组元素通常应该在标记附近。
 843 | 
 844 | Applications using hOCR may choose to manipulate grouped elements directly, but
 845 | the simplest way of dealing with them is to transform a document with grouped
 846 | elements into one without grouped elements prior to further processing by first
 847 | removing tags that are not of interest for the subsequent processing step, and
 848 | then collapsing grouped elements into single elements.  For example, output
 849 | that contains both logical and physical layout information, where the logical
 850 | layout information uses grouped elements, can be transformed by removing all
 851 | the physical layout information, and then collapsing all split ocr_chapter
 852 | elements into single ocr_chapter elements based on the groupid.  The result is
 853 | a simple DOM tree.  This transformation can be provided generically as a
 854 | pre-processor or Javascript.
 855 | 
 856 | 使用hOCR的应用程序/系统可以选择直接操纵分组的元素，但是处理它们的最简单的方法是 首先除去那些后续处理步骤不感兴趣的标签，将一个有分组元素的文件转换为不含分组元素文件，然后使分组元素折叠成单一的元件。例如，同时包含逻辑和物理布局信息的输出，其中逻辑布局信息使用被分组元素，可通过除去所有的物理布局信息，然后基于组折叠所有分离的ocr_chapter元素成一个单独的ocr_chapter元素。其结果是一个简单的DOM树。这种转变可以提供一般地预处理器或Javascript。
 857 | 
 858 | The presence of grouped elements does not need to be indicated in the header;
 859 | when it affects their operations, hOCR processors should check for the presence
 860 | of grouped elements in the output and fail with an error message if they cannot
 861 | correctly process the hOCR information.
 862 | 
 863 | 分组元素的存在并不需要在文件头中指出；当它影响其操作时，hOCR处理器应该检查输出中是否存在分组元素，如果无法正确处理hOCR信息，则应报错并终止。
 864 | 
 865 | ## 12 功能 Capabilities
 866 | 
 867 | Any program generating files in this output format must indicate in the
 868 | document metadata what kind of markup it is capable of generating. This
 869 | includes listing the exact set of markup sections that the system could have
 870 | generated, even if it did not actually generate them for the particular
 871 | document.
 872 | 
 873 | 任何以这个输出格式的生成文件程序必须在文档元数据中指出它能够产生什么样的标记。这包括列出该系统可能产生的标记的确切集合，即使它实际上没有产生它们用于特定文档。
 874 | 
 875 | The capability to generate specific properties is given by the prefix `ocrp_…`;
 876 | the important properties are:
 877 | 
 878 | 生成特定属性的能力是由前缀ocrp_ ...给出;重要的属性有：
 879 | 
 880 | * `ocrp_lang` – capable of generating lang= attributes
 881 | * `ocrp_dir` – capable of generating dir= attributes
 882 | * `ocrp_poly` – capable of generating polygonal bounds
 883 | * `ocrp_font` – capable of generating font information (standard font information)
 884 | * `ocrp_nlp` – capable of generating nlp confidences
 885 | 
 886 | * ocrp_lang -能够产生lang= attributes
 887 | * ocrp_dir -能够产生dir= attributes
 888 | * ocrp_poly -能够产生多边形界
 889 | * ocrp_font -能够产生字体信息（标准字体信息）
 890 | * ocrp_nlp -能够产生NLP置信
 891 | 
 892 | 
 893 | The capability to generate other specific embedded formats is given by the
 894 | prefix `ocr_embeddedformat_<formatname>`.
 895 | 
 896 | 生成其他特定的嵌入式格式的能力是由前缀 `ocr_embeddedformat_<formatname>` 给出。
 897 | 
 898 | If an OCR engine represents a particular tag but cannot determine reading order
 899 | for that tag, it must specify a capability of `ocr_<tag>_unordered`.
 900 | 
 901 | 如果OCR引擎能够表示某个特定的标签，但不能确定该标签的阅读顺序，它必须声明 `ocr_<tag>_unordered` 功能。
 902 | 
 903 | If a document lists a certain capability but no element or attribute is found
 904 | that corresponds to that capability, users of the document may infer that the
 905 | content is absent in the source document. If a capability is not listed, the
 906 | corresponding element or attribute must not be present in the document.
 907 | 
 908 | 如果文档列出了某些功能，但没有元素或属性是发现对应于该功能，该文件的用户可以推断该内容是在源文档中不存在。如果未列出的功能，相应的元素或属性不能出现在文档中。
 909 | 
 910 | ## 13 配置 Profiles
 911 | 
 912 | hOCR provides standard means of marking up information, but it does not mandate
 913 | the presence or absence of particular kinds of information.  For example, an
 914 | hOCR file may contain only logical markup, only physical markup, or only
 915 | engine-specific markup. As a result, merely knowing that OCR output is hOCR
 916 | compliant doesn't tell us whether that file is actually useful for subsequent
 917 | processing.
 918 | 
 919 | hOCR提供标记了信息的标准方法，但它并不强制特定类型的信息的存在或不存在。例如，一个hOCR文件可能仅包含逻辑标记，只有物理标记，或者仅引擎特定标记。其结果是，仅仅知道OCR输出是hOCR标准并没有告诉我们，文件对于后续处理是否有用。
 920 | 
 921 | OCR systems can use hOCR in various different ways internally, but we will
 922 | eventually define some common profiles that mandate what kinds of information
 923 | needs to be present in particular kinds of output.
 924 | 
 925 | OCR系统可以在内部以各种不同的方式使用hOCR，但我们最终会定义一些通用的配置，规定在特定类型的输出中需要存在哪些类型的信息。
 926 | 
 927 | Of particular importance are:
 928 | 
 929 | 特别重要的是： 
 930 | 
 931 | * physical layout profile: OCR output in XHTML format with a defined set of
 932 |   common physical layout markup capabilities (page, carea, floats, line).
 933 |   Logical layout may be present as well, but the document tree structure must
 934 |   represent the physical layout structure, with logical layout elements split
 935 |   and grouped as needed.
 936 | 
 937 | * logical layout profile: OCR output in XHTML format with a defined set of
 938 |   common logical layout markup capabilities (linear, chapter, section,
 939 |   subsection).  Physical layout may be present as well, but the document tree
 940 |   structure must represent the logical layout structure, with logical layout
 941 |   elements split and grouped as needed.
 942 | 
 943 | 
 944 | * 物理布局配置：OCR输出XHTML格式定义的一组常见的物理布局标记功能（page，carea，floats，line）。逻辑布局也可以存在，但是文档树结构必须代表的物理布局结构，根据需要分离和组合逻辑布局元素。
 945 | 
 946 | * 逻辑布局配置：OCR输出XHTML格式定义的一组常见的逻辑布局标记功能（线性，章，段，分段）（linear, chapter, section, subsection）。物理布局也可能存在，但文档树结构必须代表逻辑布局结构，根据需要分离和组合逻辑布局元素。
 947 | 
 948 | 
 949 | 
 950 | Other possible profiles might be defined for specific engines or specific document classes:
 951 | 
 952 | 针对特定引擎或特定的文档类型，可能定义如下配置：
 953 | 
 954 | * common commercial OCR output (e.g., Abbyy)
 955 |   * ocr_page
 956 |   * ocrx_block, ocrx_line, ocrx_word
 957 |   * ocrp_lang
 958 |   * ocrp_font
 959 | * book target
 960 |   * all logical structuring elements (as applicable), except ocr_linear
 961 |   * ocr_page
 962 | * newspaper target
 963 |   * all logical structuring elements (as applicable)
 964 |   * articles map on ocr_linear
 965 |   * ocr_page
 966 | 
 967 | 
 968 | * 普通商业OCR输出（例如，Abbyy）
 969 |   * ocr_page
 970 |   * ocrx_block，ocrx_line，ocrx_word
 971 |   * ocrp_lang
 972 |   * ocrp_font
 973 | * 书 target
 974 |   * 所有的逻辑结构元素（as applicable），除了ocr_linear
 975 |   * ocr_page
 976 | * 报纸target
 977 |   * 所有的逻辑结构元素（as applicable）
 978 |   * 文章映射到ocr_linear
 979 |   * ocr_page
 980 | 
 981 | 
 982 | ## 14 必需的元信息 Required Meta Information
 983 | 
 984 | The OCR system is required to indicate the following using meta tags in the header:
 985 | 
 986 | 要求OCR系统必须在报头中使用元标签指示以下：
 987 | 
 988 | * `<meta name="ocr-system" content="name version"/>`
 989 | * `<meta name="ocr-capabilities" content="capabilities"/>`
 990 |   * see the capabilities defined above
 991 | 
 992 | The OCR system should indicate the following information
 993 | 
 994 | 该OCR系统应注明以下信息
 995 | 
 996 | * `<meta name="ocr-number-of-pages" content="number-of-pages"/>`
 997 | * `<meta name="ocr-langs" content=[languages-considered-by-ocr]/>`
 998 |   * use [ISO 639-1](https://www.loc.gov/standards/iso639-2/php/code_list.php) codes
 999 |   * value may be `unknown`
1000 | * `<meta name="ocr-scripts" content=[scripts-considered-by-ocr]/>`
1001 |   * use [ISO 15924](http://www.unicode.org/iso15924/codelists.html) letter codes
1002 |   * value may be `unknown`
1003 | 
1004 | 
1005 | 
1006 | ## 15 HTML Markup
1007 | 
1008 | The HTML-based markup is orthogonal to the hOCR-based markup; that is, both can
1009 | be chosen independent of one another. The only thing that needs to be
1010 | consistent between the two markups is the text contained within the tags. hOCR
1011 | and other embedded format tags can be put on HTML tags, or they can be put on
1012 | their own `<DIV>`/`<SPAN>` tags.
1013 | 
1014 | 基于HTML的标记与基于hOCR的标记是正交的;也就是说，两者可以彼此独立地选择。两个标记唯一需要保持一致的事情就是包含在标签中的文本。hOCR和其他嵌入式格式的标签可以放在HTML标签上，也可以放在它们自己的 `<DIV>`/`<SPAN>` 标签上。
1015 | 
1016 | There are many different choices possible and reasonable for the HTML markup,
1017 | depending on the use and further processing of the document. Each such choice
1018 | must be indicated in the meta data for the document.
1019 | 
1020 | HTML标记有许多不同的可能和可信的选择，这取决于文件的使用和进一步的处理。每个这样的选择必须在用于文件的元数据来指示。
1021 | 
1022 | Many mappings derived from existing tools are quite similar, and most follow
1023 | the restrictions and recommendations below already without further
1024 | modifications.
1025 | 
1026 | 从现有工具衍生出的许多映射都十分相似，其中大多数无需进一步修改就已经符合下面的限制和建议。
1027 | 
1028 | Depending on the particular HTML markup used in the document, the document is
1029 | suitable for different kinds of processing and use. The formats have the
1030 | following intents:
1031 | 
1032 | 根据文档中使用的特定的HTML标记，该文件是适用于不同种类的加工和使用。格式具有以下意思：
1033 | 
1034 | * `html_none`: straightforward equivalent of Goodoc or XDOC
1035 | * `html_ocr`: straightforward recording of commercial OCR system output
1036 | * `html_absolute`: target format for services like Google's View as HTML
1037 | * `html_xytable`: target format for layout-preserving on-screen document viewing
1038 | * `html_simple`: target format for convenient on-line viewing and intermediate format for indexing
1039 | 
1040 | * html_none：相当于等价Goodoc 或者XDOC
1041 | * html_ocr：商业OCR系统输出的简单记录
1042 | * html_absolute：像谷歌的查看HTML的服务目标格式
1043 | * html_xytable：对目标格式布局保持屏幕上的文档查看
1044 | * html_simple：便捷的在线观看和中间格式进行索引目标格式
1045 | 
1046 | 
1047 | As long as a format contains the hOCR information, it can be reprocessed by
1048 | layout analysis software and converted into one of the other formats. In
1049 | particular, we envision layout analysis tools for converting any hOCR document
1050 | into `html_absolute`, `html_xytable`, and `html_simple`. Furthermore,
1051 | internally, a layout analysis system might use `html_xytable` as an
1052 | intermediate format for converting hOCR into `html_simple`.
1053 | 
1054 | 
1055 | 只要一个格式包含hOCR信息，它可以通过布局分析软件进行再加工并转换成其他格式之一。特别是，我们设想布局分析工具将任何hOCR文档转换成html_absolute，html_xytable和html_simple。此外，在内部，将hOCR转换成html_simple布局分析系统可能使用html_xytable作为中间格式。
1056 | 
1057 | ### 15.1 HTML内容的限制 Restrictions on HTML Content
1058 | 
1059 | To avoid problems, any use of HTML markup must follow the following rules:
1060 | 
1061 | 为了避免出现问题，任何使用HTML标记必须遵循以下规则：
1062 | 
1063 | * HTML content must not use class names that conflict with any of those defined in this document (`ocr_*`)
1064 | * HTML content must not use the title= attribute on any element with an ocr_* class for any purposes other than encoding OCR-related properties as described in this document
1065 | 
1066 | * HTML内容不得使用与文档中定义的（“ocr_ *”）起冲突的类名
1067 | * HTML内容不得在有ocr_*类别的元素上使用title=attribute,除了文档描述的OCR相关属性编码。
1068 | 
1069 | 
1070 | ### 15.2 映射的建议 Recommendations for Mappings
1071 | 
1072 | When possible, any mapping of logical structure onto HTML should try to follow the following rules:
1073 | 
1074 | 如果可能的话，逻辑结构到HTML任何映射应尽量遵循以下规则：
1075 | 
1076 | * the mapping should be "natural" -- similar to what an author of the document
1077 |   might have entered into a WYSIWYG content creation tool
1078 | * text should be in reading order
1079 | * all tags should be used for the intended purpose (and only for the intended
1080 |   purpose) as defined in the [HTML 4 spec](https://www.w3.org/TR/html4/).
1081 | * floats are contained in `<DIV>` elements with a `style` that includes a float attribute
1082 | * repeating floating page elements (header/footer) should be repeated and occur
1083 |   in their natural location in reading order (e.g., between pages)
1084 | * embedded images and SVG should be contained in files in the same directory
1085 |   (no `/` in the URL) and embedded with `<IMG>` and `<EMBED>` tags, respectively
1086 | 
1087 | * 映射应该是“自然的”-类似于文档的作者可能已经进入了一个所见即所得的内容创作工具时可能得到的
1088 | * 文字应该是按读取顺序
1089 | * 所有标签应该用于期望的目的（以及仅用于期望的目的），如同HTML4规范中定义
1090 | * 浮动属性包含在包括一个浮标属性的样式的<DIV>元素中
1091 | * 重复浮动页面元素（页眉/页脚）应现以阅读顺序在它们的自然位置（例如，页面之间）重复和出现
1092 | * 嵌入图像和SVG应包含在同一个目录下的文件（在URL中没有“/”）和分别嵌入<IMG>和<EMBED>标记。
1093 | 
1094 | 
1095 | Specifically
1096 | 
1097 | 特别地：  
1098 | 
1099 | 
1100 | * `<EM>` and `<STRONG>` should represent emphasis, and are preferred to `<B>`, `<I>`, and `<U>`
1101 | * `<B>`, `<I>`, and `<U>` should represent a change in the corresponding
1102 |   attribute for the current font (but an OCR font specification must still be
1103 |   given)
1104 | * `<P>` should represent paragraph breaks
1105 | * `<BR>` should represent explicit linebreaks (not linebreaks that happen because of text flow)
1106 | * `<H1>`, …, `<H6>` should represent the logical nesting structure (if any) of the document
1107 | * `<A>` should represent hyperlinks and references within the document
1108 | * `<BLOCKQUOTE>` should represent indented quotations, but not other uses of indented text.
1109 | * `<UL>`, `<OL>`, `<DL>` should represent lists
1110 | * `<TABLE>` should represent tables, including correct use of the `<TH>` tag
1111 | 
1112 | * `<EM>`和`<STRONG>`应该代表强调，并且应优先于`<B>`，`<I>`和`<U>`使用
1113 | * `<B>`，`<I>`和`<U>`应代表在当前字体的相应属性的变化（但仍必须被给予一个OCR字体规范）
1114 | * `<P>`应该代表分段符
1115 | * `<BR>`应该代表明确的换行（而不是因文本流动而产生的换行）
1116 | * `<H1>`，...，`<H6>`应该代表文档的逻辑嵌套结构（如果有的话）
1117 | * `<A>`应该表示文档中的超链接和引用
1118 | * `<BLOCKQUOTE>`应该代表缩进引用，但不应用于缩进文本的其他用途。
1119 | * `<UL>`，`<OL>`，`<DL>`应代表名单lists
1120 | * `<TABLE>`应该代表表，包括正确使用<TH>标记
1121 | 
1122 | 
1123 | 
1124 | If necessary, the markup may use the following non-standard tags:
1125 | 
1126 | 如果需要的话，该标记可以使用以下非标准代码：
1127 | 
1128 | * `<NOBR>` to indicate that line breaking is not permitted for the enclosed content
1129 | * `<WBR>` to indicate that line breaking is permitted at that location
1130 | 
1131 | 
1132 | * `<NOBR>`，表明封闭的内容不允许断行
1133 | * `<WBR>`，以表明在该位置断行被允许
1134 | 
1135 | 
1136 | #### 15.2.1 html_none
1137 | 
1138 | The simplest HTML markup for hOCR formats contains no logical markup at all; it
1139 | is simply a collection of `<DIV>` and `<SPAN>` elements with associated hOCR
1140 | information. Note that such documents can still be rendered visually through
1141 | the use of CSS.
1142 | 
1143 | 对于hOCR格式最简单的HTML标记根本不包含逻辑标记;它只是具有相关hOCR信息的<DIV>和<SPAN>元素的集合。请注意，这些文件仍然可以直观地通过使用CSS渲染。
1144 | 
1145 | #### 15.2.2 html_simple
1146 | 
1147 | This is a format that follows the restrictions and recommendations above, and only uses the following tags:
1148 | 
1149 | 这是遵循上面的限制和建议的格式，和仅使用以下标记：
1150 | 
1151 | * `<H1>` … `<H6>`
1152 | * `<P>`, `<BR>`
1153 | * `<B>`, `<I>`, and `<U>` for appearance changes (bold, italic, underline)
1154 | * `<FONT>` for any other appearance changes
1155 | * `<A>`
1156 | * `<DIV>` with a float style for floats
1157 | * `<TABLE>` for tables
1158 | * `<IMG>` for images
1159 | * all SVG must be externally embedded with the `<EMBED>` tag
1160 | * the use of other embedded formats is permitted
1161 | * all other uses of `<DIV>`, `<SPAN>`, `<INS>`, and `<DEL>` only for hOCR tags or other embedded formats (hCard, …)
1162 | 
1163 | * `<H1>` ... `<H6>`
1164 | * `<P>`，`<BR>`
1165 | * `<B>`，`<I>`和`<U>`的外观变化（粗体，斜体，下划线）
1166 | * `<FONT>`用于任何其它外观变化
1167 | * `<A>`
1168 | * `<DIV>`浮动风格
1169 | * `<TABLE>`对表
1170 | * `<IMG>`的图像
1171 | * 所有SVG必须外部嵌入的<EMBED>标签
1172 | * 允许使用其他格式的嵌入式
1173 | * 仅供hOCR标签或者嵌套格式（使用hCard，...）的所有`<DIV>`，`<SPAN>`，`<INS>`和`<DEL>`的其他用途
1174 | 
1175 | 
1176 | 
1177 | #### 15.2.3 html_ocr_<engine>
1178 | 
1179 | The HTML markup produced by default by the OCR engine for the given document.
1180 | Examples of possible values are:
1181 | 
1182 | 对于给定的文档，由默认OCR引擎产生的HTML标记。可能的值的实例是：
1183 | 
1184 | * `html_ocr_finereader_8`
1185 | * `html_ocr_textbridge_11`
1186 | * `html_ocr_unknown` – the HTML was generated by some OCR engine, but it's unknown which one
1187 | 
1188 | 
1189 | #### 15.2.4 html_absolute_<element>
1190 | 
1191 | The HTML represents absolute positioning of elements on each page. The possible subformats are:
1192 | 
1193 | 在HTML表示每个页面上的元素的绝对定位。可能的子格式如下：
1194 | 
1195 | * `html_absolute_cols` – absolute positioning of cols
1196 | * `html_absolute_pars` – absolute positioning of paragraphs
1197 | * `html_absolute_lines` – absolute positioning of lines
1198 | * `html_absolute_words` – absolute positioning of words
1199 | * `html_absolute_chars` – absolute positioning of characters
1200 | 
1201 | The ["View as HTML" for PDF
1202 | files](https://googlewebmastercentral.blogspot.de/2011/09/pdfs-in-google-search-results.html)
1203 | feature of Google Search uses `html_absolute_lines`; this is probably the most
1204 | reasonable choice for approximating the appearance of the original document.
1205 | 
1206 | * html_absolute_cols - cols的绝对定位
1207 | * html_absolute_pars -段落绝对定位
1208 | * html_absolute_lines -线条绝对定位、html_absolute_words -词语的绝对定位
1209 | * html_absolute_chars -字符绝对定位
1210 | 谷歌搜索的"View as HTML" for PDF files特征使用html_absolute_lines;这可能是用于逼近原始文档的外观最合理的选择
1211 | 
1212 | 
1213 | #### 15.2.5 html_xytable_absolute
1214 | 
1215 | The HTML is a table that gives the XY-cut layout segmentation structure of the
1216 | page in tabular form. Note that in this format, text order does not necessarily
1217 | correspond to reading order.
1218 | 
1219 | HTML是一个表，以表格的形式给出了页面的XY-cut布局分割结构。请注意，在这种格式中，文本顺序并不一定对应于读取顺序。
1220 | 
1221 | The format must contain one `<TABLE>` of class ocr_xycut representing each page.
1222 | The `<TABLE>` structure must represent the absolute size of the original page
1223 | element. The markup of the content of the table itself is as in html_simple.
1224 | 
1225 | 格式必须包含类的一个含有类ocr_xycut的<table>代表每个页面。表结构必须代表原始网页元素的绝对尺寸。表本身的内容的标记类似html_simple。
1226 | 
1227 | #### 15.2.6 html_xytable_relative
1228 | 
1229 | The page representation is as in
1230 | [`html_xytable_absolute`](#1525-html_xytable_absolute), but table element sizes
1231 | are expressed relative (percentages).
1232 | 
1233 | 该页面表示类似 [`html_xytable_absolute`](#1525-html_xytable_absolute)，但表中的单元尺寸表现为相对百分比。
1234 | 
1235 | #### 15.2.7 html_<processor>
1236 | 
1237 | The HTML represents markup that follows the mappings of the given document
1238 | processor to HTML. Note that the document doesn't actually need to have been
1239 | constructed in the processor and that the processor doesn't need to have been
1240 | used to generate the HTML. For example, the `html_latex2html` tag merely
1241 | indicates that, say, a scanned and ocr'ed article uses the same conventions for
1242 | logical markup tags that an equivalent article actually written in LaTeX and
1243 | actually converted to HTML would have used. Possible subformats are:
1244 | 
1245 | 该HTML标记代表下面的给定的文档处理HTML的映射。注意，该文件实际上并不需要由处理器构造和该处理器不需要用于生成HTML。例如，html_latex2html标签仅仅表明，比方说，一个扫描件和ocr'ed文章对于逻辑标记标签使用相同的约定，等效文章实际写入LaTeX文章和实际上转换为HTML需使用的逻辑标记。可能的子格式是：
1246 | 
1247 | 
1248 | * `html_latex2html`
1249 | * `html_msword` – HTML mapping generated by “Save As HTML”
1250 | * `html_ooffice` – HTML mapping generated by “Save As HTML”
1251 | * `html_docbook_xsl` – HTML mapping generated by official XSL style sheets
1252 | 
1253 | 
1254 | ## 16 文件元信息 Document Meta Information
1255 | 
1256 | For document meta information, use the [Dublin Core Embedding into
1257 | HTML](http://dublincore.org/documents/dcq-html/). See also [Citation Guidelines
1258 | for Dublin Core](http://dublincore.org/documents/dc-citation-guidelines/).
1259 | 
1260 | 对于文档元数据信息，使用 [Dublin Core Embedding into
1261 | HTML](http://dublincore.org/documents/dcq-html/)。另请参见[Citation Guidelines
1262 | for Dublin Core](http://dublincore.org/documents/dc-citation-guidelines/).
1263 | 
1264 | ## 17 用法示例 Sample Usage
1265 | 
1266 | See also the [hocr-tools](https://github.com/tmbdev/hocr-tools) for more samples.
1267 | 
1268 | 更多例子请参考 [hocr-tools](https://github.com/tmbdev/hocr-tools).
1269 | 
1270 | The HTML format described here may seem fairly complicated and difficult to
1271 | parse, but because there are lots of tools for manipulating HTML documents,
1272 | they're actually pretty easy to manipulate. Here are some examples:
1273 | 
1274 | 这里描述的HTML格式可能看起来相当复杂，难以解析，但因为有很多用于处理HTML文档的工具，它们实际上是很容易被操纵。 这里有些例子：
1275 | 
1276 | ```python
1277 | import libxml2,re,os,string
1278 | 
1279 | # convert the HTML to XHTML (if necessary)
1280 | os.system("tidy -q -asxhtml < page.html > page.xhtml 2> /dev/null")
1281 | 
1282 | # parse the XML
1283 | doc = libxml2.parseFile('page.xhtml')
1284 | 
1285 | # search all nodes having a class of ocr_line
1286 | lines = doc.xpathEval("//*[@class='ocr_line']")
1287 | 
1288 | # a function for extracting the text from a node
1289 | def get_text(node):
1290 |     textnodes = node.xpathEval(".//text()")
1291 |     s = string.join([node.getContent() for node in textnodes])
1292 |     return re.sub(r'\s+',' ',s)
1293 | 
1294 | # a function for extracting the bbox property from a node
1295 | # note that the title= attribute on a node with an ocr_ class must
1296 | # conform with the OCR spec
1297 | 
1298 | def get_bbox(node):
1299 |     data = node.prop('title')
1300 |     bboxre = re.compile(r'\bbbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)')
1301 |     return [int(x) for x in bboxre.search(data).groups()]
1302 | 
1303 | # this extracts all the bounding boxes and the text they contain
1304 | # it doesn't matter what other markup the line node may contain
1305 | for line in lines:
1306 |     print get_bbox(line),get_text(line)
1307 | ```
1308 | 
1309 | Note that the OCR markup, basic HTML markup, and semantic markup can co-exist
1310 | within the same HTML file without interfering with one another.
1311 | 
1312 | 注意，OCR标记，基本的HTML标记，语义标记可以在相同的HTML文件中共存，而不会相互干扰。
1313 | 


--------------------------------------------------------------------------------
/1.2/defs.yml:
--------------------------------------------------------------------------------
  1 | element:
  2 |   ocr_abstract:
  3 |     categories: ['Logical']
  4 |   ocr_author:
  5 |     categories: ['Logical']
  6 |   ocr_blockquote:
  7 |     categories: ['Logical']
  8 |     recommended_tags: ['blockquote']
  9 |   ocr_caption:
 10 |     categories: ['Logical']
 11 |   ocr_carea:
 12 |     categories: ['Typesetting']
 13 |     properties:
 14 |       required: ['bbox']
 15 |   ocr_chapter:
 16 |     categories: ['Logical']
 17 |     recommended_tags: ['h1']
 18 |   ocr_chem:
 19 |     categories: ['Float']
 20 |     properties:
 21 |       required: ['bbox']
 22 |   ocr_cinfo:
 23 |     categories: ['Inline']
 24 |     properties:
 25 |       recommended: ['x_confs', 'x_bboxes', 'cuts']
 26 |       allowed: ['nlp']
 27 |   ocr_column:
 28 |     categories: ['Typesetting']
 29 |     deprecated: true
 30 |   ocr_display:
 31 |     categories: ['Float']
 32 |     properties:
 33 |       required: ['bbox']
 34 |   ocr_document:
 35 |     categories: ['Logical']
 36 |     recommended_tags: ['div']
 37 |   ocr_dropcap:
 38 |     categories: ['Inline']
 39 |   ocr_float:
 40 |     categories: ['Float']
 41 |     properties:
 42 |       required: ['bbox']
 43 |   ocr_footer:
 44 |     categories: ['Float']
 45 |     properties:
 46 |       required: ['bbox']
 47 |   ocr_glyph:
 48 |     categories: ['Inline']
 49 |   ocr_glyphs:
 50 |     categories: ['Inline']
 51 |   ocr_header:
 52 |     categories: ['Float']
 53 |     properties:
 54 |       required: ['bbox']
 55 |   ocr_image:
 56 |     categories: ['Float']
 57 |     properties:
 58 |       required: ['bbox']
 59 |   ocr_line:
 60 |     categories: ['Typesetting']
 61 |     properties:
 62 |       required: ['bbox']
 63 |       allowed: ['baseline', 'hardbreak', 'x_font', 'x_fsize', 'x_bboxes']
 64 |   ocr_linear:
 65 |     categories: ['Typesetting']
 66 |   ocr_linedrawing:
 67 |     categories: ['Float']
 68 |     properties:
 69 |       required: ['bbox']
 70 |   ocr_math:
 71 |     categories: ['Float']
 72 |     properties:
 73 |       required: ['bbox']
 74 |   ocr_noise:
 75 |     categories: ['Inline']
 76 |   ocr_page:
 77 |     categories: ['Typesetting']
 78 |     properties:
 79 |       required: ['bbox']
 80 |       recommended: ['image', 'imagemd5', 'ppageno', 'lpageno']
 81 |       allowed: ['x_source', 'x_scanner', 'scan_res']
 82 |   ocr_pageno:
 83 |     categories: ['Float']
 84 |     properties:
 85 |       required: ['bbox']
 86 |   ocr_par:
 87 |     categories: ['Logical']
 88 |     recommended_tags: ['p']
 89 |   ocr_part:
 90 |     categories: ['Logical']
 91 |     recommended_tags: ['h1']
 92 |   ocr_photo:
 93 |     categories: ['Float']
 94 |     properties:
 95 |       required: ['bbox']
 96 |   ocr_section:
 97 |     categories: ['Logical']
 98 |     recommended_tags: ['h2']
 99 |   ocr_separator:
100 |     categories: ['Typesetting', 'Float']
101 |     properties:
102 |       required: ['bbox']
103 |   ocr_subsection:
104 |     categories: ['Logical']
105 |     recommended_tags: ['h3']
106 |   ocr_subsubsection:
107 |     categories: ['Logical']
108 |     recommended_tags: ['h4']
109 |   ocr_table:
110 |     categories: ['Float']
111 |     properties:
112 |       required: ['bbox']
113 |   ocr_textfloat:
114 |     categories: ["Float"]
115 |     properties:
116 |       required: ['bbox']
117 |   ocr_textimage:
118 |     categories: ['Float']
119 |     properties:
120 |       required: ['bbox']
121 |   ocr_title:
122 |     categories: ['Logical']
123 |     recommended_tags: ['h1']
124 |   ocr_xycut:
125 |     categories: ['Inline']
126 |   ocrx_block:
127 |     categories: ['Inline', 'Engine-Specific']
128 |   ocrx_line:
129 |     categories: ['Inline', 'Engine-Specific']
130 |   ocrx_word:
131 |     categories: ['Inline', 'Engine-Specific']
132 | 
133 | 
134 | property:
135 |   baseline:
136 |     categories: ['Inline']
137 |     example: 'baseline 0.015 -18'
138 |     grammar: |
139 |       <a>property-value</a> = <a>float</a> <a>int</a>
140 |   bbox:
141 |     categories: ['General', 'Layout']
142 |     example: 'bbox 0 0 100 200'
143 |     grammar: |
144 |       <a>property-value</a> = <a>uint</a> <a>uint</a> <a>uint</a> <a>uint</a>
145 |   cflow:
146 |     categories: ['Content Flow']
147 |     example: 'cflow "article1"'
148 |     grammar: |
149 |       <a>property-value</a> = <a>delimited-string</a>
150 |   cuts:
151 |     categories: ['Layout', 'Character']
152 |     related: ['nlp', 'x_bboxes']
153 |     example: 'cuts 9 11 7,8,-2 15 3'
154 |     implied: ['bbox']
155 |     grammar: |
156 |       <a>property-value</a> = +(<a>uint</a> *1(<a>comma</a> <a>uint</a> *1(<a>comma</a> <a>nint</a>)))
157 |   hardbreak:
158 |     categories: ['Inline']
159 |     default: '0'
160 |     grammar: |
161 |       <a>property-value</a> = "0" / "1"
162 |   image:
163 |     categories: ['Page']
164 |     related: ['imagemd5', 'x_source']
165 |     example: 'image "/foo/bar.png"'
166 |     grammar: |
167 |       <a>property-value</a> = <a>delimited-string</a>
168 |   imagemd5:
169 |     categories: ['Page']
170 |     implied: ['image']
171 |     grammar: |
172 |       <a>property-value</a> = <a>doublequote</a> 32(%x41-46 / <a>digit</a>) <a>doublequote</a>
173 |   lpageno:
174 |     categories: ['Page']
175 |     related: ['ppageno']
176 |     example: 'lpageno "IV."'
177 |     grammar: |
178 |       <a>property-value</a> = <a>delimited-string</a> / <a>uint</a>
179 |   ppageno:
180 |     categories: ['Page']
181 |     related: ['lpageno']
182 |     example: 'ppageno 7'
183 |     grammar: |
184 |       <a>property-value</a> = <a>uint</a>
185 |   nlp:
186 |     categories: ['Confidence', 'Character']
187 |     related: ['cuts', 'x_confs']
188 |     implied: ['cuts']
189 |     grammar: |
190 |       <a>property-value</a> = +<a>float</a>
191 |   order:
192 |     categories: ['Content Flow']
193 |     example: 'order 8'
194 |     grammar: |
195 |       <a>property-value</a> = +<a>uint</a>
196 |   poly:
197 |     categories: ['Layout', 'Non-recommended']
198 |     example: 'poly 0 0 0 10 10 10 10 20 0 20'
199 |     grammar: |
200 |       <a>property-value</a> = 2<a>uint</a> 2<a>int</a> *(2<a>int</a>)
201 |   scan_res:
202 |     categories: ['Page']
203 |     related: ['x_scanner']
204 |     example: 'scan_res 300 300'
205 |     grammar: |
206 |       <a>property-value</a> = 2(<a>uint</a>)
207 |   textangle:
208 |     categories: ['Layout']
209 |     example: 'textangle 7.32'
210 |     grammar: |
211 |       <a>property-value</a> = <a>float</a>
212 |   x_bboxes:
213 |     categories: ['Inline', 'Character']
214 |     related: ['cuts']
215 |     example: |
216 |       x_bboxes b1x0 b1y0 b1x1 b1y1 b2x0 b2y0 b2x1 b2y1 ...
217 |       x_bboxes 0 0 10 10 0 10 20 20
218 |     grammar: |
219 |       <a>property-value</a> = 1*(4<a>uint</a>)
220 |   x_confs:
221 |     categories: ['Confidence', 'Character']
222 |     example: |
223 |       x_confs 37.3 51.23 1 100
224 |     grammar: |
225 |       <a>property-value</a> = +<a>float</a>
226 |   x_font:
227 |     categories: ['Font']
228 |     related: ['x_fsize']
229 |     example: 'x_font "Comic Sans MS"'
230 |     grammar: |
231 |       <a>property-value</a> = <a>delimited-string</a>
232 |   x_fsize:
233 |     categories: ['Font']
234 |     related: ['x_font']
235 |     example: 'x_fsize 12'
236 |     grammar: |
237 |       <a>property-value</a> = <a>uint</a>
238 |   x_scanner:
239 |     categories: ['Page']
240 |     related: ['scan_res']
241 |     example: 'x_scanner "Canon Lide 220"'
242 |     grammar: |
243 |       <a>property-value</a> = <a>delimited-string</a>
244 |   x_source:
245 |     categories: ['Page']
246 |     related: ['image']
247 |     example: |
248 |       x_source "/gfs/cc/clean/012345678911" "17"
249 |       x_source "http://pageserver/012345678911&page=17"
250 |     grammar: |
251 |       <a>property-value</a> = 1*<a>delimited-string</a>
252 |   x_wconf:
253 |     categories: ['Confidence', 'Inline']
254 |     example: |
255 |       x_wconf 97.23
256 |     grammar: |
257 |       <a>property-value</a> = <a>float</a>
258 | 
259 | metadata:
260 |   ocr-system:
261 |     value: ...
262 |     required: true
263 |   ocr-capabilities:
264 |   ocr-number-of-pages:
265 |   ocr-langs:
266 |   ocr-scripts:
267 | 


--------------------------------------------------------------------------------
/1.2/spec.after.html:
--------------------------------------------------------------------------------
1 | <script>
2 | for (var span of document.querySelectorAll(".toc span:not([class])")) {
3 |     if (!span.textContent.match(/^[a-z]/)) continue;
4 |     span.style.color = 'darkred';
5 |     span.style.fontWeight = 'bold';
6 | }
7 | </script>
8 | 


--------------------------------------------------------------------------------
/1.2/spec.before.html:
--------------------------------------------------------------------------------
 1 | <pre class="metadata">
 2 | Title: hOCR - OCR Workflow and Output embedded in HTML
 3 | Shortname: hocr
 4 | Level: 1
 5 | Status: LS
 6 | Group: hocr-spec
 7 | Repository: kba/hocr-spec
 8 | URL: http://kba.github.io/hocr-spec/1.2/
 9 | Editor: Konstantin Baierer, konstantin.baierer@gmail.com, http://github.com/kba
10 | Former Editor: Thomas Breuel, http://www.9x9.com/
11 | Previous Version: https://github.com/kba/hocr-spec/blob/master/1.1/spec.md
12 | Previous Version: https://docs.google.com/document/edit?id=1QQnIQtvdAC_8n92-LhwPcjtAUFwBlzE8EWnKAxlgVf0
13 | Abstract: The purpose of this document is to define an open standard for representing document layout analysis and OCR results as a subset of HTML.
14 | Markup Shorthands: markdown on, biblio on, markup on
15 | </pre>
16 | 
17 | <pre class="link-defaults">
18 | spec:html;type:element;
19 | 	text:a
20 | 	text:script
21 | 	text:style
22 | </pre>
23 | 
24 | <pre class="anchors">
25 | spec:html401;text:font;type:element;url:https://www.w3.org/TR/html401/present/graphics.html#edef-FONT
26 | spec:html401;text:nobr;type:element;urlPrefix:https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nobr
27 | spec:html;type:element;url:https://www.w3.org/TR/html5/text-level-semantics.html#the-sub-and-sup-elements
28 | 	text:sub
29 | 	text:sup
30 | spec:html;type:element-attr;urlPrefix:https://html.spec.whatwg.org/multipage/dom.html
31 | 	text:lang;for:*;url:#attr-lang
32 | 	text:dir;for:*;url:#the-dir-attribute
33 | 	text:title;for:*;url:#the-title-attribute
34 | 	text:alt;for:*;url:#the-alt-attribute
35 | 	text:class;for:*;url:#classes
36 | spec:html;type:element-attr;urlPrefix:https://html.spec.whatwg.org/multipage/embedded-content.html
37 | 	text:alt;for:img;url:#attr-img-alt
38 | spec:html;type:element;url:https://html.spec.whatwg.org/multipage/semantics.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements;
39 | 	text:h1
40 | 	text:h2
41 | 	text:h3
42 | 	text:h4
43 | 	text:h5
44 | 	text:h6
45 | spec:webidl;type:dfn;urlPrefix:https://www.w3.org/TR/WebIDL/
46 | 	text:unsigned short;url:#idl-unsigned-short
47 | 	text:short;url:#idl-short
48 | 	text:string;url:#idl-DOMstring
49 | 	text:float;url:#idl-float
50 | 	text:octet;url:#idl-octet
51 | </pre>
52 | 
53 | 


--------------------------------------------------------------------------------
/1.2/spec.md:
--------------------------------------------------------------------------------
   1 | Introduction {#introduction}
   2 | ============
   3 | 
   4 | The purpose of this document is to define an open standard for representing
   5 | document layout analysis and OCR results as a subset of HTML. The goal is to
   6 | reuse as much existing technology as possible, and to arrive at a representation
   7 | that makes it easy to store, share, process and display OCR results.
   8 | 
   9 | This specification defines many [features](#terminology) that can represent a
  10 | variety of OCR-related information. However, being built on top of HTML, hOCR is
  11 | designed to make it easy to start simple and gradually use more complex
  12 | constructs when necessary.
  13 | 
  14 | Consider you have an HTML document that encodes a book: Wrapping page elements
  15 | in <code>&lt;div class="<a href="#elementdef-ocr_page">ocr_page</a>"&gt;</code>
  16 | tags will convey the page boundaries to hOCR-capable agents and turn the HTML
  17 | document into an hOCR document.
  18 | 
  19 | Terminology and Representation {#terminology}
  20 | ==============================
  21 | 
  22 | Reusing HTML {#representation}
  23 | ------------
  24 | 
  25 | Issue(96): Reusing HTML: Some text is missing in the first paragraph
  26 | 
  27 | This document describes a representation of various aspects of OCR output in an
  28 | XML-like format. That is, we define a set of tags containing text and other
  29 | tags, together with attributes of those tags. However, since the content we are
  30 | representing is formatted text,
  31 | we are not actually using a new XML for the representation; instead we
  32 | embed the representation in XHTML (or HTML) because  [[XHTML1]] and XHTML processing
  33 | already define many aspects of OCR output representation that would otherwise
  34 | need additional, separate and ad-hoc definitions. These aspects include:
  35 | 
  36 |   * standard representations for common logical structuring elements, including
  37 |     section headings, citations, tables, emphasis, line breaks, quotations,
  38 |     citations, and preformatted text
  39 |   * standard representations for fonts, embedded images, embedded vector
  40 |     graphics, tables, languages, writing direction, colors
  41 |   * standard representations for geometric layout and positioning
  42 |   * output files that are understood without any further modification by widely
  43 |     used viewers (browsers), editors, conversion tools, and indexing tools
  44 |   * libraries for parsing and generating the content
  45 |   * support for document metadata
  46 | 
  47 | We are embedding this information inside HTML by encoding it within valid tags
  48 | and attributes inside HTML. We are going to use the terms <a>elements</a> and
  49 | <a>properties</a> for referring to embedded markup.
  50 | 
  51 | Definitions {#definitions}
  52 | -----------
  53 | 
  54 | ### "element" ### {#definition-element}
  55 | 
  56 | An hOCR element (in this spec simply referred to as an <dfn>element</dfn>) is any HTML tag with a
  57 | <{*/class}> attribute that contains exactly one <dfn lt="Element Name">class
  58 | name</dfn> that starts with `ocr_` or `ocrx_`. Non-OCR related HTML content must
  59 | not use class names that begin with `ocr_` or `ocrx_`.
  60 | 
  61 | Note: When referring to an HTML tag with class `ocr_page`, this spec uses the
  62 | notation <{ocr_page}>
  63 | 
  64 | If an HTML tag is an <a lt="element">hOCR element</a>, then its <{*/title}>
  65 | attribute must not be used for any other purpose than to define <a
  66 | lt="property">hOCR properties</a> and adhere to the <a>properties format</a>.
  67 | 
  68 | For some elements, the spec <dfn lt="Recommended HTML Tag">recommends using
  69 | specific HTML tags</dfn>. This is entirely optional; it may not be possible or
  70 | desirable to actually choose those tags (e.g., when adding hOCR information to
  71 | an existing HTML output routine).
  72 | 
  73 | ### "property" ### {#definition-property}
  74 | 
  75 | <dfn lt="properties">hOCR Properties</dfn> are a set of key-value pairs that convey OCR-specific
  76 | information related to specific <a>elements</a>. They are serialized using a
  77 | <a lt="properties format">specific format</a> in the <{*/title}> attribute of
  78 | the <a>element</a> they refer to.
  79 | 
  80 | Note: When referring to a property `bbox`, this spec uses the notation 'bbox'.
  81 | 
  82 | The <dfn lt="Property Name">name of a property</dfn> must only consist of
  83 | lowercase letters and numbers. Property names must be either from those defined
  84 | in [[#hocr-props]] or begin with `x_` to denote implementation-specific
  85 | extensions.
  86 | 
  87 | Properties may define a <dfn>default value</dfn>. For those elements for which the
  88 | property is not <a lt="Disallowed Properties">disallowed</a> but not explicitly
  89 | specified, the property is assigned to the element with the default value.
  90 | 
  91 | ### "capability" ### {#definition-capability}
  92 | 
  93 | The presence of <a>elements</a> and <a>properties</a> must be explicitly stated
  94 | as a <dfn>capability</dfn>. The rationale is that if a hOCR producer is
  95 | *capable* of producing certain elements and properties, it should inform
  96 | hOCR consumers that they may encounter those elements/properties. If a producer
  97 | is **not capable** of producing certain elements/properties, consumers need not
  98 | look for them.
  99 | 
 100 | Note: When referring to a capability `ocrp_poly`, this spec uses the notation
 101 | ''ocr-capabilities/ocrp_poly''.
 102 | 
 103 | The mechanism for declaring capabilities is described in [[#capabilities]].
 104 | 
 105 | 
 106 | Relationship between elements, properties {#relations}
 107 | -----------------------------------------
 108 | 
 109 | ### element - property ### {#rel-elem-prop}
 110 | 
 111 | There are four levels of association between any <a>element</a> and any
 112 | <a>property</a>:
 113 | 
 114 |   : <dfn>Disallowed Property</dfn>
 115 |   :: The element MUST NOT contain the property
 116 |   :: Unless defined otherwise, all properties are disallowed for any element.
 117 |   : <dfn>Required Property</dfn>
 118 |   :: The element MUST contain the property
 119 |   : <dfn>Recommended Property</dfn>
 120 |   :: The element SHOULD contain the property
 121 |   : <dfn>Allowed Property</dfn>
 122 |   :: The element MAY contain the property
 123 | 
 124 | ### property - property ### {#rel-prop-prop}
 125 | 
 126 | A property present on an element can have one of the following relations to any
 127 | other property:
 128 | 
 129 |   : <dfn>Independent Property</dfn>
 130 |   :: The presence of property A has no influence on the presence of property B
 131 |   :: Unless otherwise defined, properties are always independent
 132 |   : <dfn>Implied Property</dfn>
 133 |   :: If property A is present, property B must also be present
 134 |   : <dfn>Conflicting Property</dfn>
 135 |   :: If property A is present, property B must not be present
 136 |   : <dfn>Related Property</dfn>
 137 |   :: Property B is related to property A
 138 | 
 139 | Properties Grammar {#grammar}
 140 | ------------------
 141 | 
 142 | The <dfn lt="Properties Format">properties format</dfn> for the properties is as
 143 | follows, expressed in ABNF notation of [[RFC5234]]:
 144 | 
 145 | <pre data-dfn-type="grammar" data-link-type="grammar">
 146 |   <dfn>digit</dfn>            = %x30-39
 147 |   <dfn>uint</dfn>             = +<a>digit</a>
 148 |   <dfn>int</dfn>              = *1"-" <a>uint</a>
 149 |   <dfn>nint</dfn>             = "-" <a>uint</a>
 150 |   <dfn>fraction</dfn>         = "." <a>uint</a>
 151 |   <dfn>float</dfn>            = *<a>uint</a> <a>fraction</a>
 152 | 
 153 |   <dfn>whitespace</dfn>       = +%x20 ; one or more spaces ' '
 154 |   <dfn>comma</dfn>            = %x2C  ; comma ','
 155 |   <dfn>semicolon</dfn>        = %x3B  ; semicolon ';'
 156 |   <dfn>doublequote</dfn>      = %x22  ; double quote '"'
 157 |   <dfn>lowercase-letter</dfn> = %x61-7A ; 'a'-'z'
 158 |   <dfn>alnum-word</dfn>       = +(<a>lowercase-letter</a> / <a>digit</a>)
 159 |   <dfn>ascii-word</dfn>       = +(%x21-7E - <a>semicolon</a>)    ; printable w/o space/semicolon
 160 |   <dfn>ascii-string</dfn>     = +(%x20-7E - <a>doublequote</a>)  ; printable ascii without doublequote
 161 |   <dfn>delimited-string</dfn> = <a>doublequote</a> <a>ascii-string</a> <a>doublequote</a>
 162 | 
 163 |   <dfn>properties-format</dfn> = <a>key-value-pair</a> *(*<a>whitespace</a> <a>semicolon</a> *<a>whitespace</a> <a>key-value-pair</a>)
 164 |   <dfn>spec-property-name</dfn> = ("<a href="#propdef-bbox">bbox</a>" / "<a href="#propdef-baseline">baseline</a>" / "<a href="#propdef-cflow">cflow</a>" / "<a href="#propdef-cuts">cuts</a>" / "<a href="#propdef-hardbreak">hardbreak</a>" /
 165 |                         "<a href="#propdef-image">image</a>" / "<a href="#propdef-imagemd5">imagemd5</a>" / "<a href="#propdef-lpageno">lpageno</a>" / "<a href="#propdef-nlp">nlp</a>" / "<a href="#propdef-order">order</a>" /
 166 |                         "<a href="#propdef-poly">poly</a>" / "<a href="#propdef-ppageno">ppageno</a>" / "<a href="#propdef-scan_res">scan_res</a>" / "<a href="#propdef-textangle">textangle</a>" /
 167 |                         "<a href="#propdef-x_bboxes">x_bboxes</a>" / "<a href="#propdef-x_confs">x_confs</a>" / "<a href="#propdef-x_font">x_font</a>" / "<a href="#propdef-x_fsize">x_fsize</a>" /
 168 |                         "<a href="#propdef-x_scanner">x_scanner</a>" / "<a href="#propdef-x_source">x_source</a>" / "<a href="#propdef-x_wconf">x_wconf</a>" )
 169 |   <dfn>engine-property-name</dfn> = "x_" <a>alnum-word</a>
 170 |   <dfn>key-value-pair</dfn> = <a>property-name</a> <a>whitespace</a> <a>property-value</a>
 171 |   <dfn>property-name</dfn> = <a>spec-property-name</a> / <a>engine-property-name</a>
 172 |   <dfn>property-value</dfn> = (<a>ascii-word</a> / <a>delimited-string</a>) *(<a>whitespace</a> (<a>ascii-word</a> / <a>delimited-string</a>) )
 173 | </pre>
 174 | 
 175 | This is just the general grammar, the individual <a>properties</a> will define
 176 | the exact <dfn>property grammar</dfn> that overrides <a
 177 | grammar>property-name</a> and <a grammar>property-value</a>.
 178 | 
 179 | <div class="example">
 180 | ```html
 181 | <div class="ocr_page" id="page_1">
 182 |   <div class="ocr_carea" id="column_2" title="bbox 313 324 733 1922">
 183 |     <div class="ocr_par" id="par_7"> ... </div>
 184 |     <div class="ocr_par" id="par_19"> ... </div>
 185 |   </div>
 186 | </div>
 187 | ```
 188 | </div>
 189 | 
 190 | The elements of hOCR {#hocr-elements}
 191 | ====================
 192 | 
 193 | The <a>elements</a> in hOCR can be broadly <dfn lt="Element Categories">categorized</dfn> as follows:
 194 | 
 195 |   : <dfn>Typesetting Elements</dfn>
 196 |   :: Elements that describe those areas of a page that nest but don't generally
 197 |     overlap
 198 |   :: See [[#sec-typesetting-elements]]
 199 | 
 200 |   : <dfn>Float Elements</dfn>
 201 |   :: Elements that describe those areas of a page that are not part of the flow
 202 |     but are positioned independently of it
 203 |   :: See [[#sec-float-elements]]
 204 | 
 205 |   : <dfn>Logical Elements</dfn>
 206 |   :: These elements describe a page and its components in traditional
 207 |     typesetting.
 208 |   :: See [[#sec-logical-elements]]
 209 | 
 210 |   : <dfn>Inline elements</dfn>
 211 |   :: These elements describe content beyond the level of text lines
 212 |   :: See [[#sec-inline-elements]]
 213 | 
 214 |   : <dfn>Engine-Specific elements</dfn>
 215 |   :: Elements whose semantics are engine-specific
 216 |   :: See [[#sec-engine-elements]]
 217 | 
 218 | 
 219 | Typesetting Elements {#sec-typesetting-elements}
 220 | --------------------
 221 | 
 222 | The following typesetting related elements are based on a typesetting model as
 223 | found in most typesetting systems, including
 224 | [XSL:FO](https://www.w3.org/TR/xsl11/#fo-section),
 225 | [(La)TeX](https://latex-project.org/guides/usrguide.pdf),
 226 | [LibreOffice](https://wiki.documentfoundation.org/images/e/e6/WG42-WriterGuideLO.pdf),
 227 | and Microsoft Word.
 228 | 
 229 | In those systems, each page is divided into a number of areas. Each area can
 230 | either be a part of the body text (or multiple body texts, in the case of
 231 | newspaper layouts). The content of the areas derives from a linear stream of
 232 | textual content, which flows into the areas, filling them linewise in their
 233 | preferred directions.
 234 | 
 235 | ### <dfn element>ocr_page</dfn> ### {#sec-ocr_page}
 236 | 
 237 | <pre class="include">path: include/defs/ocr_page</pre>
 238 | 
 239 | The <{ocr_page}> element must be present in all hOCR documents.
 240 | 
 241 | ### <dfn element>ocr_column</dfn> ### {#sec-ocr_column}
 242 | 
 243 | <pre class="include">path: include/defs/ocr_column</pre>
 244 | 
 245 | <div class="annoying-warning">
 246 | **OBSOLETE**
 247 | 
 248 | Please use <{ocr_carea}> instead
 249 | </div>
 250 | 
 251 | ### <dfn element>ocr_carea</dfn> ### {#sec-ocr_carea}
 252 | 
 253 | <pre class="include">path: include/defs/ocr_carea</pre>
 254 | 
 255 | "ocr content area" or "body area"
 256 | 
 257 | Used to be called <del>ocr_column</del>
 258 | 
 259 | The <{ocr_carea}> elements should appear in reading order unless this is impossible
 260 | because of some other structuring requirement. If the document contains multiple
 261 | <{ocr_linear}> streams, then each <{ocr_carea}> must indicate which stream it belongs
 262 | to.
 263 | 
 264 | Note that for many documents, the actual ground truth careas are well-defined
 265 | by the document style of the original document before printing and scanning.
 266 | From a single page, the `careas` of the original document style cannot be
 267 | recovered exactly. However, the partition of a document by <{ocr_carea}> for an
 268 | individual page shall be considered correct relative to ground truth if
 269 | 
 270 |   1. all the text contained in a ground truth carea is fully contained within a
 271 |     single <{ocr_carea}>,
 272 |   2. no text outside a ground truth `carea` is contained within an
 273 |     <{ocr_carea}>, and 
 274 |   3. the <{ocr_carea}> elements appear in the same order as the text flow
 275 |     relationships between the ground truth careas.
 276 | 
 277 | ### <dfn element>ocr_line</dfn> ### {#sec-ocr_line}
 278 | 
 279 | <pre class="include">path: include/defs/ocr_line</pre>
 280 | 
 281 | In typesetting systems, content areas are filled with “blocks”, but most of
 282 | those blocks are not recoverable or semantically meaningful. However, one type
 283 | of block is visible and very important for OCR engines: the line. Lines are
 284 | typesetting blocks that only contain glyphs (“inlines” in XSL terminology).
 285 | They are represented by the <{ocr_line}> area.
 286 | 
 287 | <{ocr_line}> should be in a <{span}>
 288 | 
 289 | ### <dfn element>ocr_separator</dfn> ### {#sec-ocr_separator}
 290 | 
 291 | <pre class="include">path: include/defs/ocr_separator</pre>
 292 | 
 293 | Any separator or similar element
 294 | 
 295 | ### <dfn element>ocr_noise</dfn> ### {#sec-ocr_noise}
 296 | 
 297 | <pre class="include">path: include/defs/ocr_noise</pre>
 298 | 
 299 | Any noise element that isn't part of typesetting
 300 | 
 301 | Float elements {#sec-float-elements}
 302 | --------------
 303 | 
 304 | Overlaid onto the page is a set of floating elements; floating elements exist
 305 | outside the normal reading order. Floating elements may be introduced by the
 306 | textual content, or they may be related to the page itself (anchoring is a
 307 | logical property). In typesetting systems, floating elements may be anchored to
 308 | the page, to paragraphs, or to the content stream. Floating elements can
 309 | overlap content areas and render on top of or under content, or they can force
 310 | content to flow around them. The default for floating elements in this spec is
 311 | that their anchor is undefined (it is a logical property, not a typesetting
 312 | property), and that text flows around them. Note that with rectangular content
 313 | areas and rectangular floats, already a wide variety of non-rectangular text
 314 | shapes can be realized.
 315 | 
 316 | Issue: There is currently no way of indicating anchoring or flow-around
 317 | properties for floating elements; properties need to be defined for this.
 318 | 
 319 | Floats should not be nested.
 320 | The following floats are defined:
 321 | 
 322 | ### <dfn element>ocr_float</dfn> ### {#sec-ocr_float}
 323 | 
 324 | <pre class="include">path: include/defs/ocr_float</pre>
 325 | 
 326 | ### <dfn element>ocr_textfloat</dfn> and <dfn element>ocr_textimage</dfn> ### {#floats-text}
 327 | 
 328 | <pre class="include">path: include/defs/ocr_textfloat</pre>
 329 | <pre class="include">path: include/defs/ocr_textimage</pre>
 330 | 
 331 | ### <dfn element>ocr_image</dfn>, <dfn element>ocr_linedrawing</dfn> and <dfn element>ocr_photo</dfn> ### {#floats-image}
 332 | 
 333 | <pre class="include">path: include/defs/ocr_image</pre>
 334 | 
 335 | <pre class="include">path: include/defs/ocr_linedrawing</pre>
 336 | 
 337 | Something that could be represented well and naturally in a vector graphics
 338 | format like SVG (even if it is actually represented as PNG)
 339 | 
 340 | <pre class="include">path: include/defs/ocr_photo</pre>
 341 | 
 342 | Something that requires JPEG or PNG to be represented well
 343 | 
 344 | ### <dfn element>ocr_header</dfn> and <dfn element>ocr_footer</dfn> ### {#float-foot-head}
 345 | 
 346 | <pre class="include">path: include/defs/ocr_header</pre>
 347 | <pre class="include">path: include/defs/ocr_footer</pre>
 348 | 
 349 | ### <dfn element>ocr_pageno</dfn> ### {#sec-ocr_pageno}
 350 | 
 351 | <pre class="include">path: include/defs/ocr_pageno</pre>
 352 | 
 353 | ### <dfn element>ocr_table</dfn> ### {#sec-ocr_table}
 354 | 
 355 | <pre class="include">path: include/defs/ocr_table</pre>
 356 | 
 357 | Logical Elements {#sec-logical-elements}
 358 | ----------------
 359 | 
 360 | Issue: [Logical Tags/classes](https://github.com/kba/hocr-spec/issues/66)
 361 | 
 362 | The classes defined in this section for logically structuring a hOCR document
 363 | have their standard meaning as used in the publishing industry and tools like
 364 | LaTeX, MS Word, and others.
 365 | 
 366 | Tags must be nested as indicated by the following list, but not all tags within the
 367 | hierarchy need to be present.
 368 | 
 369 | 	* <{ocr_document}>
 370 | 		* <{ocr_linear}>
 371 | 			* <{ocr_title}>
 372 | 			* <{ocr_author}>
 373 | 			* <{ocr_abstract}>
 374 | 			* <{ocr_part}>
 375 | 				* <{ocr_chapter}>
 376 | 					* <{ocr_section}> ▻ <{ocr_subsection}> ▻ <{ocr_subsubsection}>
 377 | 						* <{ocr_display}>
 378 | 						* <{ocr_blockquote}>
 379 | 						* <{ocr_par}>
 380 | 
 381 | For all of these elements except <{ocr_linear}>, there exists a natural linear
 382 | ordering defined by reading order (<{ocr_linear}> indicates that the elements
 383 | contained in it have a linear ordering). At the level of <{ocr_linear}>, there
 384 | may not be a single distinguished order. A common example of <{ocr_linear}> is a
 385 | newspaper, in which a single newspaper may contain many linear streams, but there is no
 386 | unique reading order for the different linear streams. OCR evaluation tools should
 387 | therefore be sensitive to the order of all elements other than <{ocr_linear}>.
 388 | 
 389 | Textual information like section numbers and bullets must be represented as
 390 | text inside the containing element.
 391 | 
 392 | Documents whose logical structure does not map naturally onto these logical
 393 | structuring elements must not use them for other purposes.
 394 | 
 395 | 
 396 | ### <dfn element>ocr_document</dfn> ### {#sec-ocr_document}
 397 | 
 398 | <pre class="include">path: include/defs/ocr_document</pre>
 399 | 
 400 | ### <dfn element>ocr_title</dfn>, <dfn element>ocr_author</dfn> and <dfn element>ocr_abstract</dfn> ### {#titlepage-elements}
 401 | 
 402 | <pre class="include">path: include/defs/ocr_title</pre>
 403 | 
 404 | <pre class="include">path: include/defs/ocr_author</pre>
 405 | 
 406 | <pre class="include">path: include/defs/ocr_abstract</pre>
 407 | 
 408 | ### <dfn element>ocr_part</dfn> and <dfn element>ocr_chapter</dfn> ### {#sec-part-chapter}
 409 | 
 410 | <pre class="include">path: include/defs/ocr_part</pre>
 411 | 
 412 | <pre class="include">path: include/defs/ocr_chapter</pre>
 413 | 
 414 | ### <dfn element>ocr_section</dfn>, <dfn element>ocr_subsection</dfn> and <dfn element>ocr_subsubsection</dfn> ### {#sectioning}
 415 | 
 416 | <pre class="include">path: include/defs/ocr_section</pre>
 417 | 
 418 | <pre class="include">path: include/defs/ocr_subsection</pre>
 419 | 
 420 | <pre class="include">path: include/defs/ocr_subsubsection</pre>
 421 | 
 422 | ### <dfn element>ocr_display</dfn>, <dfn element>ocr_blockquote</dfn> and <dfn element>ocr_par</dfn> ### {#special-paragraphs}
 423 | 
 424 | <pre class="include">path: include/defs/ocr_display</pre>
 425 | 
 426 | <pre class="include">path: include/defs/ocr_blockquote</pre>
 427 | 
 428 | <pre class="include">path: include/defs/ocr_par</pre>
 429 | 
 430 | ### <dfn element>ocr_linear</dfn> ### {#sec-ocr_linear}
 431 | 
 432 | <pre class="include">path: include/defs/ocr_linear</pre>
 433 | 
 434 | ### <dfn element>ocr_caption</dfn> ### {#sec-ocr_caption}
 435 | 
 436 | <pre class="include">path: include/defs/ocr_caption</pre>
 437 | 
 438 | Image captions may be indicated using the <{ocr_caption}> element; such an
 439 | element refers to the image(s) contained within the same float, or the
 440 | immediately adjacent image if both the image and the <{ocr_caption}> element are
 441 | in running text.
 442 | 
 443 | 
 444 | Inline Elements {#sec-inline-elements}
 445 | ---------------
 446 | 
 447 | Issue(51):
 448 | 
 449 | There is some content that should behave and flow like text
 450 | 
 451 | ### Unrecognized characters and words: <dfn element>ocr_glyph</dfn> and <dfn element>ocr_glyphs</dfn> ### {#unrecognized}
 452 | 
 453 | <pre class="include">path: include/defs/ocr_glyph</pre>
 454 | 
 455 | <ul>
 456 | 
 457 |   * An individual glyph represented as an image (e.g., an unrecognized character)
 458 |   * Must contain a single <{img}> tag, or be present on one
 459 | 
 460 | </ul>
 461 | 
 462 | <pre class="include">path: include/defs/ocr_glyphs</pre>
 463 | 
 464 | <ul>
 465 | 
 466 |   * Multiple glyphs represented as an image (e.g., an unrecognized word)
 467 |   * Must contain a single <{img}> tag, or be present on one
 468 | 
 469 | </ul>
 470 | 
 471 | ### <dfn element>ocr_dropcap</dfn> ### {#dropcap}
 472 | 
 473 | <pre class="include">path: include/defs/ocr_dropcap</pre>
 474 | 
 475 |   * An individual glyph representing a dropcap
 476 |   * May contain text or an <{img}> tag; the <{img/alt}> of the image tag should contain
 477 |     the corresponding text
 478 | 
 479 | ### Mathematical and chemical formulas: <dfn element>ocr_math</dfn> and <dfn element>ocr_chem</dfn> ### {#formulas}
 480 | 
 481 | <pre class="include">path: include/defs/ocr_math</pre>
 482 | <pre class="include">path: include/defs/ocr_chem</pre>
 483 | 
 484 | Mathematical and chemical formulas that float must be put into an <{ocr_float}>
 485 | section. Formulas that are in “display” mode should be put into
 486 | an <{ocr_display}> section. <{ocr_math}> and <{ocr_chem}> have the following content requirements:
 487 | 
 488 | <{ocr_math}> must either be or contain either a single <{img}> tag or [[MathML]] markup
 489 | 
 490 | <{ocr_chem}> must either be or contain either a single <{img}> tag or [[CML]] markup
 491 | 
 492 | ### Unspecified inline content: <dfn element>ocr_cinfo</dfn> ### {#sec-ocr_cinfo}
 493 | 
 494 | Issue: Define <dfn element>ocrx_cinfo</dfn>
 495 | 
 496 |   * If no other layout element applies, the <{ocr_cinfo}> element may be used.
 497 |   * <{ocrx_cinfo}> should nest inside <{ocrx_line}>
 498 |   * <{ocrx_cinfo}> should contain only 'x_confs', 'x_bboxes', and 'cuts' properties
 499 | 
 500 | OCR Engine-Specific elements {#sec-engine-elements}
 501 | ----------------------------
 502 | 
 503 | A few abstractions are used as intermediate abstractions in OCR engines,
 504 | although they do not have a meaning that can be defined either in terms of
 505 | typesetting or logical function. Representing them may be useful to represent
 506 | existing OCR output, say for workflow abstractions.
 507 | 
 508 | Commonly suggested engine-specific elements are:
 509 | 
 510 | ### <dfn element>ocrx_block</dfn> ### {#ocrx_block}
 511 | 
 512 | <pre class="include">path: include/defs/ocrx_block</pre>
 513 | 
 514 | Issue: [ocr_carea vs ocrx_block](https://github.com/kba/hocr-spec/issues/28)
 515 | 
 516 |   * any kind of "block" returned by an OCR system
 517 |   * engine-specific because the definition of a "block" depends on the engine
 518 | 
 519 | Generators should attempt to ensure the following properties:
 520 | 
 521 |   * An <{ocrx_block}> should not contain content from multiple <{ocr_carea}>.
 522 |   * The union of all <{ocrx_block|ocrx_blocks}> should approximately cover all <{ocr_carea}>.
 523 |   * an <{ocrx_block}> should contain either a float or body text, but not both
 524 |   * an <{ocrx_block}> should contain either an image or text, but not both
 525 | 
 526 | ### <dfn element>ocrx_line</dfn> ### {#sec-ocrx_line}
 527 | 
 528 | <pre class="include">path: include/defs/ocrx_line</pre>
 529 | 
 530 | Issue: [ocr_line vs ocrx_line](https://github.com/kba/hocr-spec/issues/19)
 531 | 
 532 |   * any kind of "line" returned by an OCR system that differs from the standard <{ocr_line}> above
 533 |   * might be some kind of "logical" line
 534 |   * an <{ocrx_line}> should correspond as closely as possible to an <{ocr_line}>
 535 | 
 536 | ### <dfn element>ocrx_word</dfn> ### {#sec-ocrx_word}
 537 | 
 538 | <pre class="include">path: include/defs/ocrx_word</pre>
 539 | 
 540 |   * any kind of "word" returned by an OCR system
 541 |   * engine-specific because the definition of a "word" depends on the engine
 542 | 
 543 | The properties of hOCR {#hocr-props}
 544 | ======================
 545 | 
 546 | The <a>properties</a> in hOCR can be broadly <dfn lt="Property
 547 | Categories">categorized</dfn> as follows:
 548 | 
 549 |   : <dfn>General Properties</dfn>
 550 |   :: These properties can apply to most elements
 551 | 
 552 |   : <dfn>Non-Recommended Properties</dfn>
 553 |   :: These properties can apply to most elements but should not be used unless
 554 |     there is no alternative:
 555 | 
 556 |   : <dfn>Inline Properties</dfn>
 557 |   :: These properties apply to content on or below the level of <{ocr_line}> /
 558 |     <{ocrx_line}>
 559 | 
 560 |   : <dfn>Layout Properties</dfn>
 561 |   :: These properties relate to placement of <a>elements</a> on the page
 562 | 
 563 |   : <dfn>Font Properties</dfn>
 564 |   :: These properties convey font information
 565 | 
 566 |   : <dfn>Character Properties</dfn>
 567 |   :: These properties convey character level information
 568 | 
 569 |   : <dfn>Page Properties</dfn>
 570 |   :: These properties convey information on the whole page
 571 | 
 572 |   : <dfn>Content Flow Properties</dfn>
 573 |   :: These properties are related to the reading order and flow of content on the page
 574 | 
 575 |   : <dfn>Confidence Properties</dfn>
 576 |   :: These properties are related to the confidence of the hOCR producer that
 577 |     the text in the <a>element</a> has been correctly recognized
 578 | 
 579 | The <dfn property>baseline</dfn> property {#baseline}
 580 | -----------------------------------------
 581 | 
 582 | <pre class="include">path: include/defs/baseline</pre>
 583 | 
 584 | This property applies primarily to textlines.
 585 | 
 586 | The baseline is described by a polynomial of order `n` with the coefficients
 587 | `pn ...  p0` with `n = 1` for a linear (i.e. straight) line.
 588 | 
 589 | The polynomial is in the coordinate system of the line, with the bottom left of
 590 | the bounding box as the origin.
 591 | 
 592 | <div class="example">
 593 | 
 594 | The hOCR output for the first line of
 595 | [eurotext.tif](https://github.com/tesseract-ocr/tesseract/blob/master/testing/eurotext.tif)
 596 | contains the following information:
 597 | 
 598 | ```html
 599 | <span class='ocr_line' id='line_1_1'
 600 |     title="bbox 105 66 823 113; baseline 0.015 -18">...</span>
 601 | ```
 602 | 
 603 | 'bbox' is the bounding box of the line in image coordinates (blue). The two
 604 | numbers for the baseline are the slope (1st number) and constant term (2nd
 605 | number) of a linear equation describing the baseline relative to the bottom
 606 | left corner of the bounding box (red). The baseline crosses the y-axis at `-18`
 607 | and its slope angle is `arctan(0.015) = 0.86°`.
 608 | 
 609 | <figure><img
 610 |   alt="baseline explained"
 611 |   src="../images/baseline.png"/>
 612 | </figure>
 613 | 
 614 | </div>
 615 | 
 616 | 
 617 | The <dfn property>bbox</dfn> property {#bbox}
 618 | -------------------------------------
 619 | 
 620 | <pre class="include">path: include/defs/bbox</pre>
 621 | 
 622 | The 'bbox' - short for "bounding box" - of an element is a rectangular box
 623 | around this element, which is defined by the upper-left corner (x0, y0) and
 624 | the lower-right corner (x1, y1).
 625 | 
 626 |   * the values are with reference to the top-left corner of the document image
 627 |     and measured in pixels
 628 |   * the order of the values is `x0 y0 x1 y1` = "left top right bottom"
 629 |   * use 'x_bboxes' below for character bounding boxes
 630 |   * do not use 'bbox' unless the bounding box of the layout component is, in
 631 |     fact, rectangular
 632 |   * some non-rectangular layout components may have rectangular bounding boxes
 633 |     if the non-rectangularity is caused by floating elements around which text flows
 634 | 
 635 | <div class="example">
 636 | 
 637 | ```html
 638 | <span class='ocr_line' id='line_1'
 639 |     title="bbox 10 20 160 30">...</span>
 640 | ```
 641 | 
 642 | The bounding box 'bbox' of this line is shown in blue and it is spanned
 643 | by the upper-left corner (10, 20) and the lower-right corner (160, 30).
 644 | All coordinates are measured with reference to the top-left corner of
 645 | the document image, whose border is drawn in black.
 646 | 
 647 | <figure><img
 648 |   alt="bbox explained"
 649 |   src="../images/bbox-crop.png"/>
 650 | </figure>
 651 | 
 652 | </div>
 653 | 
 654 | The <dfn property>cflow</dfn> property {#cflow}
 655 | --------------------------------------
 656 | 
 657 | <pre class="include">path: include/defs/cflow</pre>
 658 | 
 659 | This property relates the flow between multiple <{ocr_carea}> elements,
 660 | and between <{ocr_carea}> and <{ocr_linear}> elements.
 661 | 
 662 | The content flow on the page that this element is a part of
 663 | 
 664 |   * s must be a unique string for each content flow
 665 |   * must be present on <{ocr_carea}> and <{ocrx_block}> tags when reading
 666 |     order is attempted and multiple content flows are present
 667 |   * presence must be declared in the document metadata
 668 | 
 669 | 
 670 | The <dfn property>cuts</dfn> property {#cuts}
 671 | -------------------------------------
 672 | <pre class="include">path: include/defs/cuts</pre>
 673 | 
 674 | <ul>
 675 | 
 676 |   * character segmentation cuts (see below)
 677 |   * there must be a 'bbox' property relative to which the 'cuts' can be interpreted
 678 | 
 679 | </ul>
 680 | 
 681 | For left-to-right writing directions, cuts are sequences of deltas in the x and
 682 | y direction; the first delta in each path is an offset in the x direction
 683 | relative to the last x position of the previous path. The subsequent deltas
 684 | alternate between up and right moves.
 685 | 
 686 | <div class="example">
 687 | 
 688 | Assume a bounding box of `(0,0,300,100)`; then
 689 | 
 690 | ```python
 691 | cuts("10 11 7 19") =
 692 |     [ [(10,0),(10,100)], [(21,0),(21,100)], [(28,0),(28,100)], [(47,0),(47,100)] ]
 693 | cuts("10,50,3 11,30,-3") =
 694 |     [ [(10,0),(10,50),(13,50),(13,100)], [(21,0),(21,30),(18,30),(18,100)] ]
 695 | ```
 696 | 
 697 | ```html
 698 | <span class="ocr_cinfo" title="bbox 0 0 300 100; nlp 1.7 2.3 3.9 2.7; cuts 9 11 7,8,-2 15 3">hello</span>
 699 | ```
 700 | </div>
 701 | 
 702 | 
 703 | Cuts are between all codepoints contained within the element, including any
 704 | whitespace and control characters.  Simply use a delta of 0 (zero) for
 705 | invisible codepoints.
 706 | 
 707 | Writing directions other than left-to-right specify cuts as if the bounding box
 708 | for the element had been rotated by a multiple of 90 degrees such that the
 709 | writing direction is left to right, then rotated back.
 710 | 
 711 | It is undefined what happens when cut paths intersect, with the exception that
 712 | a delta of 0 always corresponds to an invisible codepoint.
 713 | 
 714 | The <dfn property>hardbreak</dfn> property {#hardbreak}
 715 | ------------------------------------------
 716 | 
 717 | <pre class="include">path: include/defs/hardbreak</pre>
 718 | 
 719 |   * a zero (default) indicates that the end of the line is not a hard
 720 |     (explicit) line break, but a break due to text flow
 721 |   * a one indicates that the end of the line is a hard (explicit) line break
 722 | 
 723 | Any special characters representing the desired end-of-line processing must be
 724 | present inside the <{ocr_line}> element. Examples of such special characters are a
 725 | soft hyphen ("­", `U+00AD`), a hard line break (`<br>`), or whitespace (` `) for soft
 726 | line breaks.
 727 | 
 728 | The <dfn property>image</dfn> property {#image}
 729 | --------------------------------------
 730 | <pre class="include">path: include/defs/image</pre>
 731 | 
 732 | <ul>
 733 | 
 734 |   * image file name used as input
 735 |   * syntactically, must be a UNIX-like pathname or http URL (no Windows pathnames)
 736 |   * may be relative
 737 |   * cannot be resolved to the actual file in general (e.g., if the hOCR file
 738 |     becomes separated from the image file)
 739 |   * if the hOCR file is present in a directory hierarchy or file archive, should
 740 |     resolve to the corresponding image file
 741 | 
 742 | </ul>
 743 | 
 744 | The <dfn property>imagemd5</dfn> property {#imagemd5}
 745 | -----------------------------------------
 746 | <pre class="include">path: include/defs/imagemd5</pre>
 747 | 
 748 | <ul>
 749 | 
 750 |   * MD5 fingerprint of the image file that this page was derived from
 751 |   * allows re-associating pages with source images
 752 | 
 753 | </ul>
 754 | 
 755 | The <dfn property>lpageno</dfn> property {#lpageno}
 756 | ----------------------------------------
 757 | 
 758 | <pre class="include">path: include/defs/lpageno</pre>
 759 | 
 760 | <ul>
 761 |   * the logical page number expressed on the page
 762 |   * may not be numerical (e.g., Roman numerals)
 763 |   * usually is unique
 764 |   * must not be present unless it has been recognized from the page and is unambiguous
 765 | 
 766 | </ul>
 767 | 
 768 | The <dfn property>ppageno</dfn> property {#ppageno}
 769 | ----------------------------------------
 770 | 
 771 | <pre class="include">path: include/defs/ppageno</pre>
 772 | 
 773 |   * the physical page number
 774 |   * the front cover is page number 0
 775 |   * should be unique
 776 |   * must not be present unless the pages in the document have a physical ordering
 777 |   * must not be present unless it is well defined and unique
 778 | 
 779 | The <dfn property>nlp</dfn> property {#nlp}
 780 | ------------------------------------
 781 | <pre class="include">path: include/defs/nlp</pre>
 782 | 
 783 |   * estimate of the negative log probabilities of each character by the recognizer
 784 | 
 785 | 
 786 | The <dfn property>order</dfn> property {#order}
 787 | --------------------------------------
 788 | 
 789 | <pre class="include">path: include/defs/order</pre>
 790 | 
 791 | The reading order of the element (an integer)
 792 | 
 793 |   * this property must not be used unless there is no other way of representing
 794 |     the reading order of the page by element ordering within the page, since
 795 |     many tools will not be able to deal with content that is not in reading order
 796 |   * presence must be declared in the document meta data
 797 | 
 798 | The <dfn property>poly</dfn> property {#poly}
 799 | -------------------------------------
 800 | 
 801 | <pre class="include">path: include/defs/poly</pre>
 802 | 
 803 | A closed polygon for elements with non-rectangular bounds
 804 | 
 805 |   * this property must not be used unless there is no other way of
 806 |     representing the layout of the page using rectangular bounding boxes,
 807 |     since most tools will simply not have the capability of dealing with
 808 |     non-rectangular layouts
 809 |   * note that the natural and correct representation of many non-rectangular
 810 |     layouts is in terms of rectangular content areas and rectangular floats
 811 |   * documents using polygonal borders anywhere must indicate this by adding
 812 |     ''ocr-capabilities/ocrp_poly'' to the list of 'ocr-capabilities' (see
 813 |     [[#capabilities]])
 814 |   * documents should attempt to provide a reasonable 'bbox' equivalent as well
 815 | 
 816 | 
 817 | The <dfn property>scan_res</dfn> property {#scan_res}
 818 | -----------------------------------------
 819 | 
 820 | <pre class="include">path: include/defs/scan_res</pre>
 821 | 
 822 | The scanning resolution in DPI
 823 | 
 824 | The <dfn property>textangle</dfn> property {#textangle}
 825 | ------------------------------------------
 826 | 
 827 | <pre class="include">path: include/defs/textangle</pre>
 828 | 
 829 | The angle in degrees by which textual content has been rotated relative to the
 830 | rest of the page (if not present, the angle is assumed to be zero); rotations
 831 | are counter-clockwise, so an angle of 90 degrees is vertical text running from
 832 | bottom to top in Latin script; note that this is different from reading order,
 833 | which should be indicated using standard HTML properties
 834 | 
 835 | The <dfn property>x_bboxes</dfn> property {#x_bboxes}
 836 | -----------------------------------------
 837 | 
 838 | <pre class="include">path: include/defs/x_bboxes</pre>
 839 | 
 840 |   * OCR-engine specific boxes associated with each codepoint contained in the
 841 |     element
 842 |   * note that the 'bbox' property is a property for the bounding box of a layout
 843 |     element, not of individual characters
 844 |   * in particular, use `<span class="ocr_cinfo" title="x_bboxes ....">`, not
 845 |     `<span class="ocr_cinfo" title="bbox ...">`
 846 | 
 847 | 
 848 | The <dfn property>x_font</dfn> property {#x_font}
 849 | ------------------------------
 850 | 
 851 | <pre class="include">path: include/defs/x_font</pre>
 852 | 
 853 | 'x_font' is an OCR-engine specific font name (a string).
 854 | 
 855 | The <dfn property>x_fsize</dfn> property {#x_fsize}
 856 | ----------------------------------------
 857 | 
 858 | <pre class="include">path: include/defs/x_fsize</pre>
 859 | 
 860 | 'x_fsize' is the OCR-engine specific font size (an unsigned integer).
 861 | 
 862 | The <dfn property>x_confs</dfn> property {#x_confs}
 863 | ----------------------------------------
 864 | 
 865 | <pre class="include">path: include/defs/x_confs</pre>
 866 | 
 867 | <ul>
 868 | 
 869 |   * OCR-engine specific character confidences
 870 |   * values must be numbers
 871 |   * higher values should express higher confidences
 872 |   * if possible, convert character confidences to values between 0 and 100 and
 873 |     have them approximate posterior probabilities (expressed in %)
 874 | 
 875 | </ul>
 876 | 
 877 | The <dfn property>x_scanner</dfn> property {#x_scanner}
 878 | -----------------------------------------------------------------------------
 879 | 
 880 | <pre class="include">path: include/defs/x_scanner</pre>
 881 | 
 882 | A representation of the scanner
 883 | 
 884 | The <dfn property>x_source</dfn> property {#x_source}
 885 | -----------------------------------------
 886 | <pre class="include">path: include/defs/x_source</pre>
 887 | 
 888 |   * an implementation-dependent representation of the document source
 889 |   * could be a URL or a /gfs/ path
 890 |   * offsets within a multipage format (e.g., TIFF) may be represented using
 891 |     additional strings or using URL parameters or fragments
 892 | 
 893 | The <dfn property>x_wconf</dfn> property {#x_wconf}
 894 | ----------------------------------------
 895 | 
 896 | <pre class="include">path: include/defs/x_wconf</pre>
 897 | 
 898 | <ul>
 899 | 
 900 |   * OCR-engine specific confidence for the entire contained substring
 901 |   * value must be a number
 902 |   * higher values should express higher confidences
 903 |   * if possible, convert word confidences to values between 0 and 100 and have
 904 |     them approximate posterior probabilities (expressed in %)
 905 | 
 906 | </ul>
 907 | 
 908 | 
 909 | 
 910 | Encoding Guidelines {#guidelines}
 911 | ===================
 912 | 
 913 | Recommendations for Mappings {#html-mappings}
 914 | ----------------------------
 915 | 
 916 | When possible, any mapping of logical structure onto HTML should try to follow the following rules:
 917 | 
 918 |   * the mapping should be "natural" -- similar to what an author of the document
 919 |     might have entered into a WYSIWYG content creation tool
 920 |   * text should be in reading order
 921 |   * all tags should be used for the intended purpose (and only for the intended
 922 |     purpose) as defined in the [[HTML401]] spec.
 923 |   * floats are contained in <{div}> elements with a `style` that includes a float attribute
 924 |   * repeating floating page elements (header/footer) should be repeated and occur
 925 |     in their natural location in reading order (e.g., between pages)
 926 |   * embedded images and SVG should be contained in files in the same directory
 927 |     (no `/` in the URL) and embedded with <{img}> and <{embed}> tags, respectively
 928 | 
 929 | Specifically
 930 | 
 931 |   * <{em}> and <{strong}> should represent emphasis, and are preferred to <{b}>, <{i}>, and <{u}>
 932 |   * <{b}>, <{i}>, and <{u}> should represent a change in the corresponding
 933 |     attribute for the current font (but an OCR font specification must still be
 934 |     given)
 935 |   * <{p}> should represent paragraph breaks
 936 |   * <{br}> should represent explicit linebreaks (not linebreaks that happen because of text flow)
 937 |   * <{h1}>, ..., <{h6}> should represent the logical nesting structure (if any) of the document
 938 |   * <{a}> should represent hyperlinks and references within the document
 939 |   * <{blockquote}> should represent indented quotations, but not other uses of indented text.
 940 |   * <{ul}>, <{ol}>, <{dl}> should represent lists
 941 |   * <{table}> should represent tables, including correct use of the <{th}> tag
 942 | 
 943 | If necessary, the markup may use the following non-standard tags:
 944 | 
 945 |   * <{nobr}> to indicate that line breaking is not permitted for the enclosed content
 946 |   * <{wbr}> to indicate that line breaking is permitted at that location
 947 | 
 948 | Styling hOCR with CSS {#sec-css}
 949 | ---------------------
 950 | 
 951 | OCR information and presentation information can be separated by putting the
 952 | CSS info related to the OCR in an outer element with an `ocr_` or `ocrx_` class,
 953 | and then overriding it for the presentation by nesting another <{span}> with the
 954 | actual presentation information inside that:
 955 | 
 956 | ```html
 957 | <span class="ocr_cinfo" style="ocr style"><span style="presentation style"> ... </span></span>
 958 | ```
 959 | 
 960 | Language, Writing Direction {#sec-lang}
 961 | ---------------------------
 962 | 
 963 | OCR-generated font and text color information is encoded using standard HTML
 964 | and CSS attributes on elements with a class of `ocr_...` or `ocrx_...`.
 965 | 
 966 | Language and writing direction should be indicated using the HTML standard
 967 | attributes <{*/lang}> and <{*/dir}>.
 968 | 
 969 | [Furigana and similar constructs](https://en.wikipedia.org/wiki/Ruby_character)
 970 | must be represented using their correct Unicode encoding.
 971 | 
 972 | The HTML <a href="https://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.2.5">`&lrm;` and
 973 | `&rlm;` entities</a> (indicating writing direction) must not be used; all
 974 | writing direction changes must be indicated with new tags with an appropriate
 975 | <{*/dir}> attribute.
 976 | 
 977 | The CSS3 text layout attributes can be used when necessary. For example, CSS
 978 | supports writing-mode, direction, glyph-orientation, [[ISO15924]]-based
 979 | script ([list of codes](http://www.unicode.org/iso15924/codelists.html)), text-indent, etc.
 980 | 
 981 | Superscript and Subscript {#sub-sup}
 982 | -------------------------
 983 | 
 984 | Superscripts and subscripts, when not in <{ocr_math}> or <{ocr_chem}> formulas,
 985 | must be represented using the HTML <{sup}> and <{sub}> tags, even if special
 986 | Unicode characters are available.
 987 | 
 988 | Whitespace {#whitespace}
 989 | ----------
 990 | 
 991 | Non-breaking spaces must be represented using the HTML `&nbsp;` entity.
 992 | 
 993 | Different space widths should be indicated using HTML and `&ensp;`, `&emsp;`,
 994 | `&thinsp;`, `&zwnj;`, `&zwj;`.
 995 | 
 996 | Hyphenation {#hyphenation}
 997 | -----------
 998 | 
 999 | Issue(7): How to handle hyphens?
1000 | 
1001 | Issue(altoxml/schema#41): Non Linear Hyphens
1002 | 
1003 | Soft hyphens must be represented using the HTML `&shy;` entity.
1004 | 
1005 | Alternative Segmentations / Readings {#segmentation}
1006 | ------------------------------------
1007 | 
1008 | Issue: [Delete x_cost](https://github.com/kba/hocr-spec/issues/9)
1009 | 
1010 | Alternative segmentations and readings are indicated by a <{span}> with
1011 | `class="alternatives"`. It must contain <{ins}> and <{del}> elements. The first
1012 | contained element should be <{ins}> and represent the most probable interpretation,
1013 | the subsequent ones <{del}>. Each <{ins}> and <{del}> element should have `class="alt"` and a
1014 | property of either 'nlp' or 'x_cost'. These <{span}>, <{ins}>, and <{del}> tags can nest
1015 | arbitrarily.
1016 | 
1017 | <div class="example">
1018 | ```html
1019 | <span class="alternatives">
1020 | <ins class="alt" title="nlp 0.3">hello</ins>
1021 | <del class="alt" title="nlp 1.1">hallo</del>
1022 | </span>
1023 | ```
1024 | </div>
1025 | 
1026 | Whitespace within the <{span}> but outside the contained <{ins}>/<{del}>
1027 | elements is ignored and should be inserted to improve readability of the HTML
1028 | when viewed in a browser.
1029 | 
1030 | Grouped Elements and Multiple Hierarchies {#groups}
1031 | -----------------------------------------
1032 | 
1033 | The different levels of layout information (logical, physical, engine-specific)
1034 | each form hierarchies, but those hierarchies may not be mutually compatible;
1035 | for example, a single <{ocr_page}> may contain information from multiple sections
1036 | or chapters. To represent both hierarchies within a single document, elements
1037 | may be grouped together.  That is, two elements with the same class may be
1038 | treated as one element by adding a "groupid identifier" property to them and
1039 | using the same identifier. 
1040 | 
1041 | Grouped elements should be logically consistent with the markup they represent;
1042 | for example, it is probably not sensible to use grouped elements to interleave
1043 | parts of two different chapters.  Therefore, grouped elements should usually be
1044 | adjacent in the markup.
1045 | 
1046 | Applications using hOCR may choose to manipulate grouped elements directly, but
1047 | the simplest way of dealing with them is to transform a document with grouped
1048 | elements into one without grouped elements prior to further processing by first
1049 | removing tags that are not of interest for the subsequent processing step, and
1050 | then collapsing grouped elements into single elements.  For example, output
1051 | that contains both logical and physical layout information, where the logical
1052 | layout information uses grouped elements, can be transformed by removing all
1053 | the physical layout information, and then collapsing all split <{ocr_chapter}>
1054 | elements into single <{ocr_chapter}> elements based on the groupid.  The result is
1055 | a simple DOM tree.  This transformation can be provided generically as a
1056 | pre-processor or Javascript.
1057 | 
1058 | The presence of grouped elements does not need to be indicated in the header;
1059 | when it affects their operations, hOCR processors should check for the presence
1060 | of grouped elements in the output and fail with an error message if they cannot
1061 | correctly process the hOCR information.
1062 | 
1063 | 
1064 | Metadata {#metadata}
1065 | ========
1066 | 
1067 | The creator of the hOCR document can indicate the following
1068 | information using <{meta}> tags in the <{head}> section.
1069 | 
1070 |   : <dfn property>ocr-system</dfn>
1071 |   :: Indicates software and version that generated the hOCR document
1072 |   :: Every hOCR document *must* have exactly one 'ocr-system' metadata field
1073 | 
1074 |   : <dfn property>ocr-capabilities</dfn>
1075 |   :: Features consumers of the hOCR document can expect
1076 |   :: See [[#capabilities]] for possible values
1077 |   :: Every hOCR document *must* have exactly one 'ocr-capabilities' metadata field
1078 | 
1079 |   : <dfn property>ocr-number-of-pages</dfn>
1080 |   :: The number of <{ocr_page}> in the document
1081 | 
1082 |   : <dfn property>ocr-langs</dfn>
1083 |   :: Use [ISO 639-1](https://www.loc.gov/standards/iso639-2/php/code_list.php) codes
1084 |   :: Value may be `unknown`
1085 | 
1086 |   : <dfn property>ocr-scripts</dfn>
1087 |   :: Use [ISO 15924](http://www.unicode.org/iso15924/codelists.html) letter codes
1088 |   :: Value may be `unknown`
1089 | 
1090 | Document metadata {#document-metadata}
1091 | -----------------
1092 | 
1093 | For document meta information, use the [Dublin Core Embedding into
1094 | HTML](http://dublincore.org/documents/dcq-html/). See also [Citation Guidelines
1095 | for Dublin Core](http://dublincore.org/documents/dc-citation-guidelines/).
1096 | 
1097 | Capabilities {#capabilities}
1098 | ------------
1099 | 
1100 | Any program generating files in this output format must indicate in the
1101 | document metadata what kind of markup it is capable of generating. This
1102 | includes listing the exact set of markup sections that the system could have
1103 | generated, even if it did not actually generate them for the particular
1104 | document.
1105 | 
1106 | If a document lists a certain capability but no element or attribute is found
1107 | that corresponds to that capability, users of the document may infer that the
1108 | content is absent in the source document. If a capability is not listed, the
1109 | corresponding element or attribute must not be present in the document.
1110 | 
1111 | The capability to generate specific properties is given by the prefix `ocrp_...`;
1112 | the important properties are:
1113 | 
1114 | <dl dfn-for="ocr-capabilities">
1115 | 
1116 |   : <dfn value>ocrp_lang</dfn>
1117 |   :: Capable of generating <{*/lang}> attributes
1118 | 
1119 |   : <dfn value>ocrp_dir</dfn>
1120 |   :: Capable of generating <{*/dir}> attributes
1121 | 
1122 |   : <dfn value>ocrp_poly</dfn>
1123 |   :: Capable of generating [polygonal bounds](#poly)
1124 | 
1125 |   : <dfn value>ocrp_font</dfn>
1126 |   :: Capable of generating font information (standard font information)
1127 | 
1128 |   : <dfn value>ocrp_nlp</dfn>
1129 |   :: Capable of generating 'nlp|nlp confidences'
1130 | 
1131 |   : `ocr_embeddedformat_<formatname>`
1132 |   :: The capability to generate other specific embedded formats is given by the
1133 |     prefix `ocr_embeddedformat_<formatname>`.
1134 | 
1135 |   : `ocr_<tag>_unordered`
1136 |   :: If an OCR engine represents a particular tag but cannot determine reading
1137 |     order for that tag, it must specify a capability of
1138 |     `ocr_<tag>_unordered`.
1139 | 
1140 | </dl>
1141 | 
1142 | Profiles - Restricting hOCR markup {#profiles}
1143 | ----------------------------------
1144 | 
1145 | hOCR provides standard means of marking up information, but it does not mandate
1146 | the presence or absence of particular kinds of information.  For example, an
1147 | hOCR file may contain only logical markup, only physical markup, or only
1148 | engine-specific markup. As a result, merely knowing that OCR output is hOCR
1149 | compliant doesn't tell us whether that file is actually useful for subsequent
1150 | processing.
1151 | 
1152 | OCR systems can use hOCR in various different ways internally, but we will
1153 | eventually define some common profiles that mandate what kinds of information
1154 | need to be present in particular kinds of output.
1155 | 
1156 | Of particular importance are:
1157 | 
1158 |   * physical layout profile: OCR output in XHTML format with a defined set of
1159 |     common physical layout markup capabilities (page, carea, floats, line).
1160 |     Logical layout may be present as well, but the document tree structure must
1161 |     represent the physical layout structure, with logical layout elements split
1162 |     and grouped as needed.
1163 | 
1164 |   * logical layout profile: OCR output in XHTML format with a defined set of
1165 |     common logical layout markup capabilities (linear, chapter, section,
1166 |     subsection).  Physical layout may be present as well, but the document tree
1167 |     structure must represent the logical layout structure, with physical layout
1168 |     elements split and grouped as needed.
1169 | 
1170 | Other possible profiles might be defined for specific engines or specific
1171 | document classes:
1172 | 
1173 |   * common commercial OCR output (e.g., Abbyy)
1174 |     * <{ocr_page}>
1175 |     * <{ocrx_block}>, <{ocrx_line}>, <{ocrx_word}>
1176 |     * ''ocr-capabilities/ocrp_lang''
1177 |     * ''ocr-capabilities/ocrp_font''
1178 |   * book target
1179 |     * all logical structuring elements (as applicable), except <{ocr_linear}>
1180 |     * <{ocr_page}>
1181 |   * newspaper target
1182 |     * all logical structuring elements (as applicable)
1183 |     * articles map on <{ocr_linear}>
1184 |     * <{ocr_page}>
1185 | 
1186 | Formats: Restricting HTML Markup {#html-markup}
1187 | --------------------------------
1188 | 
1189 | The HTML-based markup is orthogonal to the hOCR-based markup; that is, both can
1190 | be chosen independent of one another. The only thing that needs to be
1191 | consistent between the two markups is the text contained within the tags. hOCR
1192 | and other embedded format tags can be put on HTML tags, or they can be put on
1193 | their own <{div}>/<{span}> tags.
1194 | 
1195 | There are many different choices possible and reasonable for the HTML markup,
1196 | depending on the use and further processing of the document. Each such choice
1197 | must be indicated in the meta data for the document.
1198 | 
1199 | Many mappings derived from existing tools are quite similar, and most follow
1200 | the restrictions and recommendations below already without further
1201 | modifications.
1202 | 
1203 | Depending on the particular HTML markup used in the document, the document is
1204 | suitable for different kinds of processing and use. The formats have the
1205 | following intents:
1206 | 
1207 |   : <a>html_none</a> (see [[#format-none]])
1208 |   :: Straightforward equivalent of Goodoc or [[XDOC]]
1209 |   : <a>html_simple</a>
1210 |   :: Target format for convenient on-line viewing and intermediate format for indexing
1211 |   : <a>html_xytable_absolute</a>, <a>html_xytable_relative</a>
1212 |   :: Target format for layout-preserving on-screen document viewing
1213 |   : Formats defined in [[#format-ocr]]
1214 |   :: Straightforward recording of commercial OCR system output
1215 |   : Formats defined in [[#format-absolute]]
1216 |   :: Target format for services like Google's View as HTML
1217 | 
1218 | As long as a format contains the hOCR information, it can be reprocessed by
1219 | layout analysis software and converted into one of the other formats. In
1220 | particular, we envision layout analysis tools for converting any hOCR document
1221 | into <a>html_absolute</a>, <a>html_xytable_absolute</a>, and
1222 | <a>html_simple</a>. Furthermore, internally, a layout analysis system might
1223 | use <a>html_xytable_absolute</a> as an intermediate format for converting hOCR
1224 | into <a>html_simple</a>.
1225 | 
1226 | 
1227 | ### HTML without logical markup ### {#format-none}
1228 | 
1229 | The <dfn>html_none</dfn> format contains no logical markup at all; it is
1230 | simply a collection of <{div}> and <{span}> elements with associated hOCR
1231 | information. Note that such documents can still be rendered visually through
1232 | the use of CSS.
1233 | 
1234 | ### HTML with limited logical elements ### {#format-simple}
1235 | 
1236 | The <dfn>html_simple</dfn> format follows the restrictions and
1237 | recommendations above, and only uses the following tags:
1238 | 
1239 |   * <{h1}> ...  <{h6}>
1240 |   * <{p}>, <{br}>
1241 |   * <{b}>, <{i}>, and <{u}> for appearance changes (bold, italic, underline)
1242 |   * <{font}> for any other appearance changes
1243 |   * <{a}>
1244 |   * <{div}> with a float style for floats
1245 |   * <{table}> for tables
1246 |   * <{img}> for images
1247 |   * all SVG must be externally embedded with the <{embed}> tag
1248 |   * the use of other embedded formats is permitted
1249 |   * all other uses of <{div}>, <{span}>, <{ins}>, and <{del}> only for hOCR tags or other embedded formats (hCard, …)
1250 | 
1251 | ### HTML produced by OCR engines ### {#format-ocr}
1252 | 
1253 | HTML markup produced by default by the OCR engine for the given document
1254 | must follow the template `html_ocr_<engine>`.
1255 | 
1256 | Examples of possible values are:
1257 | 
1258 |   : <dfn>html_ocr_unknown</dfn>
1259 |   :: The HTML was generated by some OCR engine, but it's unknown which one
1260 |   : <dfn>html_ocr_finereader_8</dfn>
1261 |   : <dfn>html_ocr_textbridge_11</dfn>
1262 | 
1263 | ### HTML with absolute positioning ### {#format-absolute}
1264 | 
1265 |   : <dfn>html_absolute</dfn>
1266 |   :: The HTML represents absolute positioning of elements on each page. 
1267 | 
1268 | Possible subformats are:
1269 | 
1270 |   : <dfn>html_absolute_cols</dfn>
1271 |   :: absolute positioning of cols
1272 | 
1273 |   : <dfn>html_absolute_pars</dfn> 
1274 |   :: absolute positioning of paragraphs
1275 | 
1276 |   : <dfn>html_absolute_lines</dfn> 
1277 |   :: absolute positioning of lines
1278 | 
1279 |   : <dfn>html_absolute_words</dfn> 
1280 |   :: absolute positioning of words
1281 | 
1282 |   : <dfn>html_absolute_chars</dfn> 
1283 |   :: absolute positioning of characters
1284 | 
1285 | The ["View as HTML" for PDF
1286 | files](https://googlewebmastercentral.blogspot.de/2011/09/pdfs-in-google-search-results.html)
1287 | feature of Google Search uses <a>html_absolute_lines</a>; this is probably the most
1288 | reasonable choice for approximating the appearance of the original document.
1289 | 
1290 | ### HTML as table ### {#format-table}
1291 | 
1292 |   : <dfn>html_xytable</dfn>
1293 |   :: The HTML is a table that gives the XY-cut layout segmentation structure of
1294 |     the page in tabular form.
1295 |   :: Note that in this format, text order does not necessarily correspond to
1296 |     reading order.
1297 |   :: The format must contain one <{table}> of class <dfn element>ocr_xycut</dfn>
1298 |     representing each page. The markup of the content of the table itself is as in
1299 |     <a>html_simple</a>.
1300 | 
1301 | Possible subformats are:
1302 | 
1303 | : <dfn>html_xytable_absolute</dfn>
1304 | :: The <{table}> structure must represent the absolute size of the original page element.
1305 | 
1306 | : <dfn>html_xytable_relative</dfn>
1307 | :: Table element sizes are expressed relative (percentages).
1308 | 
1309 | ### HTML from word processors ### {#format-wordprocessor}
1310 | 
1311 | The HTML represents markup that follows the mappings of the given document
1312 | processor to HTML.
1313 | 
1314 | Note that the document doesn't actually need to have been constructed in the
1315 | processor and that the processor doesn't need to have been used to generate
1316 | the HTML. For example, the <a>html_latex2html</a> tag merely indicates that,
1317 | say, a scanned and ocr'ed article uses the same conventions for logical markup
1318 | tags that an equivalent article actually written in LaTeX and actually
1319 | converted to HTML would have used.
1320 | 
1321 |   : <dfn>html_latex2html</dfn>
1322 | 
1323 |   : <dfn>html_msword</dfn>
1324 |   :: HTML mapping generated by “Save As HTML”
1325 | 
1326 |   : <dfn>html_ooffice</dfn>
1327 |   :: HTML mapping generated by “Save As HTML”
1328 | 
1329 |   : <dfn>html_docbook_xsl</dfn>
1330 |   :: HTML mapping generated by official XSL style sheets
1331 | 
1332 | Example {#metadata-example}
1333 | -------
1334 | 
1335 | <div class=example>
1336 | ```html
1337 | <html>
1338 |   <head>
1339 |     <meta name="ocr-system" content="tesseract v3.03"/>
1340 |     <meta name="ocr-capabilities" content="ocr_page ocr_line ocrp_lang"/>
1341 |     <meta name="ocr-langs" content="aa la zu"/>
1342 |     <meta name="ocr-scripts" content="Arab Khmr"/>
1343 |     <meta name="ocr-number-of-pages" content="112"/>
1344 |     ...
1345 |   </head>
1346 |   ...
1347 | </html>
1348 | ```
1349 | 
1350 | Indicates that the work this hOCR file represents:
1351 | 
1352 |   * was produced by Tesseract v3.03
1353 |   * will provide <{ocr_page}> and <{ocr_line}> elements with <{*/lang}> attribute
1354 |   * contains text written in the Afar, Latin or Zulu languages
1355 |   * contains text written in Arabic and Khmer script
1356 |   * has `112` pages
1357 | 
1358 | </div>
1359 | 
1360 | ---
1361 | 
1362 | Appendix A: Revision History {#history}
1363 | ============================
1364 | 
1365 | hOCR was originally developed by Thomas Breuel.
1366 | 
1367 | See the [releases](https://github.com/kba/hocr-spec/releases/) and full [commit
1368 | history](https://github.com/kba/hocr-spec/commits/) for a revision history.
1369 | 
1370 | 
1371 | Appendix B: Sample Usage {#sample-usage}
1372 | ========================
1373 | 
1374 | See also the [hocr-tools](https://github.com/tmbdev/hocr-tools) for more samples.
1375 | 
1376 | The HTML format described here may seem fairly complicated and difficult to
1377 | parse, but because there are lots of tools for manipulating HTML documents,
1378 | they're actually pretty easy to manipulate. Here are some examples:
1379 | 
1380 | ```python
1381 | import libxml2,re,os,string
1382 | 
1383 | # convert the HTML to XHTML (if necessary)
1384 | os.system("tidy -q -asxhtml < page.html > page.xhtml 2> /dev/null")
1385 | 
1386 | # parse the XML
1387 | doc = libxml2.parseFile('page.xhtml')
1388 | 
1389 | # search all nodes having a class of ocr_line
1390 | lines = doc.xpathEval("//*[@class='ocr_line']")
1391 | 
1392 | # a function for extracting the text from a node
1393 | def get_text(node):
1394 |     textnodes = node.xpathEval(".//text()")
1395 |     s = string.join([node.getContent() for node in textnodes])
1396 |     return re.sub(r'\s+',' ',s)
1397 | 
1398 | # a function for extracting the bbox property from a node
1399 | # note that the title= attribute on a node with an ocr_ class must
1400 | # conform with the OCR spec
1401 | 
1402 | def get_bbox(node):
1403 |     data = node.prop('title')
1404 |     bboxre = re.compile(r'\bbbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)')
1405 |     return [int(x) for x in bboxre.search(data).groups()]
1406 | 
1407 | # this extracts all the bounding boxes and the text they contain
1408 | # it doesn't matter what other markup the line node may contain
1409 | for line in lines:
1410 |     print get_bbox(line),get_text(line)
1411 | ```
1412 | 
1413 | Note that the OCR markup, basic HTML markup, and semantic markup can co-exist
1414 | within the same HTML file without interfering with one another.
1415 | 
1416 | Appendix C: IANA Considerations {#iana}
1417 | ===============================
1418 | 
1419 | Issue: [XML namespace for hOCR HTML?](https://github.com/kba/hocr-spec/issues/2)
1420 | 
1421 | Issue: [What DOCTYPE for hOCR HTML?](https://github.com/kba/hocr-spec/issues/1)
1422 | 
1423 | Media Type {#media-type}
1424 | ----------
1425 | 
1426 | In accordance with [[RFC4289]]
1427 | 
1428 | Issue: [correct MIME type for hOCR?](https://github.com/kba/hocr-spec/issues/27)
1429 | 
1430 |   : MIME media type name:
1431 |   :: `text`
1432 |   : MIME subtype name:
1433 |   :: `vnd.hocr+html`
1434 |   : Required parameters:
1435 |   : Optional parameters:
1436 |   : Encoding considerations:
1437 |   :: hOCR documents should be encoded as UTF-8
1438 |   : Security considerations:
1439 |   : Interoperability considerations:
1440 |   : Applications which use this media type:
1441 |   : File extension(s):
1442 |   :: `*.html`, `*.hocr`
1443 | 
1444 | 
1445 | 
1446 | <!--
1447 | vim: tw=80 sw=2 ts=2 et
1448 | -->
1449 | 


--------------------------------------------------------------------------------
/1.2/templates/element:
--------------------------------------------------------------------------------
 1 | <dl class="def">
 2 | 
 3 |   <dt><a lt="Element Name">Name</a></dt>
 4 |   {% if deprecated %}
 5 |   <dd><del>{{ name }}</del> (Deprecated)</dd>
 6 |   {% else %}
 7 |   <dd><a element>{{name}}</a></dd>
 8 |   {% endif %}
 9 | 
10 | {% if recommended_tags %}
11 |   <dt><a>Recommended HTML Tags</a></dt>
12 |   <dd>{% for tag in recommended_tags %}<a element>{{ tag }}</a> {% if not loop.last %}, {% endif %} {% endfor %}</dd>
13 | {% endif %}
14 | 
15 | {% if categories %}
16 |   <dt><a lt="Element Categories">Categories</a></dt>
17 |   <dd>{% for category in categories %}<a>{{ category }} Elements</a> {% if not loop.last %}, {% endif %} {% endfor %}</dd>
18 | {% endif %}
19 | 
20 | {% if properties %}
21 |     <dt><a href="#rel-elem-prop">Properties</a></dt>
22 |     <dd>
23 |         <dl style="margin-top: 2px">
24 |         {% for cat in ['required', 'recommended', 'allowed'] %}
25 |             {% if properties[cat] and properties[cat]|length %}
26 |             <dt style="display: inline; margin: 0"><a lt="{{ cat|capitalize }} Properties">{{ cat|capitalize }}</a>:</dt>
27 |             <dd style="display: inline; margin: 0">{% for prop in properties[cat] %}
28 |                 '{{ prop }}'{% if not loop.last %},{% endif %} {% endfor %}</dd>
29 |             {% endif %}
30 |         <div></div>
31 |         {% endfor %}
32 |         </dl>
33 |     </dd>
34 | {% endif %}
35 | 
36 | 
37 | </dl>
38 | 


--------------------------------------------------------------------------------
/1.2/templates/property:
--------------------------------------------------------------------------------
 1 | <dl class=def>
 2 | 
 3 |   : <a lt="Property Name">Name</a>
 4 |   :: {{ name }}
 5 | 
 6 | {% if categories %}
 7 |   : <a lt="Property Categories">Categories</a>
 8 |   :: {% for cat in categories %}<a lt="{{ cat }} Properties">{{ cat }}</a>{% if not loop.last %}, {% endif %} {% endfor %}
 9 | {% endif %}
10 | 
11 | {% if related %}
12 |   : <a lt="Related Properties">Related</a>
13 |   :: {% for prop in related %}<a property>{{ prop }}</a>{% if not loop.last %}, {% endif %} {% endfor %}
14 | {% endif %}
15 | 
16 | {% if conflicting %}
17 |   : <a lt="Conflicting Properties">Conflicting</a>
18 |   :: {% for prop in conflicting %}<a property>{{ prop }}</a>{% if not loop.last %}, {% endif %} {% endfor %}
19 | {% endif %}
20 | 
21 | {% if implied %}
22 |   : <a lt="Implied Properties">Implied</a>
23 |   :: {% for prop in implied %}<a property>{{ prop }}</a>{% if not loop.last %}, {% endif %} {% endfor %}
24 | {% endif %}
25 | 
26 |   : <a lt="Property Grammar">Grammar</a>
27 |   :: <pre data-dfn-type="grammar" data-link-type="grammar" class=highlight>
28 |     <a>property-name</a> = "{{ name }}"
29 |     {{ grammar|indent(4) }}
30 |     </pre>
31 | 
32 | {% if example %}
33 |   : Example
34 |   :: <pre style='display:block;padding: .5em' highlight=c>
35 |     {{ example|indent(4) }}
36 |     </pre>
37 | {% endif %}
38 | 
39 | {% if default %}
40 |   : <a>Default Value</a>
41 |   :: <code style='display:block;padding: .5em' highlight=c>{{ name }} {{ default }}</code>
42 | {% endif %}
43 | 
44 | </dl>
45 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds $(VERSION)/index.html from the spec sources using bikeshed.
 2 | # Usage: make [VERSION=<VERSION>]
 3 | VERSION := 1.2
 4 | 
 5 | SPEC_BIBLIO = biblio.json
 6 | SPEC_BEFORE = $(VERSION)/spec.before.html
 7 | SPEC_AFTER = $(VERSION)/spec.after.html
 8 | SPEC_MD = $(VERSION)/spec.md
 9 | SPEC_BS = $(VERSION)/index.bs
10 | SPEC_HTML = $(VERSION)/index.html
11 | 
12 | # First of 'bikeshed' / 'docker' that the shell can find; empty if neither is.
13 | BIKESHED = $(shell for cmd in bikeshed docker;do type >/dev/null 2>&1 $$cmd && echo $$cmd && break;done)
14 | BIKESHED_ARGS = -f
15 | BIKESHED_SPEC_ARGS =
16 | 
17 | # A single generated definition file is used as the make target that stands
18 | # for the whole gen-defs.py run.
19 | SPEC_DEFS = $(VERSION)/include/defs/bbox
20 | SPEC_DEFS_YML = $(VERSION)/defs.yml
21 | SPEC_DEFS_TEMPLATES = $(shell find $(VERSION)/templates/ -type f)
22 | GEN_DEFS = python3 gen-defs.py
23 | 
24 | # Run bikeshed (locally or through docker) on the assembled index.bs, then
25 | # remove index.bs again since it is only an intermediate file.
26 | $(SPEC_HTML): $(SPEC_BS)
27 | 	@case "$(BIKESHED)" in \
28 | 		bikeshed) bikeshed $(BIKESHED_ARGS) spec $(BIKESHED_SPEC_ARGS) $(SPEC_BS) ;; \
29 | 		docker)   docker run --rm -it -v $(PWD):/data kbai/bikeshed $(BIKESHED_ARGS) spec $(BIKESHED_SPEC_ARGS) $(SPEC_BS) ;; \
30 | 		*)        echo 'Unsupported bikeshed backend "$(BIKESHED)"'; exit 1 ;; esac
31 | 	@rm -f $(SPEC_BS)
32 | 
33 | # Assemble index.bs: header, bibliography wrapped in <pre class="biblio">,
34 | # spec body, footer.
35 | $(SPEC_BS): $(SPEC_BEFORE) $(SPEC_MD) $(SPEC_BIBLIO) $(SPEC_AFTER) $(SPEC_DEFS)
36 | 	@echo 'Rebuilding spec...'
37 | 	@cat  $(SPEC_BEFORE)           > $(SPEC_BS)
38 | 	@echo '<pre class="biblio">'   >> $(SPEC_BS)
39 | 	@cat  $(SPEC_BIBLIO)           >> $(SPEC_BS)
40 | 	@echo '</pre>'                 >> $(SPEC_BS)
41 | 	@cat  $(SPEC_MD) $(SPEC_AFTER) >> $(SPEC_BS)
42 | 
43 | # Regenerate the definition snippets when defs.yml or any template changes.
44 | $(SPEC_DEFS): $(SPEC_DEFS_YML) $(SPEC_DEFS_TEMPLATES)
45 | 	@$(GEN_DEFS) --basepath $(VERSION)
46 | 
47 | # 'clean' names an action, not a file: declare it phony so that it still runs
48 | # if a file called 'clean' ever exists in this directory.
49 | .PHONY: clean
50 | clean:
51 | 	$(RM) $(SPEC_HTML) $(SPEC_BS)
52 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | hocr-spec
 2 | =========
 3 | 
 4 | [![Join the chat at https://gitter.im/kba/hocr-spec](https://badges.gitter.im/kba/hocr-spec.svg)](https://gitter.im/kba/hocr-spec?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 5 | 
 6 | The hOCR Embedded OCR Workflow and Output Format
 7 | 
 8 | ## About
 9 | 
10 | This repository contains the [hOCR](https://en.wikipedia.org/wiki/HOCR) format
11 | specification originally written by [Thomas Breuel](https://github.com/tmbdev).
12 | 
13 | ## Versions
14 | 
15 | * `1.0` [English](https://docs.google.com/document/d/1QQnIQtvdAC_8n92-LhwPcjtAUFwBlzE8EWnKAxlgVf0/preview)
16 |   * Google Doc, the original text by @tmbdev
17 |   * Last substantial edit in May 2010
18 | * `1.1` [English](./1.1/spec.md), [中文 (Chinese)](./1.1/spec_zh_CN.md)
19 |   * Port of the Google Doc
20 |   * Cleaning obvious errata (duplicate content)
21 |   * More fine-grained heading structure
22 |   * Table of contents
23 |   * Chinese translation provided by [@littlePP24](https://github.com/littlePP24) and [@wanghaisheng](https://github.com/wanghaisheng)
24 |   * Last substantial edit in September 2016
25 | * `1.2` [English](https://kba.github.io/hocr-spec/1.2/)
26 |   * Create a WHATWG-like spec using [bikeshed](https://github.com/tabatkins/bikeshed)
27 |   * Add issues where appropriate
28 |   * Semantically backwards-compatible with both 1.0 and 1.1
29 | 
30 | ## Contribute
31 | 
32 | There is no formal body. Feel free to use the [GitHub
33 | issues](https://github.com/kba/hocr-spec/issues) for discussion and questions.
34 | Pull requests are very welcome.
35 | 
36 | For quick questions you can use the [hocr-spec gitter
37 | channel](https://gitter.im/kba/hocr-spec).
38 | 
39 | ## Building the spec
40 | 
41 | To build the spec, you will need to have installed:
42 |   * `GNU make` 
43 |   * One of the following programs installed:
44 |     * [bikeshed](https://github.com/tabatkins/bikeshed)
45 |     * [docker](https://docker.com)
46 |   * Python 3
47 | 
48 | To install the python requirements:
49 | 
50 | ```sh
51 | pip3 install --user -r requirements.txt
52 | ```
53 | 
54 | The Makefile will first look for a local bikeshed installation and fall back to docker
55 | to use the [bikeshed docker container](https://hub.docker.com/r/kbai/bikeshed)
56 | to build the spec.
57 | 
58 | To change the spec, adapt
59 |   * `<VERSION>/spec.md` to change the body of the spec
60 |   * `<VERSION>/spec.before.html` to change
61 |     * the [bikeshed metadata](https://tabatkins.github.io/bikeshed/#metadata)
62 |     * the [references to terms from other specs](https://tabatkins.github.io/bikeshed/#custom-dfns)
63 |   * `<VERSION>/spec.after.html` to change
64 |     * Javascript to run in the generated spec document
65 |   * `<VERSION>/defs.yml` to change the definition lists for elements and properties
66 | 
67 | Then run `make VERSION=<VERSION>` to build that spec.
68 | 
69 | Examples:
70 |   * To build the `1.2` version: `make VERSION=1.2` or simply `make`
71 |   * To build the `1.2-zh` version: `make VERSION=1.2-zh`
72 | 
73 | ## Open Tasks
74 | 
75 | The goal of this project is to make the hOCR specification more accessible and
76 | easier to maintain.
77 | 
78 | * Cross-reference other specs
79 | * Harmonize style
80 | * Add samples
81 | * [...](https://github.com/kba/hocr-spec/issues)
82 | 


--------------------------------------------------------------------------------
/biblio.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "CML": {
 3 |     "authors": ["Peter Murray-Rust", "Henry Rzepa"],
 4 |     "href": "http://www.xml-cml.org/",
 5 |     "title": "Chemical Markup Language - CML"
 6 |   },
 7 |   "XDOC": {
 8 |     "authors": ["Daniel S. Connelly", "Beth Paddock", "Rebecca Harvey"],
 9 |     "date": "May 1999",
10 |     "href": "https://web.archive.org/web/20160731161638/http://vividata.com/manuals/core12xdc.pdf",
11 |     "title": "XDOC DATA FORMAT. Technical Specification"
12 |   }
13 |  }
14 | 


--------------------------------------------------------------------------------
/gen-defs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | from jinja2 import Environment, FileSystemLoader, Template
 4 | import sys
 5 | import argparse
 6 | import yaml
 7 | import os
 8 | 
 9 | 
10 | class DefGenerator:
11 | 
12 |     def __init__(self, basepath, outputdir=None, defs_yml=None, templatedir=None):
13 |         if not templatedir: templatedir = "{0}/templates".format(basepath)
14 |         if not defs_yml: defs_yml = "{0}/defs.yml".format(args.basepath)
15 |         if not outputdir: outputdir = "{0}/include/defs".format(basepath)
16 |         self.outputdir = outputdir
17 |         env = Environment(loader=FileSystemLoader(templatedir),
18 |                           lstrip_blocks=True, trim_blocks=True)
19 |         self.templates = {}
20 |         for name in ['property', 'element']:
21 |             self.templates[name] = env.get_template(name)
22 |         with open(defs_yml) as f:
23 |             self.specs = yaml.load(f)
24 |         os.makedirs(self.outputdir, exist_ok=True)
25 | 
26 |     def generate(self):
27 |         for cat in self.templates:
28 |             sys.stderr.write("[{0}]\n\t".format(cat))
29 |             for name in self.specs[cat]:
30 |                 definition = self.specs[cat][name]
31 |                 definition['name'] = name
32 |                 fname = "{0}/{1}".format(self.outputdir, name)
33 |                 with open(fname, 'w') as f:
34 |                     sys.stderr.write(name + " ")
35 |                     f.write(self.templates[cat].render(definition))
36 |             sys.stderr.write("\n")
37 | 
38 | if __name__ == '__main__':
39 |     parser = argparse.ArgumentParser()
40 |     parser.add_argument('--basepath', default=os.getcwd(),
41 |             help='Path for defs.yml and templates/. Default: %(default)s')
42 |     parser.add_argument('--defs_yml',
43 |             help='Definitions YAML. Default: [basepath]/defs.yml')
44 |     parser.add_argument('--templatedir',
45 |             help='Templates directory. Default: [basepath]/templates')
46 |     parser.add_argument('--outputdir',
47 |             help='Output directory. Default: [basepath]/include/defs')
48 |     args = parser.parse_args()
49 |     generator = DefGenerator(**vars(args))
50 |     generator.generate()
51 | 


--------------------------------------------------------------------------------
/hocr-spec.md:
--------------------------------------------------------------------------------
1 | 1.1/spec.md


--------------------------------------------------------------------------------
/images/baseline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kba/hocr-spec/70e12781939644370a2a72c509449c475caeb604/images/baseline.png


--------------------------------------------------------------------------------
/images/bbox-crop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kba/hocr-spec/70e12781939644370a2a72c509449c475caeb604/images/bbox-crop.png


--------------------------------------------------------------------------------
/images/bbox.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kba/hocr-spec/70e12781939644370a2a72c509449c475caeb604/images/bbox.odg


--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <title>Redirecting to Latest version</title>
 5 |   </head>
 6 |   <body>
 7 |     <script>
 8 |       window.location.replace("1.2" + window.location.hash);
 9 |     </script>
10 |   </body>
11 | </html>
12 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Jinja2>=2.8
2 | pyyaml>=3.1.12
3 | 


--------------------------------------------------------------------------------