├── LICENSE ├── README.md ├── lex.go ├── name.go ├── page.go ├── pdfpasswd └── main.go ├── ps.go ├── read.go └── text.go /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PDF Reader 2 | 3 | A simple Go library which enables reading PDF files. 4 | 5 | Features 6 | - Get plain text content (without format) 7 | - Get Content (including all font and formatting information) 8 | 9 | ## Install: 10 | 11 | `go get -u github.com/rsc/pdf` 12 | 13 | 14 | ## Read plain text 15 | 16 | ```golang 17 | package main 18 | 19 | import ( 20 | "bytes" 21 | "fmt" 22 | 23 | "github.com/rsc/pdf" 24 | ) 25 | 26 | func main() { 27 | content, err := readPdf("test.pdf") // Read local pdf file 28 | if err != nil { 29 | panic(err) 30 | } 31 | fmt.Println(content) 32 | return 33 | } 34 | 35 | func readPdf(path string) (string, error) { 36 | r, err := pdf.Open(path) 37 | if err != nil { 38 | return "", err 39 | } 40 | 41 | var buf bytes.Buffer 42 | buf.ReadFrom(p.GetPlainText()) 43 | return buf.String(), nil 44 | } 45 | ``` 46 | 47 | ## Read all text with styles from PDF 48 | 49 | ```golang 50 | func readPdf2(path string) (string, error) { 51 | r, err := pdf.Open(path) 52 | if err != nil { 53 | return "", err 54 | } 55 | totalPage := r.NumPage() 56 | 57 | for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { 58 | p := r.Page(pageIndex) 59 | if p.V.IsNull() { 60 | continue 61 | } 62 | var lastTextStyle pdf.Text 63 | texts := p.Content().Text 64 | for _, text := range texts { 65 | if isSameSentence(text, lastTextStyle) { 66 | lastTextStyle.S = lastTextStyle.S + text.S 67 | } else { 68 | fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S) 69 | lastTextStyle = text 70 | } 71 | } 72 | } 73 | return "", nil 74 | } 75 | ``` 76 | 77 | ## Demo 78 | ![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif) -------------------------------------------------------------------------------- /lex.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Reading of PDF tokens and objects from a raw byte stream. 6 | 7 | package pdf 8 | 9 | import ( 10 | "io" 11 | "strconv" 12 | 13 | "github.com/pkg/errors" 14 | ) 15 | 16 | // A token is a PDF token in the input stream, one of the following Go types: 17 | // 18 | // bool, a PDF boolean 19 | // int64, a PDF integer 20 | // float64, a PDF real 21 | // string, a PDF string literal 22 | // keyword, a PDF keyword 23 | // name, a PDF name without the leading slash 24 | // 25 | type token interface{} 26 | 27 | // A name is a PDF name, without the leading slash. 28 | type name string 29 | 30 | // A keyword is a PDF keyword. 31 | // Delimiter tokens used in higher-level syntax, 32 | // such as "<<", ">>", "[", "]", "{", "}", are also treated as keywords. 33 | type keyword string 34 | 35 | // A buffer holds buffered input bytes from the PDF file. 36 | type buffer struct { 37 | r io.Reader // source of data 38 | buf []byte // buffered data 39 | pos int // read index in buf 40 | offset int64 // offset at end of buf; aka offset of next read 41 | tmp []byte // scratch space for accumulating token 42 | unread []token // queue of read but then unread tokens 43 | allowEOF bool 44 | allowObjptr bool 45 | allowStream bool 46 | eof bool 47 | key []byte 48 | useAES bool 49 | objptr objptr 50 | } 51 | 52 | // newBuffer returns a new buffer reading from r at the given offset. 53 | func newBuffer(r io.Reader, offset int64) *buffer { 54 | return &buffer{ 55 | r: r, 56 | offset: offset, 57 | buf: make([]byte, 0, 4096), 58 | allowObjptr: true, 59 | allowStream: true, 60 | } 61 | } 62 | 63 | func (b *buffer) seek(offset int64) { 64 | b.offset = offset 65 | b.buf = b.buf[:0] 66 | b.pos = 0 67 | b.unread = b.unread[:0] 68 | } 69 | 70 | func (b *buffer) readByte() (byte, error) { 71 | if b.pos >= len(b.buf) { 72 | if _, err := b.reload(); err != nil { 73 | return '\x00', err 74 | } 75 | if b.pos >= len(b.buf) { 76 | return '\n', nil 77 | } 78 | } 79 | c := b.buf[b.pos] 80 | b.pos++ 81 | return c, nil 82 | } 83 | 84 | func (b *buffer) reload() (bool, error) { 85 | n := cap(b.buf) - int(b.offset%int64(cap(b.buf))) 86 | n, err := b.r.Read(b.buf[:n]) 87 | if n == 0 && err != nil { 88 | b.buf = b.buf[:0] 89 | b.pos = 0 90 | if b.allowEOF && err == io.EOF { 91 | b.eof = true 92 | return false, nil 93 | } 94 | return false, errors.Errorf("malformed PDF: reading at offset %d: %v", b.offset, err) 95 | } 96 | b.offset += int64(n) 97 | b.buf = b.buf[:n] 98 | b.pos = 0 99 | return true, nil 100 | } 101 | 102 | func (b *buffer) seekForward(offset int64) { 103 | for b.offset < offset { 104 | if ok, _ := b.reload(); !ok { 105 | return 106 | } 107 | } 108 | b.pos = len(b.buf) - int(b.offset-offset) 109 | } 110 | 111 | func (b *buffer) readOffset() int64 { 112 | return b.offset - int64(len(b.buf)) + int64(b.pos) 113 | } 114 | 115 | func (b *buffer) unreadByte() { 116 | if b.pos > 0 { 117 | b.pos-- 118 | } 119 | } 120 | 121 | func (b *buffer) unreadToken(t token) { 122 | b.unread = append(b.unread, t) 123 | } 124 | 125 | func (b *buffer) readToken() token { 126 | if n := len(b.unread); n > 0 { 127 | t := b.unread[n-1] 128 | b.unread = b.unread[:n-1] 129 | return t 130 | } 131 | 132 | // Find first non-space, non-comment byte. 133 | c, err := b.readByte() 134 | for { 135 | if err != nil { 136 | return err 137 | } 138 | if isSpace(c) { 139 | if b.eof { 140 | return io.EOF 141 | } 142 | c, err = b.readByte() 143 | } else if c == '%' { 144 | for c != '\r' && c != '\n' { 145 | c, err = b.readByte() 146 | } 147 | } else { 148 | break 149 | } 150 | } 151 | 152 | switch c { 153 | case '<': 154 | if b, _ := b.readByte(); b == '<' { 155 | return keyword("<<") 156 | } 157 | b.unreadByte() 158 | return b.readHexString() 159 | 160 | case '(': 161 | return b.readLiteralString() 162 | 163 | case '[', ']', '{', '}': 164 | return keyword(string(c)) 165 | 166 | case '/': 167 | return b.readName() 168 | 169 | case '>': 170 | if b, _ := b.readByte(); b == '>' { 171 | return keyword(">>") 172 | } 173 | b.unreadByte() 174 | fallthrough 175 | 176 | default: 177 | if isDelim(c) { 178 | return errors.Errorf("unexpected delimiter %#q", rune(c)) 179 | } 180 | b.unreadByte() 181 | return b.readKeyword() 182 | } 183 | } 184 | 185 | func (b *buffer) readHexString() token { 186 | tmp := b.tmp[:0] 187 | for { 188 | Loop: 189 | c, err := b.readByte() 190 | if err != nil { 191 | return err 192 | } 193 | if c == '>' { 194 | break 195 | } 196 | if isSpace(c) { 197 | goto Loop 198 | } 199 | Loop2: 200 | c2, err := b.readByte() 201 | if err != nil { 202 | return err 203 | } 204 | if isSpace(c2) { 205 | goto Loop2 206 | } 207 | x := unhex(c)<<4 | unhex(c2) 208 | if x < 0 { 209 | return errors.Errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:]) 210 | } 211 | tmp = append(tmp, byte(x)) 212 | } 213 | b.tmp = tmp 214 | return string(tmp) 215 | } 216 | 217 | func unhex(b byte) int { 218 | switch { 219 | case '0' <= b && b <= '9': 220 | return int(b) - '0' 221 | case 'a' <= b && b <= 'f': 222 | return int(b) - 'a' + 10 223 | case 'A' <= b && b <= 'F': 224 | return int(b) - 'A' + 10 225 | } 226 | return -1 227 | } 228 | 229 | func (b *buffer) readLiteralString() token { 230 | tmp := b.tmp[:0] 231 | depth := 1 232 | Loop: 233 | for { 234 | c, err := b.readByte() 235 | if err != nil { 236 | return err 237 | } 238 | switch c { 239 | default: 240 | tmp = append(tmp, c) 241 | case '(': 242 | depth++ 243 | tmp = append(tmp, c) 244 | case ')': 245 | if depth--; depth == 0 { 246 | break Loop 247 | } 248 | tmp = append(tmp, c) 249 | case '\\': 250 | if c, err = b.readByte(); err != nil { 251 | return err 252 | } 253 | switch c { 254 | default: 255 | return errors.Errorf("invalid escape sequence \\%c", c) 256 | case 'n': 257 | tmp = append(tmp, '\n') 258 | case 'r': 259 | tmp = append(tmp, '\r') 260 | case 'b': 261 | tmp = append(tmp, '\b') 262 | case 't': 263 | tmp = append(tmp, '\t') 264 | case 'f': 265 | tmp = append(tmp, '\f') 266 | case '(', ')', '\\': 267 | tmp = append(tmp, c) 268 | case '\r': 269 | if c, _ := b.readByte(); c != '\n' { 270 | b.unreadByte() 271 | } 272 | fallthrough 273 | case '\n': 274 | // no append 275 | case '0', '1', '2', '3', '4', '5', '6', '7': 276 | x := int(c - '0') 277 | for i := 0; i < 2; i++ { 278 | c, err = b.readByte() 279 | if err != nil { 280 | return err 281 | } 282 | if c < '0' || c > '7' { 283 | b.unreadByte() 284 | break 285 | } 286 | x = x*8 + int(c-'0') 287 | } 288 | if x > 255 { 289 | return errors.Errorf("invalid octal escape \\%03o", x) 290 | } 291 | tmp = append(tmp, byte(x)) 292 | } 293 | } 294 | } 295 | b.tmp = tmp 296 | return string(tmp) 297 | } 298 | 299 | func (b *buffer) readName() token { 300 | tmp := b.tmp[:0] 301 | for { 302 | c, err := b.readByte() 303 | if err != nil { 304 | return err 305 | } 306 | if isDelim(c) || isSpace(c) { 307 | b.unreadByte() 308 | break 309 | } 310 | if c == '#' { 311 | hi, err := b.readByte() 312 | if err != nil { 313 | return err 314 | } 315 | lo, err := b.readByte() 316 | if err != nil { 317 | return err 318 | } 319 | x := unhex(hi)<<4 | unhex(lo) 320 | if x < 0 { 321 | return errors.Errorf("malformed name") 322 | } 323 | tmp = append(tmp, byte(x)) 324 | continue 325 | } 326 | tmp = append(tmp, c) 327 | } 328 | b.tmp = tmp 329 | return name(string(tmp)) 330 | } 331 | 332 | func (b *buffer) readKeyword() token { 333 | tmp := b.tmp[:0] 334 | for { 335 | c, err := b.readByte() 336 | if err != nil { 337 | return err 338 | } 339 | if isDelim(c) || isSpace(c) { 340 | b.unreadByte() 341 | break 342 | } 343 | tmp = append(tmp, c) 344 | } 345 | b.tmp = tmp 346 | s := string(tmp) 347 | switch { 348 | case s == "true": 349 | return true 350 | case s == "false": 351 | return false 352 | case isInteger(s): 353 | x, err := strconv.ParseInt(s, 10, 64) 354 | if err != nil { 355 | return errors.Errorf("invalid integer %s", s) 356 | } 357 | return x 358 | case isReal(s): 359 | x, err := strconv.ParseFloat(s, 64) 360 | if err != nil { 361 | return errors.Errorf("invalid real %s", s) 362 | } 363 | return x 364 | } 365 | return keyword(string(tmp)) 366 | } 367 | 368 | func isInteger(s string) bool { 369 | if len(s) > 0 && (s[0] == '+' || s[0] == '-') { 370 | s = s[1:] 371 | } 372 | if len(s) == 0 { 373 | return false 374 | } 375 | for _, c := range s { 376 | if c < '0' || '9' < c { 377 | return false 378 | } 379 | } 380 | return true 381 | } 382 | 383 | func isReal(s string) bool { 384 | if len(s) > 0 && (s[0] == '+' || s[0] == '-') { 385 | s = s[1:] 386 | } 387 | if len(s) == 0 { 388 | return false 389 | } 390 | ndot := 0 391 | for _, c := range s { 392 | if c == '.' { 393 | ndot++ 394 | continue 395 | } 396 | if c < '0' || '9' < c { 397 | return false 398 | } 399 | } 400 | return ndot == 1 401 | } 402 | 403 | // An object is a PDF syntax object, one of the following Go types: 404 | // 405 | // bool, a PDF boolean 406 | // int64, a PDF integer 407 | // float64, a PDF real 408 | // string, a PDF string literal 409 | // name, a PDF name without the leading slash 410 | // dict, a PDF dictionary 411 | // array, a PDF array 412 | // stream, a PDF stream 413 | // objptr, a PDF object reference 414 | // objdef, a PDF object definition 415 | // 416 | // An object may also be nil, to represent the PDF null. 417 | type object interface{} 418 | 419 | type dict map[name]object 420 | 421 | type array []object 422 | 423 | type stream struct { 424 | hdr dict 425 | ptr objptr 426 | offset int64 427 | } 428 | 429 | type objptr struct { 430 | id uint32 431 | gen uint16 432 | } 433 | 434 | type objdef struct { 435 | ptr objptr 436 | obj object 437 | } 438 | 439 | func (b *buffer) readObject() object { 440 | tok := b.readToken() 441 | if kw, ok := tok.(keyword); ok { 442 | switch kw { 443 | case "null": 444 | return nil 445 | case "<<": 446 | return b.readDict() 447 | case "[": 448 | return b.readArray() 449 | } 450 | return errors.Errorf("unexpected keyword %q parsing object", kw) 451 | } 452 | 453 | if str, ok := tok.(string); ok && b.key != nil && b.objptr.id != 0 { 454 | var err error 455 | tok, err = decryptString(b.key, b.useAES, b.objptr, str) 456 | if err != nil { 457 | return err 458 | } 459 | } 460 | 461 | if !b.allowObjptr { 462 | return tok 463 | } 464 | 465 | if t1, ok := tok.(int64); ok && int64(uint32(t1)) == t1 { 466 | tok2 := b.readToken() 467 | if t2, ok := tok2.(int64); ok && int64(uint16(t2)) == t2 { 468 | tok3 := b.readToken() 469 | switch tok3 { 470 | case keyword("R"): 471 | return objptr{uint32(t1), uint16(t2)} 472 | case keyword("obj"): 473 | old := b.objptr 474 | b.objptr = objptr{uint32(t1), uint16(t2)} 475 | obj := b.readObject() 476 | if _, ok := obj.(stream); !ok { 477 | tok4 := b.readToken() 478 | if tok4 != keyword("endobj") { 479 | return errors.Errorf("missing endobj after indirect object definition") 480 | } 481 | } 482 | b.objptr = old 483 | return objdef{objptr{uint32(t1), uint16(t2)}, obj} 484 | } 485 | b.unreadToken(tok3) 486 | } 487 | b.unreadToken(tok2) 488 | } 489 | return tok 490 | } 491 | 492 | func (b *buffer) readArray() object { 493 | var x array 494 | for { 495 | tok := b.readToken() 496 | if tok == nil || tok == keyword("]") { 497 | break 498 | } 499 | b.unreadToken(tok) 500 | x = append(x, b.readObject()) 501 | } 502 | return x 503 | } 504 | 505 | func (b *buffer) readDict() object { 506 | x := make(dict) 507 | for { 508 | tok := b.readToken() 509 | if tok == nil || tok == keyword(">>") { 510 | break 511 | } 512 | n, ok := tok.(name) 513 | if !ok { 514 | return errors.Errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok) 515 | } 516 | x[n] = b.readObject() 517 | } 518 | 519 | if !b.allowStream { 520 | return x 521 | } 522 | 523 | tok := b.readToken() 524 | if tok != keyword("stream") { 525 | b.unreadToken(tok) 526 | return x 527 | } 528 | 529 | c, err := b.readByte() 530 | if err != nil { 531 | return err 532 | } 533 | switch c { 534 | case '\r': 535 | if x, _ := b.readByte(); x != '\n' { 536 | b.unreadByte() 537 | } 538 | case '\n': 539 | // ok 540 | default: 541 | return errors.Errorf("stream keyword not followed by newline") 542 | } 543 | 544 | return stream{x, b.objptr, b.readOffset()} 545 | } 546 | 547 | func isSpace(b byte) bool { 548 | switch b { 549 | case '\x00', '\t', '\n', '\f', '\r', ' ': 550 | return true 551 | } 552 | return false 553 | } 554 | 555 | func isDelim(b byte) bool { 556 | switch b { 557 | case '<', '>', '(', ')', '[', ']', '{', '}', '/', '%': 558 | return true 559 | } 560 | return false 561 | } 562 | -------------------------------------------------------------------------------- /page.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package pdf 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "io" 11 | "strings" 12 | 13 | "github.com/pkg/errors" 14 | ) 15 | 16 | // A Page represent a single page in a PDF file. 17 | // The methods interpret a Page dictionary stored in V. 18 | type Page struct { 19 | V Value 20 | } 21 | 22 | // Page returns the page for the given page number. 23 | // Page numbers are indexed starting at 1, not 0. 24 | // If the page is not found, Page returns a Page with p.V.IsNull(). 25 | func (r *Reader) Page(num int) Page { 26 | num-- // now 0-indexed 27 | page := r.Trailer().Key("Root").Key("Pages") 28 | Search: 29 | for page.Key("Type").Name() == "Pages" { 30 | count := int(page.Key("Count").Int64()) 31 | if count < num { 32 | return Page{} 33 | } 34 | kids := page.Key("Kids") 35 | for i := 0; i < kids.Len(); i++ { 36 | kid := kids.Index(i) 37 | if kid.Key("Type").Name() == "Pages" { 38 | c := int(kid.Key("Count").Int64()) 39 | if num < c { 40 | page = kid 41 | continue Search 42 | } 43 | num -= c 44 | continue 45 | } 46 | if kid.Key("Type").Name() == "Page" { 47 | if num == 0 { 48 | return Page{kid} 49 | } 50 | num-- 51 | } 52 | } 53 | } 54 | return Page{} 55 | } 56 | 57 | // NumPage returns the number of pages in the PDF file. 58 | func (r *Reader) NumPage() int { 59 | return int(r.Trailer().Key("Root").Key("Pages").Key("Count").Int64()) 60 | } 61 | 62 | // GetPlainText returns all the text in the PDF file 63 | func (r *Reader) GetPlainText() (io.Reader, error) { 64 | pages := r.NumPage() 65 | var buf bytes.Buffer 66 | fonts := make(map[string]*Font) 67 | for i := 1; i <= pages; i++ { 68 | p := r.Page(i) 69 | for _, name := range p.Fonts() { // cache fonts so we don't continually parse charmap 70 | if _, ok := fonts[name]; !ok { 71 | f := p.Font(name) 72 | fonts[name] = &f 73 | } 74 | } 75 | r, err := p.GetPlainText(fonts) 76 | if err != nil { 77 | return nil, err 78 | } 79 | _, err = buf.ReadFrom(r) 80 | if err != nil { 81 | return nil, err 82 | } 83 | } 84 | return &buf, nil 85 | } 86 | 87 | func (p Page) findInherited(key string) Value { 88 | for v := p.V; !v.IsNull(); v = v.Key("Parent") { 89 | if r := v.Key(key); !r.IsNull() { 90 | return r 91 | } 92 | } 93 | return Value{} 94 | } 95 | 96 | /* 97 | func (p Page) MediaBox() Value { 98 | return p.findInherited("MediaBox") 99 | } 100 | 101 | func (p Page) CropBox() Value { 102 | return p.findInherited("CropBox") 103 | } 104 | */ 105 | 106 | // Resources returns the resources dictionary associated with the page. 107 | func (p Page) Resources() Value { 108 | return p.findInherited("Resources") 109 | } 110 | 111 | // Fonts returns a list of the fonts associated with the page. 112 | func (p Page) Fonts() []string { 113 | return p.Resources().Key("Font").Keys() 114 | } 115 | 116 | // Font returns the font with the given name associated with the page. 117 | func (p Page) Font(name string) Font { 118 | return Font{p.Resources().Key("Font").Key(name), nil} 119 | } 120 | 121 | // A Font represent a font in a PDF file. 122 | // The methods interpret a Font dictionary stored in V. 123 | type Font struct { 124 | V Value 125 | enc TextEncoding 126 | } 127 | 128 | // BaseFont returns the font's name (BaseFont property). 129 | func (f Font) BaseFont() string { 130 | return f.V.Key("BaseFont").Name() 131 | } 132 | 133 | // FirstChar returns the code point of the first character in the font. 134 | func (f Font) FirstChar() int { 135 | return int(f.V.Key("FirstChar").Int64()) 136 | } 137 | 138 | // LastChar returns the code point of the last character in the font. 139 | func (f Font) LastChar() int { 140 | return int(f.V.Key("LastChar").Int64()) 141 | } 142 | 143 | // Widths returns the widths of the glyphs in the font. 144 | // In a well-formed PDF, len(f.Widths()) == f.LastChar()+1 - f.FirstChar(). 145 | func (f Font) Widths() []float64 { 146 | x := f.V.Key("Widths") 147 | var out []float64 148 | for i := 0; i < x.Len(); i++ { 149 | out = append(out, x.Index(i).Float64()) 150 | } 151 | return out 152 | } 153 | 154 | // Width returns the width of the given code point. 155 | func (f Font) Width(code int) float64 { 156 | first := f.FirstChar() 157 | last := f.LastChar() 158 | if code < first || last < code { 159 | return 0 160 | } 161 | return f.V.Key("Widths").Index(code - first).Float64() 162 | } 163 | 164 | // Encoder returns the encoding between font code point sequences and UTF-8. 165 | func (f *Font) Encoder() TextEncoding { 166 | if f.enc == nil { // caching the Encoder so we don't have to continually parse charmap 167 | f.enc = f.getEncoder() 168 | } 169 | return f.enc 170 | } 171 | 172 | func (f Font) getEncoder() TextEncoding { 173 | if !f.V.Key("ToUnicode").IsNull() { 174 | return f.charmapEncoding() 175 | } 176 | 177 | enc := f.V.Key("Encoding") 178 | switch enc.Kind() { 179 | case Name: 180 | switch enc.Name() { 181 | case "WinAnsiEncoding": 182 | return &byteEncoder{&winAnsiEncoding} 183 | case "MacRomanEncoding": 184 | return &byteEncoder{&macRomanEncoding} 185 | case "Identity-H": 186 | return f.charmapEncoding() 187 | default: 188 | println("unknown encoding", enc.Name()) 189 | return &nopEncoder{} 190 | } 191 | case Dict: 192 | return &dictEncoder{enc.Key("Differences")} 193 | case Null: 194 | return f.charmapEncoding() 195 | default: 196 | println("unexpected encoding", enc.String()) 197 | return &nopEncoder{} 198 | } 199 | } 200 | 201 | func (f *Font) charmapEncoding() TextEncoding { 202 | toUnicode := f.V.Key("ToUnicode") 203 | if toUnicode.Kind() == Stream { 204 | m, err := readCmap(toUnicode) 205 | if err != nil { 206 | return &nopEncoder{} 207 | } 208 | return m 209 | } 210 | 211 | return &byteEncoder{&pdfDocEncoding} 212 | } 213 | 214 | type dictEncoder struct { 215 | v Value 216 | } 217 | 218 | func (e *dictEncoder) Decode(raw string) (text string) { 219 | r := make([]rune, 0, len(raw)) 220 | for i := 0; i < len(raw); i++ { 221 | ch := rune(raw[i]) 222 | n := -1 223 | for j := 0; j < e.v.Len(); j++ { 224 | x := e.v.Index(j) 225 | if x.Kind() == Integer { 226 | n = int(x.Int64()) 227 | continue 228 | } 229 | if x.Kind() == Name { 230 | if int(raw[i]) == n { 231 | r := nameToRune[x.Name()] 232 | if r != 0 { 233 | ch = r 234 | break 235 | } 236 | } 237 | n++ 238 | } 239 | } 240 | r = append(r, ch) 241 | } 242 | return string(r) 243 | } 244 | 245 | // A TextEncoding represents a mapping between 246 | // font code points and UTF-8 text. 247 | type TextEncoding interface { 248 | // Decode returns the UTF-8 text corresponding to 249 | // the sequence of code points in raw. 250 | Decode(raw string) (text string) 251 | } 252 | 253 | type nopEncoder struct { 254 | } 255 | 256 | func (e *nopEncoder) Decode(raw string) (text string) { 257 | return raw 258 | } 259 | 260 | type byteEncoder struct { 261 | table *[256]rune 262 | } 263 | 264 | func (e *byteEncoder) Decode(raw string) (text string) { 265 | r := make([]rune, 0, len(raw)) 266 | for i := 0; i < len(raw); i++ { 267 | r = append(r, e.table[raw[i]]) 268 | } 269 | return string(r) 270 | } 271 | 272 | type byteRange struct { 273 | low string 274 | high string 275 | } 276 | 277 | type bfchar struct { 278 | orig string 279 | repl string 280 | } 281 | 282 | type bfrange struct { 283 | lo string 284 | hi string 285 | dst Value 286 | } 287 | 288 | type cmap struct { 289 | space [4][]byteRange // codespace range 290 | bfrange []bfrange 291 | bfchar []bfchar 292 | } 293 | 294 | func (m *cmap) Decode(raw string) (text string) { 295 | var r []rune 296 | Parse: 297 | for len(raw) > 0 { 298 | for n := 1; n <= 4 && n <= len(raw); n++ { // number of digits in character replacement (1-4 possible) 299 | for _, space := range m.space[n-1] { // find matching codespace Ranges for number of digits 300 | if space.low <= raw[:n] && raw[:n] <= space.high { // see if value is in range 301 | text := raw[:n] 302 | raw = raw[n:] 303 | for _, bfchar := range m.bfchar { // check for matching bfchar 304 | if len(bfchar.orig) == n && bfchar.orig == text { 305 | r = append(r, []rune(utf16Decode(bfchar.repl))...) 306 | continue Parse 307 | } 308 | } 309 | for _, bfrange := range m.bfrange { // check for matching bfrange 310 | if len(bfrange.lo) == n && bfrange.lo <= text && text <= bfrange.hi { 311 | if bfrange.dst.Kind() == String { 312 | s := bfrange.dst.RawString() 313 | if bfrange.lo != text { // value isn't at the beginning of the range so scale result 314 | b := []byte(s) 315 | b[len(b)-1] += text[len(text)-1] - bfrange.lo[len(bfrange.lo)-1] // increment last byte by difference 316 | s = string(b) 317 | } 318 | r = append(r, []rune(utf16Decode(s))...) 319 | continue Parse 320 | } 321 | if bfrange.dst.Kind() == Array { 322 | fmt.Printf("array %v\n", bfrange.dst) 323 | } else { 324 | fmt.Printf("unknown dst %v\n", bfrange.dst) 325 | } 326 | r = append(r, noRune) 327 | continue Parse 328 | } 329 | } 330 | r = append(r, noRune) 331 | continue Parse 332 | } 333 | } 334 | } 335 | println("no code space found") 336 | r = append(r, noRune) 337 | raw = raw[1:] 338 | } 339 | return string(r) 340 | } 341 | 342 | func readCmap(toUnicode Value) (*cmap, error) { 343 | n := -1 344 | var m cmap 345 | err := Interpret(toUnicode, func(stk *Stack, op string) error { 346 | switch op { 347 | case "findresource": 348 | stk.Pop() // category 349 | stk.Pop() // key 350 | stk.Push(newDict()) 351 | case "begincmap": 352 | stk.Push(newDict()) 353 | case "endcmap": 354 | stk.Pop() 355 | case "begincodespacerange": 356 | n = int(stk.Pop().Int64()) 357 | case "endcodespacerange": 358 | if n < 0 { 359 | return errors.New("missing begincodespacerange") 360 | } 361 | for i := 0; i < n; i++ { 362 | hi, lo := stk.Pop().RawString(), stk.Pop().RawString() 363 | if len(lo) == 0 || len(lo) != len(hi) { 364 | return errors.New("bad codespace range") 365 | } 366 | m.space[len(lo)-1] = append(m.space[len(lo)-1], byteRange{lo, hi}) 367 | } 368 | n = -1 369 | case "beginbfchar": 370 | n = int(stk.Pop().Int64()) 371 | case "endbfchar": 372 | if n < 0 { 373 | return errors.New("missing beginbfchar") 374 | } 375 | for i := 0; i < n; i++ { 376 | repl, orig := stk.Pop().RawString(), stk.Pop().RawString() 377 | m.bfchar = append(m.bfchar, bfchar{orig, repl}) 378 | } 379 | case "beginbfrange": 380 | n = int(stk.Pop().Int64()) 381 | case "endbfrange": 382 | if n < 0 { 383 | return errors.New("missing beginbfrange") 384 | } 385 | for i := 0; i < n; i++ { 386 | dst, srcHi, srcLo := stk.Pop(), stk.Pop().RawString(), stk.Pop().RawString() 387 | m.bfrange = append(m.bfrange, bfrange{srcLo, srcHi, dst}) 388 | } 389 | case "defineresource": 390 | stk.Pop().Name() // category 391 | value := stk.Pop() 392 | stk.Pop().Name() // key 393 | stk.Push(value) 394 | default: 395 | println("interp\t", op) 396 | } 397 | return nil 398 | }) 399 | if err != nil { 400 | return nil, err 401 | } 402 | return &m, err 403 | } 404 | 405 | type matrix [3][3]float64 406 | 407 | var ident = matrix{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}} 408 | 409 | func (x matrix) mul(y matrix) matrix { 410 | var z matrix 411 | for i := 0; i < 3; i++ { 412 | for j := 0; j < 3; j++ { 413 | for k := 0; k < 3; k++ { 414 | z[i][j] += x[i][k] * y[k][j] 415 | } 416 | } 417 | } 418 | return z 419 | } 420 | 421 | // A Text represents a single piece of text drawn on a page. 422 | type Text struct { 423 | Font string // the font used 424 | FontSize float64 // the font size, in points (1/72 of an inch) 425 | X float64 // the X coordinate, in points, increasing left to right 426 | Y float64 // the Y coordinate, in points, increasing bottom to top 427 | W float64 // the width of the text, in points 428 | S string // the actual UTF-8 text 429 | } 430 | 431 | // A Rect represents a rectangle. 432 | type Rect struct { 433 | Min, Max Point 434 | } 435 | 436 | // A Point represents an X, Y pair. 437 | type Point struct { 438 | X float64 439 | Y float64 440 | } 441 | 442 | // Content describes the basic content on a page: the text and any drawn rectangles. 443 | type Content struct { 444 | Text []Text 445 | Rect []Rect 446 | } 447 | 448 | type gstate struct { 449 | Tc float64 450 | Tw float64 451 | Th float64 452 | Tl float64 453 | Tf Font 454 | Tfs float64 455 | Tmode int 456 | Trise float64 457 | Tm matrix 458 | Tlm matrix 459 | Trm matrix 460 | CTM matrix 461 | } 462 | 463 | // GetPlainText returns the page's all text without format. 464 | // fonts can be passed in (to improve parsing performance) or left nil 465 | func (p Page) GetPlainText(fonts map[string]*Font) (io.Reader, error) { 466 | strm := p.V.Key("Contents") 467 | var enc TextEncoding = &nopEncoder{} 468 | 469 | if fonts == nil { 470 | fonts = make(map[string]*Font) 471 | for _, font := range p.Fonts() { 472 | f := p.Font(font) 473 | fonts[font] = &f 474 | } 475 | } 476 | 477 | var textBuilder bytes.Buffer 478 | showText := func(s string) { 479 | for _, ch := range enc.Decode(s) { 480 | textBuilder.WriteRune(ch) 481 | } 482 | } 483 | 484 | err := Interpret(strm, func(stk *Stack, op string) error { 485 | n := stk.Len() 486 | args := make([]Value, n) 487 | for i := n - 1; i >= 0; i-- { 488 | args[i] = stk.Pop() 489 | } 490 | 491 | switch op { 492 | default: 493 | return nil 494 | case "T*": // move to start of next line 495 | showText("\n") 496 | case "Tf": // set text font and size 497 | if len(args) != 2 { 498 | return errors.New("bad TL") 499 | } 500 | if font, ok := fonts[args[0].Name()]; ok { 501 | enc = font.Encoder() 502 | } else { 503 | enc = &nopEncoder{} 504 | } 505 | case "\"": // set spacing, move to next line, and show text 506 | if len(args) != 3 { 507 | return errors.New("bad \" operator") 508 | } 509 | fallthrough 510 | case "'": // move to next line and show text 511 | if len(args) != 1 { 512 | return errors.New("bad ' operator") 513 | } 514 | fallthrough 515 | case "Tj": // show text 516 | if len(args) != 1 { 517 | return errors.New("bad Tj operator") 518 | } 519 | showText(args[0].RawString()) 520 | case "TJ": // show text, allowing individual glyph positioning 521 | v := args[0] 522 | for i := 0; i < v.Len(); i++ { 523 | x := v.Index(i) 524 | if x.Kind() == String { 525 | showText(x.RawString()) 526 | } 527 | } 528 | } 529 | return nil 530 | }) 531 | if err != nil { 532 | return nil, err 533 | } 534 | return &textBuilder, nil 535 | } 536 | 537 | // Content returns the page's content. 538 | func (p Page) Content() (Content, error) { 539 | var enc TextEncoding = &nopEncoder{} 540 | 541 | var g = gstate{ 542 | Th: 1, 543 | CTM: ident, 544 | } 545 | 546 | var text []Text 547 | showText := func(s string) { 548 | n := 0 549 | for _, ch := range enc.Decode(s) { 550 | Trm := matrix{{g.Tfs * g.Th, 0, 0}, {0, g.Tfs, 0}, {0, g.Trise, 1}}.mul(g.Tm).mul(g.CTM) 551 | w0 := g.Tf.Width(int(s[n])) 552 | n++ 553 | if ch != ' ' { 554 | f := g.Tf.BaseFont() 555 | if i := strings.Index(f, "+"); i >= 0 { 556 | f = f[i+1:] 557 | } 558 | text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)}) 559 | } 560 | tx := w0/1000*g.Tfs + g.Tc 561 | if ch == ' ' { 562 | tx += g.Tw 563 | } 564 | tx *= g.Th 565 | g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) 566 | } 567 | } 568 | 569 | var rect []Rect 570 | var gstack []gstate 571 | 572 | var strms []Value 573 | contents := p.V.Key("Contents") 574 | switch contents.Kind() { 575 | case Stream: 576 | strms = append(strms, contents) 577 | case Array: 578 | for i := 0; i < contents.Len(); i++ { 579 | strms = append(strms, contents.Index(i)) 580 | } 581 | default: 582 | return Content{}, errors.New("expected page contents to be a stream or an array") 583 | } 584 | 585 | for _, strm := range strms { 586 | err := Interpret(strm, func(stk *Stack, op string) error { 587 | n := stk.Len() 588 | args := make([]Value, n) 589 | for i := n - 1; i >= 0; i-- { 590 | args[i] = stk.Pop() 591 | } 592 | switch op { 593 | default: 594 | //fmt.Println(op, args) 595 | return nil 596 | 597 | case "cm": // update g.CTM 598 | if len(args) != 6 { 599 | return errors.New("bad g.Tm") 600 | } 601 | var m matrix 602 | for i := 0; i < 6; i++ { 603 | m[i/2][i%2] = args[i].Float64() 604 | } 605 | m[2][2] = 1 606 | g.CTM = m.mul(g.CTM) 607 | 608 | case "gs": // set parameters from graphics state resource 609 | gs := p.Resources().Key("ExtGState").Key(args[0].Name()) 610 | font := gs.Key("Font") 611 | if font.Kind() == Array && font.Len() == 2 { 612 | //fmt.Println("FONT", font) 613 | } 614 | 615 | case "f": // fill 616 | case "g": // setgray 617 | case "l": // lineto 618 | case "m": // moveto 619 | 620 | case "cs": // set colorspace non-stroking 621 | case "scn": // set color non-stroking 622 | 623 | case "re": // append rectangle to path 624 | if len(args) != 4 { 625 | return errors.New("bad re") 626 | } 627 | x, y, w, h := args[0].Float64(), args[1].Float64(), args[2].Float64(), args[3].Float64() 628 | rect = append(rect, Rect{Point{x, y}, Point{x + w, y + h}}) 629 | 630 | case "q": // save graphics state 631 | gstack = append(gstack, g) 632 | 633 | case "Q": // restore graphics state 634 | n := len(gstack) - 1 635 | g = gstack[n] 636 | gstack = gstack[:n] 637 | 638 | case "BT": // begin text (reset text matrix and line matrix) 639 | g.Tm = ident 640 | g.Tlm = g.Tm 641 | 642 | case "ET": // end text 643 | 644 | case "T*": // move to start of next line 645 | x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}} 646 | g.Tlm = x.mul(g.Tlm) 647 | g.Tm = g.Tlm 648 | 649 | case "Tc": // set character spacing 650 | if len(args) != 1 { 651 | return errors.New("bad g.Tc") 652 | } 653 | g.Tc = args[0].Float64() 654 | 655 | case "TD": // move text position and set leading 656 | if len(args) != 2 { 657 | return errors.New("bad Td") 658 | } 659 | g.Tl = -args[1].Float64() 660 | fallthrough 661 | case "Td": // move text position 662 | if len(args) != 2 { 663 | return errors.New("bad Td") 664 | } 665 | tx := args[0].Float64() 666 | ty := args[1].Float64() 667 | x := matrix{{1, 0, 0}, {0, 1, 0}, {tx, ty, 1}} 668 | g.Tlm = x.mul(g.Tlm) 669 | g.Tm = g.Tlm 670 | 671 | case "Tf": // set text font and size 672 | if len(args) != 2 { 673 | return errors.New("bad TL") 674 | } 675 | f := args[0].Name() 676 | g.Tf = p.Font(f) 677 | enc = g.Tf.Encoder() 678 | if enc == nil { 679 | println("no cmap for", f) 680 | enc = &nopEncoder{} 681 | } 682 | g.Tfs = args[1].Float64() 683 | 684 | case "\"": // set spacing, move to next line, and show text 685 | if len(args) != 3 { 686 | return errors.New("bad \" operator") 687 | } 688 | g.Tw = args[0].Float64() 689 | g.Tc = args[1].Float64() 690 | args = args[2:] 691 | fallthrough 692 | case "'": // move to next line and show text 693 | if len(args) != 1 { 694 | return errors.New("bad ' operator") 695 | } 696 | x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}} 697 | g.Tlm = x.mul(g.Tlm) 698 | g.Tm = g.Tlm 699 | fallthrough 700 | case "Tj": // show text 701 | if len(args) != 1 { 702 | return errors.New("bad Tj operator") 703 | } 704 | showText(args[0].RawString()) 705 | 706 | case "TJ": // show text, allowing individual glyph positioning 707 | v := args[0] 708 | for i := 0; i < v.Len(); i++ { 709 | x := v.Index(i) 710 | if x.Kind() == String { 711 | showText(x.RawString()) 712 | } else { 713 | tx := -x.Float64() / 1000 * g.Tfs * g.Th 714 | g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) 715 | } 716 | } 717 | 718 | case "TL": // set text leading 719 | if len(args) != 1 { 720 | return errors.New("bad TL") 721 | } 722 | g.Tl = args[0].Float64() 723 | 724 | case "Tm": // set text matrix and line matrix 725 | if len(args) != 6 { 726 | return errors.New("bad g.Tm") 727 | } 728 | var m matrix 729 | for i := 0; i < 6; i++ { 730 | m[i/2][i%2] = args[i].Float64() 731 | } 732 | m[2][2] = 1 733 | g.Tm = m 734 | g.Tlm = m 735 | 736 | case "Tr": // set text rendering mode 737 | if len(args) != 1 { 738 | return errors.New("bad Tr") 739 | } 740 | g.Tmode = int(args[0].Int64()) 741 | 742 | case "Ts": // set text rise 743 | if len(args) != 1 { 744 | return errors.New("bad Ts") 745 | } 746 | g.Trise = args[0].Float64() 747 | 748 | case "Tw": // set word spacing 749 | if len(args) != 1 { 750 | return errors.New("bad g.Tw") 751 | } 752 | g.Tw = args[0].Float64() 753 | 754 | case "Tz": // set horizontal text scaling 755 | if len(args) != 1 { 756 | return errors.New("bad Tz") 757 | } 758 | g.Th = args[0].Float64() / 100 759 | } 760 | return nil 761 | }) 762 | if err != nil { 763 | return Content{}, err 764 | } 765 | } 766 | 767 | return Content{text, rect}, nil 768 | } 769 | 770 | // TextVertical implements sort.Interface for sorting 771 | // a slice of Text values in vertical order, top to bottom, 772 | // and then left to right within a line. 773 | type TextVertical []Text 774 | 775 | func (x TextVertical) Len() int { return len(x) } 776 | func (x TextVertical) Swap(i, j int) { x[i], x[j] = x[j], x[i] } 777 | func (x TextVertical) Less(i, j int) bool { 778 | if x[i].Y != x[j].Y { 779 | return x[i].Y > x[j].Y 780 | } 781 | return x[i].X < x[j].X 782 | } 783 | 784 | // TextHorizontal implements sort.Interface for sorting 785 | // a slice of Text values in horizontal order, left to right, 786 | // and then top to bottom within a column. 787 | type TextHorizontal []Text 788 | 789 | func (x TextHorizontal) Len() int { return len(x) } 790 | func (x TextHorizontal) Swap(i, j int) { x[i], x[j] = x[j], x[i] } 791 | func (x TextHorizontal) Less(i, j int) bool { 792 | if x[i].X != x[j].X { 793 | return x[i].X < x[j].X 794 | } 795 | return x[i].Y > x[j].Y 796 | } 797 | 798 | // An Outline is a tree describing the outline (also known as the table of contents) 799 | // of a document. 800 | type Outline struct { 801 | Title string // title for this element 802 | Child []Outline // child elements 803 | } 804 | 805 | // Outline returns the document outline. 806 | // The Outline returned is the root of the outline tree and typically has no Title itself. 807 | // That is, the children of the returned root are the top-level entries in the outline. 808 | func (r *Reader) Outline() Outline { 809 | return buildOutline(r.Trailer().Key("Root").Key("Outlines")) 810 | } 811 | 812 | func buildOutline(entry Value) Outline { 813 | var x Outline 814 | x.Title = entry.Key("Title").Text() 815 | for child := entry.Key("First"); child.Kind() == Dict; child = child.Key("Next") { 816 | x.Child = append(x.Child, buildOutline(child)) 817 | } 818 | return x 819 | } 820 | -------------------------------------------------------------------------------- /pdfpasswd/main.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Pdfpasswd searches for the password for an encrypted PDF 6 | // by trying all strings over a given alphabet up to a given length. 7 | package main 8 | 9 | import ( 10 | "flag" 11 | "fmt" 12 | "log" 13 | "os" 14 | 15 | "github.com/rsc/pdf" 16 | ) 17 | 18 | var ( 19 | alphabet = flag.String("a", "0123456789", "alphabet") 20 | maxLength = flag.Int("m", 4, "max length") 21 | ) 22 | 23 | func usage() { 24 | fmt.Fprintf(os.Stderr, "usage: pdfpasswd [-a alphabet] [-m maxlength] file\n") 25 | os.Exit(2) 26 | } 27 | 28 | func main() { 29 | log.SetFlags(0) 30 | log.SetPrefix("pdfpasswd: ") 31 | 32 | flag.Usage = usage 33 | flag.Parse() 34 | if flag.NArg() != 1 { 35 | usage() 36 | } 37 | 38 | f, err := os.Open(flag.Arg(0)) 39 | if err != nil { 40 | log.Fatal(err) 41 | } 42 | 43 | last := "" 44 | alpha := *alphabet 45 | ctr := make([]int, *maxLength) 46 | pw := func() string { 47 | inc(ctr, len(alpha)+1) 48 | for !valid(ctr) { 49 | inc(ctr, len(alpha)+1) 50 | } 51 | if done(ctr) { 52 | return "" 53 | } 54 | buf := make([]byte, len(ctr)) 55 | var i int 56 | for i = 0; i < len(buf); i++ { 57 | if ctr[i] == 0 { 58 | break 59 | } 60 | buf[i] = alpha[ctr[i]-1] 61 | } 62 | last = string(buf[:i]) 63 | println(last) 64 | return last 65 | } 66 | st, err := f.Stat() 67 | if err != nil { 68 | log.Fatal(err) 69 | } 70 | _, err = pdf.NewReaderEncrypted(f, st.Size(), pw) 71 | if err != nil { 72 | if err == pdf.ErrInvalidPassword { 73 | log.Fatal("password not found") 74 | } 75 | log.Fatal("reading pdf: %v", err) 76 | } 77 | fmt.Printf("password: %q\n", last) 78 | } 79 | 80 | func inc(ctr []int, n int) { 81 | for i := 0; i < len(ctr); i++ { 82 | ctr[i]++ 83 | if ctr[i] < n { 84 | break 85 | } 86 | ctr[i] = 0 87 | } 88 | } 89 | 90 | func done(ctr []int) bool { 91 | for _, x := range ctr { 92 | if x != 0 { 93 | return false 94 | } 95 | } 96 | return true 97 | } 98 | 99 | func valid(ctr []int) bool { 100 | i := len(ctr) 101 | for i > 0 && ctr[i-1] == 0 { 102 | i-- 103 | } 104 | for i--; i >= 0; i-- { 105 | if ctr[i] == 0 { 106 | return false 107 | } 108 | } 109 | return true 110 | } 111 | -------------------------------------------------------------------------------- /ps.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package pdf 6 | 7 | import ( 8 | "fmt" 9 | "io" 10 | 11 | "github.com/pkg/errors" 12 | ) 13 | 14 | // A Stack represents a stack of values. 15 | type Stack struct { 16 | stack []Value 17 | } 18 | 19 | func (stk *Stack) Len() int { 20 | return len(stk.stack) 21 | } 22 | 23 | func (stk *Stack) Push(v Value) { 24 | stk.stack = append(stk.stack, v) 25 | } 26 | 27 | func (stk *Stack) Pop() Value { 28 | n := len(stk.stack) 29 | if n == 0 { 30 | return Value{} 31 | } 32 | v := stk.stack[n-1] 33 | stk.stack[n-1] = Value{} 34 | stk.stack = stk.stack[:n-1] 35 | return v 36 | } 37 | 38 | func newDict() Value { 39 | return Value{r: nil, ptr: objptr{}, data: make(dict)} 40 | } 41 | 42 | // Interpret interprets the content in a stream as a basic PostScript program, 43 | // pushing values onto a stack and then calling the do function to execute 44 | // operators. The do function may push or pop values from the stack as needed 45 | // to implement op. 46 | // 47 | // Interpret handles the operators "dict", "currentdict", "begin", "end", "def", and "pop" itself. 48 | // 49 | // Interpret is not a full-blown PostScript interpreter. Its job is to handle the 50 | // very limited PostScript found in certain supporting file formats embedded 51 | // in PDF files, such as cmap files that describe the mapping from font code 52 | // points to Unicode code points. 53 | // 54 | // There is no support for executable blocks, among other limitations. 55 | // 56 | func Interpret(strm Value, do func(stk *Stack, op string) error) error { 57 | rd := strm.Reader() 58 | b := newBuffer(rd, 0) 59 | b.allowEOF = true 60 | b.allowObjptr = false 61 | b.allowStream = false 62 | var stk Stack 63 | var dicts []dict 64 | Reading: 65 | for { 66 | tok := b.readToken() 67 | if tok == io.EOF { 68 | break 69 | } 70 | if err, ok := tok.(error); ok { 71 | return err 72 | } 73 | if kw, ok := tok.(keyword); ok { 74 | switch kw { 75 | case "null", "[", "]", "<<", ">>": 76 | break 77 | default: 78 | for i := len(dicts) - 1; i >= 0; i-- { 79 | if v, ok := dicts[i][name(kw)]; ok { 80 | stk.Push(Value{r: nil, ptr: objptr{}, data: v}) 81 | continue Reading 82 | } 83 | } 84 | err := do(&stk, string(kw)) 85 | if err != nil { 86 | return err 87 | } 88 | continue 89 | case "dict": 90 | stk.Pop() 91 | stk.Push(Value{r: nil, ptr: objptr{}, data: make(dict)}) 92 | continue 93 | case "currentdict": 94 | if len(dicts) == 0 { 95 | return errors.New("no current dictionary") 96 | } 97 | stk.Push(Value{r: nil, ptr: objptr{}, data: dicts[len(dicts)-1]}) 98 | continue 99 | case "begin": 100 | d := stk.Pop() 101 | if d.Kind() != Dict { 102 | return errors.New("cannot begin non-dict") 103 | } 104 | dicts = append(dicts, d.data.(dict)) 105 | continue 106 | case "end": 107 | if len(dicts) <= 0 { 108 | return errors.New("mismatched begin/end") 109 | } 110 | dicts = dicts[:len(dicts)-1] 111 | continue 112 | case "def": 113 | if len(dicts) <= 0 { 114 | return errors.New("def without open dict") 115 | } 116 | val := stk.Pop() 117 | key, ok := stk.Pop().data.(name) 118 | if !ok { 119 | return errors.New("def of non-name") 120 | } 121 | dicts[len(dicts)-1][key] = val.data 122 | continue 123 | case "pop": 124 | stk.Pop() 125 | continue 126 | } 127 | } 128 | b.unreadToken(tok) 129 | obj := b.readObject() 130 | stk.Push(Value{r: nil, ptr: objptr{}, data: obj}) 131 | } 132 | return nil 133 | } 134 | 135 | type seqReader struct { 136 | rd io.Reader 137 | offset int64 138 | } 139 | 140 | func (r *seqReader) ReadAt(buf []byte, offset int64) (int, error) { 141 | if offset != r.offset { 142 | return 0, fmt.Errorf("non-sequential read of stream") 143 | } 144 | n, err := io.ReadFull(r.rd, buf) 145 | r.offset += int64(n) 146 | return n, err 147 | } 148 | -------------------------------------------------------------------------------- /read.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package pdf implements reading of PDF files. 6 | // 7 | // Overview 8 | // 9 | // PDF is Adobe's Portable Document Format, ubiquitous on the internet. 10 | // A PDF document is a complex data format built on a fairly simple structure. 11 | // This package exposes the simple structure along with some wrappers to 12 | // extract basic information. If more complex information is needed, it is 13 | // possible to extract that information by interpreting the structure exposed 14 | // by this package. 15 | // 16 | // Specifically, a PDF is a data structure built from Values, each of which has 17 | // one of the following Kinds: 18 | // 19 | // Null, for the null object. 20 | // Integer, for an integer. 21 | // Real, for a floating-point number. 22 | // Bool, for a boolean value. 23 | // Name, for a name constant (as in /Helvetica). 24 | // String, for a string constant. 25 | // Dict, for a dictionary of name-value pairs. 26 | // Array, for an array of values. 27 | // Stream, for an opaque data stream and associated header dictionary. 28 | // 29 | // The accessors on Value—Int64, Float64, Bool, Name, and so on—return 30 | // a view of the data as the given type. When there is no appropriate view, 31 | // the accessor returns a zero result. For example, the Name accessor returns 32 | // the empty string if called on a Value v for which v.Kind() != Name. 33 | // Returning zero values this way, especially from the Dict and Array accessors, 34 | // which themselves return Values, makes it possible to traverse a PDF quickly 35 | // without writing any error checking. On the other hand, it means that mistakes 36 | // can go unreported. 37 | // 38 | // The basic structure of the PDF file is exposed as the graph of Values. 39 | // 40 | // Most richer data structures in a PDF file are dictionaries with specific interpretations 41 | // of the name-value pairs. The Font and Page wrappers make the interpretation 42 | // of a specific Value as the corresponding type easier. They are only helpers, though: 43 | // they are implemented only in terms of the Value API and could be moved outside 44 | // the package. Equally important, traversal of other PDF data structures can be implemented 45 | // in other packages as needed. 46 | // 47 | package pdf 48 | 49 | // BUG(rsc): The package is incomplete, although it has been used successfully on some 50 | // large real-world PDF files. 51 | 52 | // BUG(rsc): There is no support for closing open PDF files. If you drop all references to a Reader, 53 | // the underlying reader will eventually be garbage collected. 54 | 55 | // BUG(rsc): The library makes no attempt at efficiency. A value cache maintained in the Reader 56 | // would probably help significantly. 57 | 58 | // BUG(rsc): The support for reading encrypted files is weak. 59 | 60 | // BUG(rsc): The Value API does not support error reporting. The intent is to allow users to 61 | // set an error reporting callback in Reader, but that code has not been implemented. 62 | 63 | import ( 64 | "bytes" 65 | "compress/zlib" 66 | "crypto/aes" 67 | "crypto/cipher" 68 | "crypto/md5" 69 | "crypto/rc4" 70 | "fmt" 71 | "io" 72 | "io/ioutil" 73 | "os" 74 | "sort" 75 | "strconv" 76 | 77 | "github.com/pkg/errors" 78 | ) 79 | 80 | // A Reader is a single PDF file open for reading. 81 | type Reader struct { 82 | f io.ReaderAt 83 | end int64 84 | xref []xref 85 | trailer dict 86 | trailerptr objptr 87 | key []byte 88 | useAES bool 89 | } 90 | 91 | type xref struct { 92 | ptr objptr 93 | inStream bool 94 | stream objptr 95 | offset int64 96 | } 97 | 98 | // Open opens a file for reading. 99 | func Open(file string) (*Reader, error) { 100 | // TODO: Deal with closing file. 101 | f, err := os.Open(file) 102 | if err != nil { 103 | return nil, err 104 | } 105 | fi, err := f.Stat() 106 | if err != nil { 107 | f.Close() 108 | return nil, err 109 | } 110 | return NewReader(f, fi.Size()) 111 | } 112 | 113 | // NewReader opens a file for reading, using the data in f with the given total size. 114 | func NewReader(f io.ReaderAt, size int64) (*Reader, error) { 115 | return NewReaderEncrypted(f, size, nil) 116 | } 117 | 118 | // NewReaderEncrypted opens a file for reading, using the data in f with the given total size. 119 | // If the PDF is encrypted, NewReaderEncrypted calls pw repeatedly to obtain passwords 120 | // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt 121 | // the file and returns an error. 122 | func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) { 123 | buf := make([]byte, 10) 124 | f.ReadAt(buf, 0) 125 | if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' { 126 | return nil, fmt.Errorf("not a PDF file: invalid header") 127 | } 128 | end := size 129 | const endChunk = 100 130 | buf = make([]byte, endChunk) 131 | f.ReadAt(buf, end-endChunk) 132 | for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' { 133 | buf = buf[:len(buf)-1] 134 | } 135 | buf = bytes.TrimRight(buf, "\r\n\t ") 136 | if !bytes.HasSuffix(buf, []byte("%%EOF")) { 137 | return nil, fmt.Errorf("not a PDF file: missing %%%%EOF") 138 | } 139 | i := findLastLine(buf, "startxref") 140 | if i < 0 { 141 | return nil, fmt.Errorf("malformed PDF file: missing final startxref") 142 | } 143 | 144 | r := &Reader{ 145 | f: f, 146 | end: end, 147 | } 148 | pos := end - endChunk + int64(i) 149 | b := newBuffer(io.NewSectionReader(f, pos, end-pos), pos) 150 | if b.readToken() != keyword("startxref") { 151 | return nil, fmt.Errorf("malformed PDF file: missing startxref") 152 | } 153 | startxref, ok := b.readToken().(int64) 154 | if !ok { 155 | return nil, fmt.Errorf("malformed PDF file: startxref not followed by integer") 156 | } 157 | b = newBuffer(io.NewSectionReader(r.f, startxref, r.end-startxref), startxref) 158 | xref, trailerptr, trailer, err := readXref(r, b) 159 | if err != nil { 160 | return nil, err 161 | } 162 | r.xref = xref 163 | r.trailer = trailer 164 | r.trailerptr = trailerptr 165 | if trailer["Encrypt"] == nil { 166 | return r, nil 167 | } 168 | err = r.initEncrypt("") 169 | if err == nil { 170 | return r, nil 171 | } 172 | if pw == nil || err != ErrInvalidPassword { 173 | return nil, err 174 | } 175 | for { 176 | next := pw() 177 | if next == "" { 178 | break 179 | } 180 | if r.initEncrypt(next) == nil { 181 | return r, nil 182 | } 183 | } 184 | return nil, err 185 | } 186 | 187 | // Trailer returns the file's Trailer value. 188 | func (r *Reader) Trailer() Value { 189 | return Value{r: r, ptr: r.trailerptr, data: r.trailer} 190 | } 191 | 192 | func readXref(r *Reader, b *buffer) ([]xref, objptr, dict, error) { 193 | tok := b.readToken() 194 | if tok == keyword("xref") { 195 | return readXrefTable(r, b) 196 | } 197 | if _, ok := tok.(int64); ok { 198 | b.unreadToken(tok) 199 | return readXrefStream(r, b) 200 | } 201 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", tok) 202 | } 203 | 204 | func readXrefStream(r *Reader, b *buffer) ([]xref, objptr, dict, error) { 205 | obj1 := b.readObject() 206 | obj, ok := obj1.(objdef) 207 | if !ok { 208 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", objfmt(obj1)) 209 | } 210 | strmptr := obj.ptr 211 | strm, ok := obj.obj.(stream) 212 | if !ok { 213 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", objfmt(obj)) 214 | } 215 | if strm.hdr["Type"] != name("XRef") { 216 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref stream does not have type XRef") 217 | } 218 | size, ok := strm.hdr["Size"].(int64) 219 | if !ok { 220 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref stream missing Size") 221 | } 222 | table := make([]xref, size) 223 | 224 | table, err := readXrefStreamData(r, strm, table, size) 225 | if err != nil { 226 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) 227 | } 228 | 229 | for prevoff := strm.hdr["Prev"]; prevoff != nil; { 230 | off, ok := prevoff.(int64) 231 | if !ok { 232 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev is not integer: %v", prevoff) 233 | } 234 | b := newBuffer(io.NewSectionReader(r.f, off, r.end-off), off) 235 | obj1 := b.readObject() 236 | obj, ok := obj1.(objdef) 237 | if !ok { 238 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj1)) 239 | } 240 | prevstrm, ok := obj.obj.(stream) 241 | if !ok { 242 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj)) 243 | } 244 | prevoff = prevstrm.hdr["Prev"] 245 | prev := Value{r: r, ptr: objptr{}, data: prevstrm} 246 | if prev.Kind() != Stream { 247 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream is not stream: %v", prev) 248 | } 249 | if prev.Key("Type").Name() != "XRef" { 250 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream does not have type XRef") 251 | } 252 | psize := prev.Key("Size").Int64() 253 | if psize > size { 254 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream larger than last stream") 255 | } 256 | if table, err = readXrefStreamData(r, prev.data.(stream), table, psize); err != nil { 257 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: reading xref prev stream: %v", err) 258 | } 259 | } 260 | 261 | return table, strmptr, strm.hdr, nil 262 | } 263 | 264 | func readXrefStreamData(r *Reader, strm stream, table []xref, size int64) ([]xref, error) { 265 | index, _ := strm.hdr["Index"].(array) 266 | if index == nil { 267 | index = array{int64(0), size} 268 | } 269 | if len(index)%2 != 0 { 270 | return nil, fmt.Errorf("invalid Index array %v", objfmt(index)) 271 | } 272 | ww, ok := strm.hdr["W"].(array) 273 | if !ok { 274 | return nil, fmt.Errorf("xref stream missing W array") 275 | } 276 | 277 | var w []int 278 | for _, x := range ww { 279 | i, ok := x.(int64) 280 | if !ok || int64(int(i)) != i { 281 | return nil, fmt.Errorf("invalid W array %v", objfmt(ww)) 282 | } 283 | w = append(w, int(i)) 284 | } 285 | if len(w) < 3 { 286 | return nil, fmt.Errorf("invalid W array %v", objfmt(ww)) 287 | } 288 | 289 | v := Value{r: r, ptr: objptr{}, data: strm} 290 | wtotal := 0 291 | for _, wid := range w { 292 | wtotal += wid 293 | } 294 | buf := make([]byte, wtotal) 295 | data := v.Reader() 296 | for len(index) > 0 { 297 | start, ok1 := index[0].(int64) 298 | n, ok2 := index[1].(int64) 299 | if !ok1 || !ok2 { 300 | return nil, fmt.Errorf("malformed Index pair %v %v %T %T", objfmt(index[0]), objfmt(index[1]), index[0], index[1]) 301 | } 302 | index = index[2:] 303 | for i := 0; i < int(n); i++ { 304 | _, err := io.ReadFull(data, buf) 305 | if err != nil { 306 | return nil, fmt.Errorf("error reading xref stream: %v", err) 307 | } 308 | v1 := decodeInt(buf[0:w[0]]) 309 | if w[0] == 0 { 310 | v1 = 1 311 | } 312 | v2 := decodeInt(buf[w[0] : w[0]+w[1]]) 313 | v3 := decodeInt(buf[w[0]+w[1] : w[0]+w[1]+w[2]]) 314 | x := int(start) + i 315 | for cap(table) <= x { 316 | table = append(table[:cap(table)], xref{}) 317 | } 318 | if table[x].ptr != (objptr{}) { 319 | continue 320 | } 321 | switch v1 { 322 | case 0: 323 | table[x] = xref{ptr: objptr{0, 65535}} 324 | case 1: 325 | table[x] = xref{ptr: objptr{uint32(x), uint16(v3)}, offset: int64(v2)} 326 | case 2: 327 | table[x] = xref{ptr: objptr{uint32(x), 0}, inStream: true, stream: objptr{uint32(v2), 0}, offset: int64(v3)} 328 | default: 329 | fmt.Printf("invalid xref stream type %d: %x\n", v1, buf) 330 | } 331 | } 332 | } 333 | return table, nil 334 | } 335 | 336 | func decodeInt(b []byte) int { 337 | x := 0 338 | for _, c := range b { 339 | x = x<<8 | int(c) 340 | } 341 | return x 342 | } 343 | 344 | func readXrefTable(r *Reader, b *buffer) ([]xref, objptr, dict, error) { 345 | var table []xref 346 | 347 | table, err := readXrefTableData(b, table) 348 | if err != nil { 349 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) 350 | } 351 | 352 | trailer, ok := b.readObject().(dict) 353 | if !ok { 354 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref table not followed by trailer dictionary") 355 | } 356 | 357 | for prevoff := trailer["Prev"]; prevoff != nil; { 358 | off, ok := prevoff.(int64) 359 | if !ok { 360 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev is not integer: %v", prevoff) 361 | } 362 | b := newBuffer(io.NewSectionReader(r.f, off, r.end-off), off) 363 | tok := b.readToken() 364 | if tok != keyword("xref") { 365 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev does not point to xref") 366 | } 367 | table, err = readXrefTableData(b, table) 368 | if err != nil { 369 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err) 370 | } 371 | 372 | trailer, ok := b.readObject().(dict) 373 | if !ok { 374 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev table not followed by trailer dictionary") 375 | } 376 | prevoff = trailer["Prev"] 377 | } 378 | 379 | size, ok := trailer[name("Size")].(int64) 380 | if !ok { 381 | return nil, objptr{}, nil, fmt.Errorf("malformed PDF: trailer missing /Size entry") 382 | } 383 | 384 | if size < int64(len(table)) { 385 | table = table[:size] 386 | } 387 | 388 | return table, objptr{}, trailer, nil 389 | } 390 | 391 | func readXrefTableData(b *buffer, table []xref) ([]xref, error) { 392 | for { 393 | tok := b.readToken() 394 | if tok == keyword("trailer") { 395 | break 396 | } 397 | start, ok1 := tok.(int64) 398 | n, ok2 := b.readToken().(int64) 399 | if !ok1 || !ok2 { 400 | return nil, fmt.Errorf("malformed xref table") 401 | } 402 | for i := 0; i < int(n); i++ { 403 | off, ok1 := b.readToken().(int64) 404 | gen, ok2 := b.readToken().(int64) 405 | alloc, ok3 := b.readToken().(keyword) 406 | if !ok1 || !ok2 || !ok3 || alloc != keyword("f") && alloc != keyword("n") { 407 | return nil, fmt.Errorf("malformed xref table") 408 | } 409 | x := int(start) + i 410 | for cap(table) <= x { 411 | table = append(table[:cap(table)], xref{}) 412 | } 413 | if len(table) <= x { 414 | table = table[:x+1] 415 | } 416 | if alloc == "n" && table[x].offset == 0 { 417 | table[x] = xref{ptr: objptr{uint32(x), uint16(gen)}, offset: int64(off)} 418 | } 419 | } 420 | } 421 | return table, nil 422 | } 423 | 424 | func findLastLine(buf []byte, s string) int { 425 | bs := []byte(s) 426 | max := len(buf) 427 | for { 428 | i := bytes.LastIndex(buf[:max], bs) 429 | if i <= 0 || i+len(bs) >= len(buf) { 430 | return -1 431 | } 432 | if (buf[i-1] == '\n' || buf[i-1] == '\r') && (buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r') { 433 | return i 434 | } 435 | max = i 436 | } 437 | } 438 | 439 | // A Value is a single PDF value, such as an integer, dictionary, or array. 440 | // The zero Value is a PDF null (Kind() == Null, IsNull() = true). 441 | type Value struct { 442 | r *Reader 443 | ptr objptr 444 | data interface{} 445 | err error 446 | } 447 | 448 | // IsNull reports whether the value is a null. It is equivalent to Kind() == Null. 449 | func (v Value) IsNull() bool { 450 | return v.data == nil 451 | } 452 | 453 | // IsError reports whether the value is an error. It is equivalent to v.err != nil 454 | func (v Value) IsError() bool { 455 | return v.err != nil 456 | } 457 | 458 | // A ValueKind specifies the kind of data underlying a Value. 459 | type ValueKind int 460 | 461 | // The PDF value kinds. 462 | const ( 463 | Null ValueKind = iota 464 | Bool 465 | Integer 466 | Real 467 | String 468 | Name 469 | Dict 470 | Array 471 | Stream 472 | ) 473 | 474 | // Kind reports the kind of value underlying v. 475 | func (v Value) Kind() ValueKind { 476 | switch v.data.(type) { 477 | default: 478 | return Null 479 | case bool: 480 | return Bool 481 | case int64: 482 | return Integer 483 | case float64: 484 | return Real 485 | case string: 486 | return String 487 | case name: 488 | return Name 489 | case dict: 490 | return Dict 491 | case array: 492 | return Array 493 | case stream: 494 | return Stream 495 | } 496 | } 497 | 498 | // String returns a textual representation of the value v. 499 | // Note that String is not the accessor for values with Kind() == String. 500 | // To access such values, see RawString, Text, and TextFromUTF16. 501 | func (v Value) String() string { 502 | return objfmt(v.data) 503 | } 504 | 505 | func objfmt(x interface{}) string { 506 | switch x := x.(type) { 507 | default: 508 | return fmt.Sprint(x) 509 | case string: 510 | if isPDFDocEncoded(x) { 511 | return strconv.Quote(pdfDocDecode(x)) 512 | } 513 | if isUTF16(x) { 514 | return strconv.Quote(utf16Decode(x[2:])) 515 | } 516 | return strconv.Quote(x) 517 | case name: 518 | return "/" + string(x) 519 | case dict: 520 | var keys []string 521 | for k := range x { 522 | keys = append(keys, string(k)) 523 | } 524 | sort.Strings(keys) 525 | var buf bytes.Buffer 526 | buf.WriteString("<<") 527 | for i, k := range keys { 528 | elem := x[name(k)] 529 | if i > 0 { 530 | buf.WriteString(" ") 531 | } 532 | buf.WriteString("/") 533 | buf.WriteString(k) 534 | buf.WriteString(" ") 535 | buf.WriteString(objfmt(elem)) 536 | } 537 | buf.WriteString(">>") 538 | return buf.String() 539 | 540 | case array: 541 | var buf bytes.Buffer 542 | buf.WriteString("[") 543 | for i, elem := range x { 544 | if i > 0 { 545 | buf.WriteString(" ") 546 | } 547 | buf.WriteString(objfmt(elem)) 548 | } 549 | buf.WriteString("]") 550 | return buf.String() 551 | 552 | case stream: 553 | return fmt.Sprintf("%v@%d", objfmt(x.hdr), x.offset) 554 | 555 | case objptr: 556 | return fmt.Sprintf("%d %d R", x.id, x.gen) 557 | 558 | case objdef: 559 | return fmt.Sprintf("{%d %d obj}%v", x.ptr.id, x.ptr.gen, objfmt(x.obj)) 560 | } 561 | } 562 | 563 | // Bool returns v's boolean value. 564 | // If v.Kind() != Bool, Bool returns false. 565 | func (v Value) Bool() bool { 566 | x, ok := v.data.(bool) 567 | if !ok { 568 | return false 569 | } 570 | return x 571 | } 572 | 573 | // Int64 returns v's int64 value. 574 | // If v.Kind() != Int64, Int64 returns 0. 575 | func (v Value) Int64() int64 { 576 | x, ok := v.data.(int64) 577 | if !ok { 578 | return 0 579 | } 580 | return x 581 | } 582 | 583 | // Float64 returns v's float64 value, converting from integer if necessary. 584 | // If v.Kind() != Float64 and v.Kind() != Int64, Float64 returns 0. 585 | func (v Value) Float64() float64 { 586 | x, ok := v.data.(float64) 587 | if !ok { 588 | x, ok := v.data.(int64) 589 | if ok { 590 | return float64(x) 591 | } 592 | return 0 593 | } 594 | return x 595 | } 596 | 597 | // RawString returns v's string value. 598 | // If v.Kind() != String, RawString returns the empty string. 599 | func (v Value) RawString() string { 600 | x, ok := v.data.(string) 601 | if !ok { 602 | return "" 603 | } 604 | return x 605 | } 606 | 607 | // Text returns v's string value interpreted as a ``text string'' (defined in the PDF spec) 608 | // and converted to UTF-8. 609 | // If v.Kind() != String, Text returns the empty string. 610 | func (v Value) Text() string { 611 | x, ok := v.data.(string) 612 | if !ok { 613 | return "" 614 | } 615 | if isPDFDocEncoded(x) { 616 | return pdfDocDecode(x) 617 | } 618 | if isUTF16(x) { 619 | return utf16Decode(x[2:]) 620 | } 621 | return x 622 | } 623 | 624 | // TextFromUTF16 returns v's string value interpreted as big-endian UTF-16 625 | // and then converted to UTF-8. 626 | // If v.Kind() != String or if the data is not valid UTF-16, TextFromUTF16 returns 627 | // the empty string. 628 | func (v Value) TextFromUTF16() string { 629 | x, ok := v.data.(string) 630 | if !ok { 631 | return "" 632 | } 633 | if len(x)%2 == 1 { 634 | return "" 635 | } 636 | if x == "" { 637 | return "" 638 | } 639 | return utf16Decode(x) 640 | } 641 | 642 | // Name returns v's name value. 643 | // If v.Kind() != Name, Name returns the empty string. 644 | // The returned name does not include the leading slash: 645 | // if v corresponds to the name written using the syntax /Helvetica, 646 | // Name() == "Helvetica". 647 | func (v Value) Name() string { 648 | x, ok := v.data.(name) 649 | if !ok { 650 | return "" 651 | } 652 | return string(x) 653 | } 654 | 655 | // Key returns the value associated with the given name key in the dictionary v. 656 | // Like the result of the Name method, the key should not include a leading slash. 657 | // If v is a stream, Key applies to the stream's header dictionary. 658 | // If v.Kind() != Dict and v.Kind() != Stream, Key returns a null Value. 659 | func (v Value) Key(key string) Value { 660 | if v.IsError() { 661 | return v 662 | } 663 | x, ok := v.data.(dict) 664 | if !ok { 665 | strm, ok := v.data.(stream) 666 | if !ok { 667 | return Value{} 668 | } 669 | x = strm.hdr 670 | } 671 | return v.r.resolve(v.ptr, x[name(key)]) 672 | } 673 | 674 | // Keys returns a sorted list of the keys in the dictionary v. 675 | // If v is a stream, Keys applies to the stream's header dictionary. 676 | // If v.Kind() != Dict and v.Kind() != Stream, Keys returns nil. 677 | func (v Value) Keys() []string { 678 | x, ok := v.data.(dict) 679 | if !ok { 680 | strm, ok := v.data.(stream) 681 | if !ok { 682 | return nil 683 | } 684 | x = strm.hdr 685 | } 686 | keys := []string{} // not nil 687 | for k := range x { 688 | keys = append(keys, string(k)) 689 | } 690 | sort.Strings(keys) 691 | return keys 692 | } 693 | 694 | // Index returns the i'th element in the array v. 695 | // If v.Kind() != Array or if i is outside the array bounds, 696 | // Index returns a null Value. 697 | func (v Value) Index(i int) Value { 698 | if v.IsError() { 699 | return v 700 | } 701 | x, ok := v.data.(array) 702 | if !ok || i < 0 || i >= len(x) { 703 | return Value{} 704 | } 705 | return v.r.resolve(v.ptr, x[i]) 706 | } 707 | 708 | // Len returns the length of the array v. 709 | // If v.Kind() != Array, Len returns 0. 710 | func (v Value) Len() int { 711 | x, ok := v.data.(array) 712 | if !ok { 713 | return 0 714 | } 715 | return len(x) 716 | } 717 | 718 | func (r *Reader) resolve(parent objptr, x interface{}) Value { 719 | if ptr, ok := x.(objptr); ok { 720 | if ptr.id >= uint32(len(r.xref)) { 721 | return Value{} 722 | } 723 | xref := r.xref[ptr.id] 724 | if xref.ptr != ptr || !xref.inStream && xref.offset == 0 { 725 | return Value{} 726 | } 727 | var obj object 728 | if xref.inStream { 729 | strm := r.resolve(parent, xref.stream) 730 | Search: 731 | for { 732 | if strm.Kind() != Stream { 733 | return Value{err: errors.New("not a stream")} 734 | } 735 | if strm.Key("Type").Name() != "ObjStm" { 736 | return Value{err: errors.New("not an object stream")} 737 | } 738 | n := int(strm.Key("N").Int64()) 739 | first := strm.Key("First").Int64() 740 | if first == 0 { 741 | return Value{err: errors.New("missing First")} 742 | } 743 | b := newBuffer(strm.Reader(), 0) 744 | b.allowEOF = true 745 | for i := 0; i < n; i++ { 746 | id, _ := b.readToken().(int64) 747 | off, _ := b.readToken().(int64) 748 | if uint32(id) == ptr.id { 749 | b.seekForward(first + off) 750 | x = b.readObject() 751 | break Search 752 | } 753 | } 754 | ext := strm.Key("Extends") 755 | if ext.Kind() != Stream { 756 | return Value{err: errors.New("cannot find object in stream")} 757 | } 758 | strm = ext 759 | } 760 | } else { 761 | b := newBuffer(io.NewSectionReader(r.f, xref.offset, r.end-xref.offset), xref.offset) 762 | b.key = r.key 763 | b.useAES = r.useAES 764 | obj = b.readObject() 765 | def, ok := obj.(objdef) 766 | if !ok { 767 | return Value{err: fmt.Errorf("loading %v: found %T instead of objdef", ptr, obj)} 768 | } 769 | if def.ptr != ptr { 770 | return Value{err: fmt.Errorf("loading %v: found %v", ptr, def.ptr)} 771 | } 772 | x = def.obj 773 | } 774 | parent = ptr 775 | } 776 | 777 | switch x := x.(type) { 778 | case nil, bool, int64, float64, name, dict, array, stream: 779 | return Value{r: r, ptr: parent, data: x} 780 | case string: 781 | return Value{r: r, ptr: parent, data: x} 782 | default: 783 | return Value{err: fmt.Errorf("unexpected value type %T in resolve", x)} 784 | } 785 | } 786 | 787 | type errorReadCloser struct { 788 | err error 789 | } 790 | 791 | func (e *errorReadCloser) Read([]byte) (int, error) { 792 | return 0, e.err 793 | } 794 | 795 | func (e *errorReadCloser) Close() error { 796 | return e.err 797 | } 798 | 799 | // Reader returns the data contained in the stream v. 800 | // If v.Kind() != Stream, Reader returns a ReadCloser that 801 | // responds to all reads with a ``stream not present'' error. 802 | func (v Value) Reader() io.ReadCloser { 803 | if v.IsError() { 804 | return &errorReadCloser{errors.Wrap(v.err, "stream not present")} 805 | } 806 | x, ok := v.data.(stream) 807 | if !ok { 808 | return &errorReadCloser{fmt.Errorf("stream not present")} 809 | } 810 | var rd io.Reader 811 | rd = io.NewSectionReader(v.r.f, x.offset, v.Key("Length").Int64()) 812 | if v.r.key != nil { 813 | rd = decryptStream(v.r.key, v.r.useAES, x.ptr, rd) 814 | } 815 | filter := v.Key("Filter") 816 | param := v.Key("DecodeParms") 817 | switch filter.Kind() { 818 | default: 819 | return &errorReadCloser{fmt.Errorf("unsupported filter %v", filter)} 820 | case Null: 821 | // ok 822 | case Name: 823 | rd = applyFilter(rd, filter.Name(), param) 824 | case Array: 825 | for i := 0; i < filter.Len(); i++ { 826 | rd = applyFilter(rd, filter.Index(i).Name(), param.Index(i)) 827 | } 828 | } 829 | 830 | return ioutil.NopCloser(rd) 831 | } 832 | 833 | func applyFilter(rd io.Reader, name string, param Value) io.Reader { 834 | switch name { 835 | default: 836 | return &errorReadCloser{errors.New("unknown filter " + name)} 837 | case "FlateDecode": 838 | zr, err := zlib.NewReader(rd) 839 | if err != nil { 840 | return &errorReadCloser{err} 841 | } 842 | pred := param.Key("Predictor") 843 | if pred.Kind() == Null { 844 | return zr 845 | } 846 | columns := param.Key("Columns").Int64() 847 | switch pred.Int64() { 848 | default: 849 | return &errorReadCloser{errors.Errorf("unknown predictor %v", pred)} 850 | case 12: 851 | return &pngUpReader{r: zr, hist: make([]byte, 1+columns), tmp: make([]byte, 1+columns)} 852 | } 853 | } 854 | } 855 | 856 | type pngUpReader struct { 857 | r io.Reader 858 | hist []byte 859 | tmp []byte 860 | pend []byte 861 | } 862 | 863 | func (r *pngUpReader) Read(b []byte) (int, error) { 864 | n := 0 865 | for len(b) > 0 { 866 | if len(r.pend) > 0 { 867 | m := copy(b, r.pend) 868 | n += m 869 | b = b[m:] 870 | r.pend = r.pend[m:] 871 | continue 872 | } 873 | _, err := io.ReadFull(r.r, r.tmp) 874 | if err != nil { 875 | return n, err 876 | } 877 | if r.tmp[0] != 2 { 878 | return n, fmt.Errorf("malformed PNG-Up encoding") 879 | } 880 | for i, b := range r.tmp { 881 | r.hist[i] += b 882 | } 883 | r.pend = r.hist[1:] 884 | } 885 | return n, nil 886 | } 887 | 888 | var passwordPad = []byte{ 889 | 0x28, 0xBF, 0x4E, 0x5E, 0x4E, 0x75, 0x8A, 0x41, 0x64, 0x00, 0x4E, 0x56, 0xFF, 0xFA, 0x01, 0x08, 890 | 0x2E, 0x2E, 0x00, 0xB6, 0xD0, 0x68, 0x3E, 0x80, 0x2F, 0x0C, 0xA9, 0xFE, 0x64, 0x53, 0x69, 0x7A, 891 | } 892 | 893 | func (r *Reader) initEncrypt(password string) error { 894 | // See PDF 32000-1:2008, §7.6. 895 | encrypt, _ := r.resolve(objptr{}, r.trailer["Encrypt"]).data.(dict) 896 | if encrypt["Filter"] != name("Standard") { 897 | return fmt.Errorf("unsupported PDF: encryption filter %v", objfmt(encrypt["Filter"])) 898 | } 899 | n, _ := encrypt["Length"].(int64) 900 | if n == 0 { 901 | n = 40 902 | } 903 | if n%8 != 0 || n > 128 || n < 40 { 904 | return fmt.Errorf("malformed PDF: %d-bit encryption key", n) 905 | } 906 | V, _ := encrypt["V"].(int64) 907 | if V != 1 && V != 2 && (V != 4 || !okayV4(encrypt)) { 908 | return fmt.Errorf("unsupported PDF: encryption version V=%d; %v", V, objfmt(encrypt)) 909 | } 910 | 911 | ids, ok := r.trailer["ID"].(array) 912 | if !ok || len(ids) < 1 { 913 | return fmt.Errorf("malformed PDF: missing ID in trailer") 914 | } 915 | idstr, ok := ids[0].(string) 916 | if !ok { 917 | return fmt.Errorf("malformed PDF: missing ID in trailer") 918 | } 919 | ID := []byte(idstr) 920 | 921 | R, _ := encrypt["R"].(int64) 922 | if R < 2 { 923 | return fmt.Errorf("malformed PDF: encryption revision R=%d", R) 924 | } 925 | if R > 4 { 926 | return fmt.Errorf("unsupported PDF: encryption revision R=%d", R) 927 | } 928 | O, _ := encrypt["O"].(string) 929 | U, _ := encrypt["U"].(string) 930 | if len(O) != 32 || len(U) != 32 { 931 | return fmt.Errorf("malformed PDF: missing O= or U= encryption parameters") 932 | } 933 | p, _ := encrypt["P"].(int64) 934 | P := uint32(p) 935 | 936 | // TODO: Password should be converted to Latin-1. 937 | pw := []byte(password) 938 | h := md5.New() 939 | if len(pw) >= 32 { 940 | h.Write(pw[:32]) 941 | } else { 942 | h.Write(pw) 943 | h.Write(passwordPad[:32-len(pw)]) 944 | } 945 | h.Write([]byte(O)) 946 | h.Write([]byte{byte(P), byte(P >> 8), byte(P >> 16), byte(P >> 24)}) 947 | h.Write([]byte(ID)) 948 | key := h.Sum(nil) 949 | 950 | if R >= 3 { 951 | for i := 0; i < 50; i++ { 952 | h.Reset() 953 | h.Write(key[:n/8]) 954 | key = h.Sum(key[:0]) 955 | } 956 | key = key[:n/8] 957 | } else { 958 | key = key[:40/8] 959 | } 960 | 961 | c, err := rc4.NewCipher(key) 962 | if err != nil { 963 | return fmt.Errorf("malformed PDF: invalid RC4 key: %v", err) 964 | } 965 | 966 | var u []byte 967 | if R == 2 { 968 | u = make([]byte, 32) 969 | copy(u, passwordPad) 970 | c.XORKeyStream(u, u) 971 | } else { 972 | h.Reset() 973 | h.Write(passwordPad) 974 | h.Write([]byte(ID)) 975 | u = h.Sum(nil) 976 | c.XORKeyStream(u, u) 977 | 978 | for i := 1; i <= 19; i++ { 979 | key1 := make([]byte, len(key)) 980 | copy(key1, key) 981 | for j := range key1 { 982 | key1[j] ^= byte(i) 983 | } 984 | c, _ = rc4.NewCipher(key1) 985 | c.XORKeyStream(u, u) 986 | } 987 | } 988 | 989 | if !bytes.HasPrefix([]byte(U), u) { 990 | return ErrInvalidPassword 991 | } 992 | 993 | r.key = key 994 | r.useAES = V == 4 995 | 996 | return nil 997 | } 998 | 999 | var ErrInvalidPassword = fmt.Errorf("encrypted PDF: invalid password") 1000 | 1001 | func okayV4(encrypt dict) bool { 1002 | cf, ok := encrypt["CF"].(dict) 1003 | if !ok { 1004 | return false 1005 | } 1006 | stmf, ok := encrypt["StmF"].(name) 1007 | if !ok { 1008 | return false 1009 | } 1010 | strf, ok := encrypt["StrF"].(name) 1011 | if !ok { 1012 | return false 1013 | } 1014 | if stmf != strf { 1015 | return false 1016 | } 1017 | cfparam, ok := cf[stmf].(dict) 1018 | if cfparam["AuthEvent"] != nil && cfparam["AuthEvent"] != name("DocOpen") { 1019 | return false 1020 | } 1021 | if cfparam["Length"] != nil && cfparam["Length"] != int64(16) { 1022 | return false 1023 | } 1024 | if cfparam["CFM"] != name("AESV2") { 1025 | return false 1026 | } 1027 | return true 1028 | } 1029 | 1030 | func cryptKey(key []byte, useAES bool, ptr objptr) []byte { 1031 | h := md5.New() 1032 | h.Write(key) 1033 | h.Write([]byte{byte(ptr.id), byte(ptr.id >> 8), byte(ptr.id >> 16), byte(ptr.gen), byte(ptr.gen >> 8)}) 1034 | if useAES { 1035 | h.Write([]byte("sAlT")) 1036 | } 1037 | return h.Sum(nil) 1038 | } 1039 | 1040 | func decryptString(key []byte, useAES bool, ptr objptr, x string) (string, error) { 1041 | key = cryptKey(key, useAES, ptr) 1042 | if useAES { 1043 | return "", errors.New("AES not implemented") 1044 | } 1045 | 1046 | c, _ := rc4.NewCipher(key) 1047 | data := []byte(x) 1048 | c.XORKeyStream(data, data) 1049 | return string(data), nil 1050 | } 1051 | 1052 | func decryptStream(key []byte, useAES bool, ptr objptr, rd io.Reader) io.Reader { 1053 | key = cryptKey(key, useAES, ptr) 1054 | if useAES { 1055 | cb, err := aes.NewCipher(key) 1056 | if err != nil { 1057 | return &errorReadCloser{errors.New("AES: " + err.Error())} 1058 | } 1059 | iv := make([]byte, 16) 1060 | io.ReadFull(rd, iv) 1061 | cbc := cipher.NewCBCDecrypter(cb, iv) 1062 | rd = &cbcReader{cbc: cbc, rd: rd, buf: make([]byte, 16)} 1063 | } else { 1064 | c, _ := rc4.NewCipher(key) 1065 | rd = &cipher.StreamReader{S: c, R: rd} 1066 | } 1067 | return rd 1068 | } 1069 | 1070 | type cbcReader struct { 1071 | cbc cipher.BlockMode 1072 | rd io.Reader 1073 | buf []byte 1074 | pend []byte 1075 | } 1076 | 1077 | func (r *cbcReader) Read(b []byte) (n int, err error) { 1078 | if len(r.pend) == 0 { 1079 | _, err = io.ReadFull(r.rd, r.buf) 1080 | if err != nil { 1081 | return 0, err 1082 | } 1083 | r.cbc.CryptBlocks(r.buf, r.buf) 1084 | r.pend = r.buf 1085 | } 1086 | n = copy(b, r.pend) 1087 | r.pend = r.pend[n:] 1088 | return n, nil 1089 | } 1090 | -------------------------------------------------------------------------------- /text.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package pdf 6 | 7 | import ( 8 | "unicode" 9 | "unicode/utf16" 10 | ) 11 | 12 | const noRune = unicode.ReplacementChar 13 | 14 | func isPDFDocEncoded(s string) bool { 15 | if isUTF16(s) { 16 | return false 17 | } 18 | for i := 0; i < len(s); i++ { 19 | if pdfDocEncoding[s[i]] == noRune { 20 | return false 21 | } 22 | } 23 | return true 24 | } 25 | 26 | func pdfDocDecode(s string) string { 27 | for i := 0; i < len(s); i++ { 28 | if s[i] >= 0x80 || pdfDocEncoding[s[i]] != rune(s[i]) { 29 | goto Decode 30 | } 31 | } 32 | return s 33 | 34 | Decode: 35 | r := make([]rune, len(s)) 36 | for i := 0; i < len(s); i++ { 37 | r[i] = pdfDocEncoding[s[i]] 38 | } 39 | return string(r) 40 | } 41 | 42 | func isUTF16(s string) bool { 43 | return len(s) >= 2 && s[0] == 0xfe && s[1] == 0xff && len(s)%2 == 0 44 | } 45 | 46 | func utf16Decode(s string) string { 47 | var u []uint16 48 | for i := 0; i < len(s); i += 2 { 49 | u = append(u, uint16(s[i])<<8|uint16(s[i+1])) 50 | } 51 | return string(utf16.Decode(u)) 52 | } 53 | 54 | // See PDF 32000-1:2008, Table D.2 55 | var pdfDocEncoding = [256]rune{ 56 | noRune, noRune, noRune, noRune, noRune, noRune, noRune, noRune, 57 | noRune, 0x0009, 0x000a, noRune, noRune, 0x000d, noRune, noRune, 58 | noRune, noRune, noRune, noRune, noRune, noRune, noRune, noRune, 59 | 0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc, 60 | 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 61 | 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 62 | 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 63 | 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 64 | 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 65 | 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 66 | 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 67 | 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 68 | 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 69 | 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 70 | 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 71 | 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, noRune, 72 | 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, 73 | 0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018, 74 | 0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, 75 | 0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, noRune, 76 | 0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 77 | 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, noRune, 0x00ae, 0x00af, 78 | 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 79 | 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 80 | 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 81 | 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 82 | 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 83 | 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 84 | 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 85 | 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 86 | 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 87 | 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 88 | } 89 | 90 | var winAnsiEncoding = [256]rune{ 91 | 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 92 | 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 93 | 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 94 | 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 95 | 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 96 | 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 97 | 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 98 | 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 99 | 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 100 | 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 101 | 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 102 | 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 103 | 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 104 | 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 105 | 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 106 | 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, 107 | 0x20ac, noRune, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, 108 | 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, noRune, 0x017d, noRune, 109 | noRune, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, 110 | 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, noRune, 0x017e, 0x0178, 111 | 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 112 | 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 113 | 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 114 | 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 115 | 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 116 | 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 117 | 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 118 | 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 119 | 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 120 | 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 121 | 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 122 | 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 123 | } 124 | 125 | var macRomanEncoding = [256]rune{ 126 | 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 127 | 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 128 | 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 129 | 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 130 | 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 131 | 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 132 | 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 133 | 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 134 | 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 135 | 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 136 | 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 137 | 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 138 | 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 139 | 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 140 | 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 141 | 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, 142 | 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, 143 | 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, 144 | 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, 145 | 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, 146 | 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, 147 | 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, 148 | 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, 149 | 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8, 150 | 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, 151 | 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, 152 | 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, 153 | 0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02, 154 | 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1, 155 | 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, 156 | 0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc, 157 | 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7, 158 | } 159 | --------------------------------------------------------------------------------