├── LICENSE
├── README.md
├── lex.go
├── name.go
├── page.go
├── pdfpasswd
    └── main.go
├── ps.go
├── read.go
└── text.go


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2009 The Go Authors. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 |    * Redistributions of source code must retain the above copyright
 8 | notice, this list of conditions and the following disclaimer.
 9 |    * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 |    * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PDF Reader
 2 | 
 3 | A simple Go library which enables reading PDF files.
 4 | 
 5 | Features
 6 |   - Get plain text content (without formatting)
 7 |   - Get Content (including all font and formatting information)
 8 | 
 9 | ## Install:
10 | 
11 | `go get -u github.com/rsc/pdf`
12 | 
13 | 
14 | ## Read plain text
15 | 
16 | ```golang
17 | package main
18 | 
19 | import (
20 | 	"bytes"
21 | 	"fmt"
22 | 
23 | 	"github.com/rsc/pdf"
24 | )
25 | 
26 | func main() {
27 | 	content, err := readPdf("test.pdf") // Read local pdf file
28 | 	if err != nil {
29 | 		panic(err)
30 | 	}
31 | 	fmt.Println(content)
32 | 	return
33 | }
34 | 
35 | func readPdf(path string) (string, error) {
36 | 	r, err := pdf.Open(path)
37 | 	if err != nil {
38 | 		return "", err
39 | 	}
40 | 
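41 | 	text, err := r.GetPlainText() // returns (io.Reader, error)
42 | 	if err != nil {
43 | 		return "", err
44 | 	}
45 | 	var buf bytes.Buffer
46 | 	if _, err := buf.ReadFrom(text); err != nil {
47 | 		return "", err
48 | 	}
49 | 	return buf.String(), nil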
44 | }
45 | ```
46 | 
47 | ## Read all text with styles from PDF
48 | 
49 | ```golang
50 | func readPdf2(path string) (string, error) {
51 | 	r, err := pdf.Open(path)
52 | 	if err != nil {
53 | 		return "", err
54 | 	}
55 | 	totalPage := r.NumPage()
56 | 
57 | 	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
58 | 		p := r.Page(pageIndex)
59 | 		if p.V.IsNull() {
60 | 			continue
61 | 		}
62 | 		var lastTextStyle pdf.Text
63 | 		texts := p.Content().Text
64 | 		for _, text := range texts {
65 | 			if isSameSentence(text, lastTextStyle) {
66 | 				lastTextStyle.S = lastTextStyle.S + text.S
67 | 			} else {
68 | 				fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S)
69 | 				lastTextStyle = text
70 | 			}
71 | 		}
72 | 	}
73 | 	return "", nil
74 | }
75 | ```
76 | 
77 | ## Demo
78 | ![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif)


--------------------------------------------------------------------------------
/lex.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2014 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // Reading of PDF tokens and objects from a raw byte stream.
  6 | 
  7 | package pdf
  8 | 
  9 | import (
 10 | 	"io"
 11 | 	"strconv"
 12 | 
 13 | 	"github.com/pkg/errors"
 14 | )
 15 | 
 16 | // A token is a PDF token in the input stream, one of the following Go types:
 17 | //
 18 | //	bool, a PDF boolean
 19 | //	int64, a PDF integer
 20 | //	float64, a PDF real
 21 | //	string, a PDF string literal
 22 | //	keyword, a PDF keyword
 23 | //	name, a PDF name without the leading slash
 24 | //
 25 | type token interface{}
 26 | 
 27 | // A name is a PDF name, without the leading slash.
 28 | type name string
 29 | 
 30 | // A keyword is a PDF keyword.
 31 | // Delimiter tokens used in higher-level syntax,
 32 | // such as "<<", ">>", "[", "]", "{", "}", are also treated as keywords.
 33 | type keyword string
 34 | 
// A buffer holds buffered input bytes from the PDF file,
// together with the lexer state used while reading tokens and objects.
type buffer struct {
	r           io.Reader // source of data
	buf         []byte    // buffered data
	pos         int       // read index in buf
	offset      int64     // offset at end of buf; aka offset of next read
	tmp         []byte    // scratch space for accumulating token
	unread      []token   // read but then unread tokens; readToken pops the most recently unread one first
	allowEOF    bool      // if set, reload reports io.EOF as end of input (sets eof) instead of an error
	allowObjptr bool      // if set, readObject recognizes "N G R" references and "N G obj" definitions
	allowStream bool      // if set, readDict looks for a "stream" keyword after the dictionary
	eof         bool      // set by reload when io.EOF is seen and allowEOF is set
	key         []byte    // if non-nil, readObject passes string tokens through decryptString with this key
	useAES      bool      // forwarded to decryptString
	objptr      objptr    // object currently being defined; used for string decryption and recorded in streams
}
 51 | 
 52 | // newBuffer returns a new buffer reading from r at the given offset.
 53 | func newBuffer(r io.Reader, offset int64) *buffer {
 54 | 	return &buffer{
 55 | 		r:           r,
 56 | 		offset:      offset,
 57 | 		buf:         make([]byte, 0, 4096),
 58 | 		allowObjptr: true,
 59 | 		allowStream: true,
 60 | 	}
 61 | }
 62 | 
 63 | func (b *buffer) seek(offset int64) {
 64 | 	b.offset = offset
 65 | 	b.buf = b.buf[:0]
 66 | 	b.pos = 0
 67 | 	b.unread = b.unread[:0]
 68 | }
 69 | 
 70 | func (b *buffer) readByte() (byte, error) {
 71 | 	if b.pos >= len(b.buf) {
 72 | 		if _, err := b.reload(); err != nil {
 73 | 			return '\x00', err
 74 | 		}
 75 | 		if b.pos >= len(b.buf) {
 76 | 			return '\n', nil
 77 | 		}
 78 | 	}
 79 | 	c := b.buf[b.pos]
 80 | 	b.pos++
 81 | 	return c, nil
 82 | }
 83 | 
 84 | func (b *buffer) reload() (bool, error) {
 85 | 	n := cap(b.buf) - int(b.offset%int64(cap(b.buf)))
 86 | 	n, err := b.r.Read(b.buf[:n])
 87 | 	if n == 0 && err != nil {
 88 | 		b.buf = b.buf[:0]
 89 | 		b.pos = 0
 90 | 		if b.allowEOF && err == io.EOF {
 91 | 			b.eof = true
 92 | 			return false, nil
 93 | 		}
 94 | 		return false, errors.Errorf("malformed PDF: reading at offset %d: %v", b.offset, err)
 95 | 	}
 96 | 	b.offset += int64(n)
 97 | 	b.buf = b.buf[:n]
 98 | 	b.pos = 0
 99 | 	return true, nil
100 | }
101 | 
102 | func (b *buffer) seekForward(offset int64) {
103 | 	for b.offset < offset {
104 | 		if ok, _ := b.reload(); !ok {
105 | 			return
106 | 		}
107 | 	}
108 | 	b.pos = len(b.buf) - int(b.offset-offset)
109 | }
110 | 
111 | func (b *buffer) readOffset() int64 {
112 | 	return b.offset - int64(len(b.buf)) + int64(b.pos)
113 | }
114 | 
115 | func (b *buffer) unreadByte() {
116 | 	if b.pos > 0 {
117 | 		b.pos--
118 | 	}
119 | }
120 | 
// unreadToken pushes t back so that a later readToken returns it.
// Tokens come back in last-in, first-out order.
func (b *buffer) unreadToken(t token) {
	b.unread = append(b.unread, t)
}
124 | 
125 | func (b *buffer) readToken() token {
126 | 	if n := len(b.unread); n > 0 {
127 | 		t := b.unread[n-1]
128 | 		b.unread = b.unread[:n-1]
129 | 		return t
130 | 	}
131 | 
132 | 	// Find first non-space, non-comment byte.
133 | 	c, err := b.readByte()
134 | 	for {
135 | 		if err != nil {
136 | 			return err
137 | 		}
138 | 		if isSpace(c) {
139 | 			if b.eof {
140 | 				return io.EOF
141 | 			}
142 | 			c, err = b.readByte()
143 | 		} else if c == '%' {
144 | 			for c != '\r' && c != '\n' {
145 | 				c, err = b.readByte()
146 | 			}
147 | 		} else {
148 | 			break
149 | 		}
150 | 	}
151 | 
152 | 	switch c {
153 | 	case '<':
154 | 		if b, _ := b.readByte(); b == '<' {
155 | 			return keyword("<<")
156 | 		}
157 | 		b.unreadByte()
158 | 		return b.readHexString()
159 | 
160 | 	case '(':
161 | 		return b.readLiteralString()
162 | 
163 | 	case '[', ']', '{', '}':
164 | 		return keyword(string(c))
165 | 
166 | 	case '/':
167 | 		return b.readName()
168 | 
169 | 	case '>':
170 | 		if b, _ := b.readByte(); b == '>' {
171 | 			return keyword(">>")
172 | 		}
173 | 		b.unreadByte()
174 | 		fallthrough
175 | 
176 | 	default:
177 | 		if isDelim(c) {
178 | 			return errors.Errorf("unexpected delimiter %#q", rune(c))
179 | 		}
180 | 		b.unreadByte()
181 | 		return b.readKeyword()
182 | 	}
183 | }
184 | 
185 | func (b *buffer) readHexString() token {
186 | 	tmp := b.tmp[:0]
187 | 	for {
188 | 	Loop:
189 | 		c, err := b.readByte()
190 | 		if err != nil {
191 | 			return err
192 | 		}
193 | 		if c == '>' {
194 | 			break
195 | 		}
196 | 		if isSpace(c) {
197 | 			goto Loop
198 | 		}
199 | 	Loop2:
200 | 		c2, err := b.readByte()
201 | 		if err != nil {
202 | 			return err
203 | 		}
204 | 		if isSpace(c2) {
205 | 			goto Loop2
206 | 		}
207 | 		x := unhex(c)<<4 | unhex(c2)
208 | 		if x < 0 {
209 | 			return errors.Errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:])
210 | 		}
211 | 		tmp = append(tmp, byte(x))
212 | 	}
213 | 	b.tmp = tmp
214 | 	return string(tmp)
215 | }
216 | 
// unhex converts a single hexadecimal digit (either case) to its value
// in the range 0..15. Any other byte yields -1.
func unhex(b byte) int {
	if b >= '0' && b <= '9' {
		return int(b - '0')
	}
	if b >= 'a' && b <= 'f' {
		return int(b-'a') + 10
	}
	if b >= 'A' && b <= 'F' {
		return int(b-'A') + 10
	}
	return -1
}
228 | 
229 | func (b *buffer) readLiteralString() token {
230 | 	tmp := b.tmp[:0]
231 | 	depth := 1
232 | Loop:
233 | 	for {
234 | 		c, err := b.readByte()
235 | 		if err != nil {
236 | 			return err
237 | 		}
238 | 		switch c {
239 | 		default:
240 | 			tmp = append(tmp, c)
241 | 		case '(':
242 | 			depth++
243 | 			tmp = append(tmp, c)
244 | 		case ')':
245 | 			if depth--; depth == 0 {
246 | 				break Loop
247 | 			}
248 | 			tmp = append(tmp, c)
249 | 		case '\\':
250 | 			if c, err = b.readByte(); err != nil {
251 | 				return err
252 | 			}
253 | 			switch c {
254 | 			default:
255 | 				return errors.Errorf("invalid escape sequence \\%c", c)
256 | 			case 'n':
257 | 				tmp = append(tmp, '\n')
258 | 			case 'r':
259 | 				tmp = append(tmp, '\r')
260 | 			case 'b':
261 | 				tmp = append(tmp, '\b')
262 | 			case 't':
263 | 				tmp = append(tmp, '\t')
264 | 			case 'f':
265 | 				tmp = append(tmp, '\f')
266 | 			case '(', ')', '\\':
267 | 				tmp = append(tmp, c)
268 | 			case '\r':
269 | 				if c, _ := b.readByte(); c != '\n' {
270 | 					b.unreadByte()
271 | 				}
272 | 				fallthrough
273 | 			case '\n':
274 | 				// no append
275 | 			case '0', '1', '2', '3', '4', '5', '6', '7':
276 | 				x := int(c - '0')
277 | 				for i := 0; i < 2; i++ {
278 | 					c, err = b.readByte()
279 | 					if err != nil {
280 | 						return err
281 | 					}
282 | 					if c < '0' || c > '7' {
283 | 						b.unreadByte()
284 | 						break
285 | 					}
286 | 					x = x*8 + int(c-'0')
287 | 				}
288 | 				if x > 255 {
289 | 					return errors.Errorf("invalid octal escape \\%03o", x)
290 | 				}
291 | 				tmp = append(tmp, byte(x))
292 | 			}
293 | 		}
294 | 	}
295 | 	b.tmp = tmp
296 | 	return string(tmp)
297 | }
298 | 
299 | func (b *buffer) readName() token {
300 | 	tmp := b.tmp[:0]
301 | 	for {
302 | 		c, err := b.readByte()
303 | 		if err != nil {
304 | 			return err
305 | 		}
306 | 		if isDelim(c) || isSpace(c) {
307 | 			b.unreadByte()
308 | 			break
309 | 		}
310 | 		if c == '#' {
311 | 			hi, err := b.readByte()
312 | 			if err != nil {
313 | 				return err
314 | 			}
315 | 			lo, err := b.readByte()
316 | 			if err != nil {
317 | 				return err
318 | 			}
319 | 			x := unhex(hi)<<4 | unhex(lo)
320 | 			if x < 0 {
321 | 				return errors.Errorf("malformed name")
322 | 			}
323 | 			tmp = append(tmp, byte(x))
324 | 			continue
325 | 		}
326 | 		tmp = append(tmp, c)
327 | 	}
328 | 	b.tmp = tmp
329 | 	return name(string(tmp))
330 | }
331 | 
332 | func (b *buffer) readKeyword() token {
333 | 	tmp := b.tmp[:0]
334 | 	for {
335 | 		c, err := b.readByte()
336 | 		if err != nil {
337 | 			return err
338 | 		}
339 | 		if isDelim(c) || isSpace(c) {
340 | 			b.unreadByte()
341 | 			break
342 | 		}
343 | 		tmp = append(tmp, c)
344 | 	}
345 | 	b.tmp = tmp
346 | 	s := string(tmp)
347 | 	switch {
348 | 	case s == "true":
349 | 		return true
350 | 	case s == "false":
351 | 		return false
352 | 	case isInteger(s):
353 | 		x, err := strconv.ParseInt(s, 10, 64)
354 | 		if err != nil {
355 | 			return errors.Errorf("invalid integer %s", s)
356 | 		}
357 | 		return x
358 | 	case isReal(s):
359 | 		x, err := strconv.ParseFloat(s, 64)
360 | 		if err != nil {
361 | 			return errors.Errorf("invalid real %s", s)
362 | 		}
363 | 		return x
364 | 	}
365 | 	return keyword(string(tmp))
366 | }
367 | 
// isInteger reports whether s is an optional '+' or '-' sign followed
// by one or more decimal digits and nothing else.
func isInteger(s string) bool {
	digits := s
	if digits != "" && (digits[0] == '+' || digits[0] == '-') {
		digits = digits[1:]
	}
	if digits == "" {
		return false
	}
	for i := 0; i < len(digits); i++ {
		if d := digits[i]; d < '0' || d > '9' {
			return false
		}
	}
	return true
}
382 | 
// isReal reports whether s is an optional '+' or '-' sign followed by a
// non-empty run of decimal digits and dots that contains exactly one
// dot. Note that a lone "." passes this test.
func isReal(s string) bool {
	rest := s
	if rest != "" && (rest[0] == '+' || rest[0] == '-') {
		rest = rest[1:]
	}
	if rest == "" {
		return false
	}
	dots := 0
	for i := 0; i < len(rest); i++ {
		switch ch := rest[i]; {
		case ch == '.':
			dots++
		case ch < '0' || ch > '9':
			return false
		}
	}
	return dots == 1
}
402 | 
403 | // An object is a PDF syntax object, one of the following Go types:
404 | //
405 | //	bool, a PDF boolean
406 | //	int64, a PDF integer
407 | //	float64, a PDF real
408 | //	string, a PDF string literal
409 | //	name, a PDF name without the leading slash
410 | //	dict, a PDF dictionary
411 | //	array, a PDF array
412 | //	stream, a PDF stream
413 | //	objptr, a PDF object reference
414 | //	objdef, a PDF object definition
415 | //
416 | // An object may also be nil, to represent the PDF null.
417 | type object interface{}
418 | 
// A dict is a PDF dictionary: a map from names (without the leading
// slash) to objects.
type dict map[name]object

// An array is a PDF array of objects.
type array []object

// A stream is a PDF stream as produced by readDict: the stream
// dictionary plus where the stream data starts. The data itself is not
// read.
type stream struct {
	hdr    dict   // dictionary that preceded the "stream" keyword
	ptr    objptr // the buffer's objptr at the time the stream was read
	offset int64  // read offset just past the line ending that follows "stream"
}

// An objptr identifies an indirect object, as in "id gen R".
type objptr struct {
	id  uint32 // first integer of the reference
	gen uint16 // second integer of the reference
}

// An objdef is an indirect object definition, as in "id gen obj ...".
type objdef struct {
	ptr objptr // identity of the object being defined
	obj object // the object read after the "obj" keyword
}
438 | 
439 | func (b *buffer) readObject() object {
440 | 	tok := b.readToken()
441 | 	if kw, ok := tok.(keyword); ok {
442 | 		switch kw {
443 | 		case "null":
444 | 			return nil
445 | 		case "<<":
446 | 			return b.readDict()
447 | 		case "[":
448 | 			return b.readArray()
449 | 		}
450 | 		return errors.Errorf("unexpected keyword %q parsing object", kw)
451 | 	}
452 | 
453 | 	if str, ok := tok.(string); ok && b.key != nil && b.objptr.id != 0 {
454 | 		var err error
455 | 		tok, err = decryptString(b.key, b.useAES, b.objptr, str)
456 | 		if err != nil {
457 | 			return err
458 | 		}
459 | 	}
460 | 
461 | 	if !b.allowObjptr {
462 | 		return tok
463 | 	}
464 | 
465 | 	if t1, ok := tok.(int64); ok && int64(uint32(t1)) == t1 {
466 | 		tok2 := b.readToken()
467 | 		if t2, ok := tok2.(int64); ok && int64(uint16(t2)) == t2 {
468 | 			tok3 := b.readToken()
469 | 			switch tok3 {
470 | 			case keyword("R"):
471 | 				return objptr{uint32(t1), uint16(t2)}
472 | 			case keyword("obj"):
473 | 				old := b.objptr
474 | 				b.objptr = objptr{uint32(t1), uint16(t2)}
475 | 				obj := b.readObject()
476 | 				if _, ok := obj.(stream); !ok {
477 | 					tok4 := b.readToken()
478 | 					if tok4 != keyword("endobj") {
479 | 						return errors.Errorf("missing endobj after indirect object definition")
480 | 					}
481 | 				}
482 | 				b.objptr = old
483 | 				return objdef{objptr{uint32(t1), uint16(t2)}, obj}
484 | 			}
485 | 			b.unreadToken(tok3)
486 | 		}
487 | 		b.unreadToken(tok2)
488 | 	}
489 | 	return tok
490 | }
491 | 
492 | func (b *buffer) readArray() object {
493 | 	var x array
494 | 	for {
495 | 		tok := b.readToken()
496 | 		if tok == nil || tok == keyword("]") {
497 | 			break
498 | 		}
499 | 		b.unreadToken(tok)
500 | 		x = append(x, b.readObject())
501 | 	}
502 | 	return x
503 | }
504 | 
505 | func (b *buffer) readDict() object {
506 | 	x := make(dict)
507 | 	for {
508 | 		tok := b.readToken()
509 | 		if tok == nil || tok == keyword(">>") {
510 | 			break
511 | 		}
512 | 		n, ok := tok.(name)
513 | 		if !ok {
514 | 			return errors.Errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok)
515 | 		}
516 | 		x[n] = b.readObject()
517 | 	}
518 | 
519 | 	if !b.allowStream {
520 | 		return x
521 | 	}
522 | 
523 | 	tok := b.readToken()
524 | 	if tok != keyword("stream") {
525 | 		b.unreadToken(tok)
526 | 		return x
527 | 	}
528 | 
529 | 	c, err := b.readByte()
530 | 	if err != nil {
531 | 		return err
532 | 	}
533 | 	switch c {
534 | 	case '\r':
535 | 		if x, _ := b.readByte(); x != '\n' {
536 | 			b.unreadByte()
537 | 		}
538 | 	case '\n':
539 | 		// ok
540 | 	default:
541 | 		return errors.Errorf("stream keyword not followed by newline")
542 | 	}
543 | 
544 | 	return stream{x, b.objptr, b.readOffset()}
545 | }
546 | 
// isSpace reports whether b is one of the bytes the lexer treats as
// white space: NUL, tab, line feed, form feed, carriage return or space.
func isSpace(b byte) bool {
	return b == '\x00' || b == '\t' || b == '\n' || b == '\f' || b == '\r' || b == ' '
}
554 | 
// isDelim reports whether b is one of the delimiter bytes
// < > ( ) [ ] { } / %.
func isDelim(b byte) bool {
	for _, d := range [...]byte{'<', '>', '(', ')', '[', ']', '{', '}', '/', '%'} {
		if b == d {
			return true
		}
	}
	return false
}
562 | 


--------------------------------------------------------------------------------
/page.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2014 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package pdf
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"fmt"
 10 | 	"io"
 11 | 	"strings"
 12 | 
 13 | 	"github.com/pkg/errors"
 14 | )
 15 | 
// A Page represents a single page in a PDF file.
// The methods interpret a Page dictionary stored in V.
type Page struct {
	V Value
}
 21 | 
 22 | // Page returns the page for the given page number.
 23 | // Page numbers are indexed starting at 1, not 0.
 24 | // If the page is not found, Page returns a Page with p.V.IsNull().
 25 | func (r *Reader) Page(num int) Page {
 26 | 	num-- // now 0-indexed
 27 | 	page := r.Trailer().Key("Root").Key("Pages")
 28 | Search:
 29 | 	for page.Key("Type").Name() == "Pages" {
 30 | 		count := int(page.Key("Count").Int64())
 31 | 		if count < num {
 32 | 			return Page{}
 33 | 		}
 34 | 		kids := page.Key("Kids")
 35 | 		for i := 0; i < kids.Len(); i++ {
 36 | 			kid := kids.Index(i)
 37 | 			if kid.Key("Type").Name() == "Pages" {
 38 | 				c := int(kid.Key("Count").Int64())
 39 | 				if num < c {
 40 | 					page = kid
 41 | 					continue Search
 42 | 				}
 43 | 				num -= c
 44 | 				continue
 45 | 			}
 46 | 			if kid.Key("Type").Name() == "Page" {
 47 | 				if num == 0 {
 48 | 					return Page{kid}
 49 | 				}
 50 | 				num--
 51 | 			}
 52 | 		}
 53 | 	}
 54 | 	return Page{}
 55 | }
 56 | 
 57 | // NumPage returns the number of pages in the PDF file.
 58 | func (r *Reader) NumPage() int {
 59 | 	return int(r.Trailer().Key("Root").Key("Pages").Key("Count").Int64())
 60 | }
 61 | 
 62 | // GetPlainText returns all the text in the PDF file
 63 | func (r *Reader) GetPlainText() (io.Reader, error) {
 64 | 	pages := r.NumPage()
 65 | 	var buf bytes.Buffer
 66 | 	fonts := make(map[string]*Font)
 67 | 	for i := 1; i <= pages; i++ {
 68 | 		p := r.Page(i)
 69 | 		for _, name := range p.Fonts() { // cache fonts so we don't continually parse charmap
 70 | 			if _, ok := fonts[name]; !ok {
 71 | 				f := p.Font(name)
 72 | 				fonts[name] = &f
 73 | 			}
 74 | 		}
 75 | 		r, err := p.GetPlainText(fonts)
 76 | 		if err != nil {
 77 | 			return nil, err
 78 | 		}
 79 | 		_, err = buf.ReadFrom(r)
 80 | 		if err != nil {
 81 | 			return nil, err
 82 | 		}
 83 | 	}
 84 | 	return &buf, nil
 85 | }
 86 | 
 87 | func (p Page) findInherited(key string) Value {
 88 | 	for v := p.V; !v.IsNull(); v = v.Key("Parent") {
 89 | 		if r := v.Key(key); !r.IsNull() {
 90 | 			return r
 91 | 		}
 92 | 	}
 93 | 	return Value{}
 94 | }
 95 | 
 96 | /*
 97 | func (p Page) MediaBox() Value {
 98 | 	return p.findInherited("MediaBox")
 99 | }
100 | 
101 | func (p Page) CropBox() Value {
102 | 	return p.findInherited("CropBox")
103 | }
104 | */
105 | 
// Resources returns the resources dictionary associated with the page.
// The "Resources" entry is searched on the page itself and then on its
// ancestors (see findInherited).
func (p Page) Resources() Value {
	return p.findInherited("Resources")
}
110 | 
111 | // Fonts returns a list of the fonts associated with the page.
112 | func (p Page) Fonts() []string {
113 | 	return p.Resources().Key("Font").Keys()
114 | }
115 | 
116 | // Font returns the font with the given name associated with the page.
117 | func (p Page) Font(name string) Font {
118 | 	return Font{p.Resources().Key("Font").Key(name), nil}
119 | }
120 | 
// A Font represents a font in a PDF file.
// The methods interpret a Font dictionary stored in V.
type Font struct {
	V   Value
	enc TextEncoding // encoder cached by Encoder; nil until first use
}
127 | 
128 | // BaseFont returns the font's name (BaseFont property).
129 | func (f Font) BaseFont() string {
130 | 	return f.V.Key("BaseFont").Name()
131 | }
132 | 
133 | // FirstChar returns the code point of the first character in the font.
134 | func (f Font) FirstChar() int {
135 | 	return int(f.V.Key("FirstChar").Int64())
136 | }
137 | 
138 | // LastChar returns the code point of the last character in the font.
139 | func (f Font) LastChar() int {
140 | 	return int(f.V.Key("LastChar").Int64())
141 | }
142 | 
143 | // Widths returns the widths of the glyphs in the font.
144 | // In a well-formed PDF, len(f.Widths()) == f.LastChar()+1 - f.FirstChar().
145 | func (f Font) Widths() []float64 {
146 | 	x := f.V.Key("Widths")
147 | 	var out []float64
148 | 	for i := 0; i < x.Len(); i++ {
149 | 		out = append(out, x.Index(i).Float64())
150 | 	}
151 | 	return out
152 | }
153 | 
154 | // Width returns the width of the given code point.
155 | func (f Font) Width(code int) float64 {
156 | 	first := f.FirstChar()
157 | 	last := f.LastChar()
158 | 	if code < first || last < code {
159 | 		return 0
160 | 	}
161 | 	return f.V.Key("Widths").Index(code - first).Float64()
162 | }
163 | 
164 | // Encoder returns the encoding between font code point sequences and UTF-8.
165 | func (f *Font) Encoder() TextEncoding {
166 | 	if f.enc == nil { // caching the Encoder so we don't have to continually parse charmap
167 | 		f.enc = f.getEncoder()
168 | 	}
169 | 	return f.enc
170 | }
171 | 
172 | func (f Font) getEncoder() TextEncoding {
173 | 	if !f.V.Key("ToUnicode").IsNull() {
174 | 		return f.charmapEncoding()
175 | 	}
176 | 
177 | 	enc := f.V.Key("Encoding")
178 | 	switch enc.Kind() {
179 | 	case Name:
180 | 		switch enc.Name() {
181 | 		case "WinAnsiEncoding":
182 | 			return &byteEncoder{&winAnsiEncoding}
183 | 		case "MacRomanEncoding":
184 | 			return &byteEncoder{&macRomanEncoding}
185 | 		case "Identity-H":
186 | 			return f.charmapEncoding()
187 | 		default:
188 | 			println("unknown encoding", enc.Name())
189 | 			return &nopEncoder{}
190 | 		}
191 | 	case Dict:
192 | 		return &dictEncoder{enc.Key("Differences")}
193 | 	case Null:
194 | 		return f.charmapEncoding()
195 | 	default:
196 | 		println("unexpected encoding", enc.String())
197 | 		return &nopEncoder{}
198 | 	}
199 | }
200 | 
201 | func (f *Font) charmapEncoding() TextEncoding {
202 | 	toUnicode := f.V.Key("ToUnicode")
203 | 	if toUnicode.Kind() == Stream {
204 | 		m, err := readCmap(toUnicode)
205 | 		if err != nil {
206 | 			return &nopEncoder{}
207 | 		}
208 | 		return m
209 | 	}
210 | 
211 | 	return &byteEncoder{&pdfDocEncoding}
212 | }
213 | 
214 | type dictEncoder struct {
215 | 	v Value
216 | }
217 | 
218 | func (e *dictEncoder) Decode(raw string) (text string) {
219 | 	r := make([]rune, 0, len(raw))
220 | 	for i := 0; i < len(raw); i++ {
221 | 		ch := rune(raw[i])
222 | 		n := -1
223 | 		for j := 0; j < e.v.Len(); j++ {
224 | 			x := e.v.Index(j)
225 | 			if x.Kind() == Integer {
226 | 				n = int(x.Int64())
227 | 				continue
228 | 			}
229 | 			if x.Kind() == Name {
230 | 				if int(raw[i]) == n {
231 | 					r := nameToRune[x.Name()]
232 | 					if r != 0 {
233 | 						ch = r
234 | 						break
235 | 					}
236 | 				}
237 | 				n++
238 | 			}
239 | 		}
240 | 		r = append(r, ch)
241 | 	}
242 | 	return string(r)
243 | }
244 | 
245 | // A TextEncoding represents a mapping between
246 | // font code points and UTF-8 text.
247 | type TextEncoding interface {
248 | 	// Decode returns the UTF-8 text corresponding to
249 | 	// the sequence of code points in raw.
250 | 	Decode(raw string) (text string)
251 | }
252 | 
// A nopEncoder is a TextEncoding that performs no conversion.
type nopEncoder struct {
}

// Decode returns raw unchanged.
func (e *nopEncoder) Decode(raw string) (text string) {
	return raw
}
259 | 
// A byteEncoder is a TextEncoding for single-byte encodings: table
// gives the rune for each of the 256 possible byte values.
type byteEncoder struct {
	table *[256]rune
}

// Decode translates every byte of raw through the table and returns the
// resulting runes as UTF-8 text.
func (e *byteEncoder) Decode(raw string) (text string) {
	out := make([]rune, len(raw))
	for i := range out {
		out[i] = e.table[raw[i]]
	}
	return string(out)
}
271 | 
// A byteRange is one codespace range: the inclusive bounds low..high,
// held as raw byte strings of equal length.
type byteRange struct {
	low  string
	high string
}

// A bfchar maps a single source code to its replacement.
type bfchar struct {
	orig string // source code bytes
	repl string // replacement, passed through utf16Decode when used
}

// A bfrange maps the inclusive range of source codes lo..hi to dst.
type bfrange struct {
	lo  string
	hi  string
	dst Value // destination as popped from the interpreter stack
}

// A cmap is a character map read from a ToUnicode stream.
type cmap struct {
	space   [4][]byteRange // codespace range, indexed by code length in bytes minus one
	bfrange []bfrange
	bfchar  []bfchar
}
293 | 
294 | func (m *cmap) Decode(raw string) (text string) {
295 | 	var r []rune
296 | Parse:
297 | 	for len(raw) > 0 {
298 | 		for n := 1; n <= 4 && n <= len(raw); n++ { // number of digits in character replacement (1-4 possible)
299 | 			for _, space := range m.space[n-1] { // find matching codespace Ranges for number of digits
300 | 				if space.low <= raw[:n] && raw[:n] <= space.high { // see if value is in range
301 | 					text := raw[:n]
302 | 					raw = raw[n:]
303 | 					for _, bfchar := range m.bfchar { // check for matching bfchar
304 | 						if len(bfchar.orig) == n && bfchar.orig == text {
305 | 							r = append(r, []rune(utf16Decode(bfchar.repl))...)
306 | 							continue Parse
307 | 						}
308 | 					}
309 | 					for _, bfrange := range m.bfrange { // check for matching bfrange
310 | 						if len(bfrange.lo) == n && bfrange.lo <= text && text <= bfrange.hi {
311 | 							if bfrange.dst.Kind() == String {
312 | 								s := bfrange.dst.RawString()
313 | 								if bfrange.lo != text { // value isn't at the beginning of the range so scale result
314 | 									b := []byte(s)
315 | 									b[len(b)-1] += text[len(text)-1] - bfrange.lo[len(bfrange.lo)-1] // increment last byte by difference
316 | 									s = string(b)
317 | 								}
318 | 								r = append(r, []rune(utf16Decode(s))...)
319 | 								continue Parse
320 | 							}
321 | 							if bfrange.dst.Kind() == Array {
322 | 								fmt.Printf("array %v\n", bfrange.dst)
323 | 							} else {
324 | 								fmt.Printf("unknown dst %v\n", bfrange.dst)
325 | 							}
326 | 							r = append(r, noRune)
327 | 							continue Parse
328 | 						}
329 | 					}
330 | 					r = append(r, noRune)
331 | 					continue Parse
332 | 				}
333 | 			}
334 | 		}
335 | 		println("no code space found")
336 | 		r = append(r, noRune)
337 | 		raw = raw[1:]
338 | 	}
339 | 	return string(r)
340 | }
341 | 
342 | func readCmap(toUnicode Value) (*cmap, error) {
343 | 	n := -1
344 | 	var m cmap
345 | 	err := Interpret(toUnicode, func(stk *Stack, op string) error {
346 | 		switch op {
347 | 		case "findresource":
348 | 			stk.Pop() // category
349 | 			stk.Pop() // key
350 | 			stk.Push(newDict())
351 | 		case "begincmap":
352 | 			stk.Push(newDict())
353 | 		case "endcmap":
354 | 			stk.Pop()
355 | 		case "begincodespacerange":
356 | 			n = int(stk.Pop().Int64())
357 | 		case "endcodespacerange":
358 | 			if n < 0 {
359 | 				return errors.New("missing begincodespacerange")
360 | 			}
361 | 			for i := 0; i < n; i++ {
362 | 				hi, lo := stk.Pop().RawString(), stk.Pop().RawString()
363 | 				if len(lo) == 0 || len(lo) != len(hi) {
364 | 					return errors.New("bad codespace range")
365 | 				}
366 | 				m.space[len(lo)-1] = append(m.space[len(lo)-1], byteRange{lo, hi})
367 | 			}
368 | 			n = -1
369 | 		case "beginbfchar":
370 | 			n = int(stk.Pop().Int64())
371 | 		case "endbfchar":
372 | 			if n < 0 {
373 | 				return errors.New("missing beginbfchar")
374 | 			}
375 | 			for i := 0; i < n; i++ {
376 | 				repl, orig := stk.Pop().RawString(), stk.Pop().RawString()
377 | 				m.bfchar = append(m.bfchar, bfchar{orig, repl})
378 | 			}
379 | 		case "beginbfrange":
380 | 			n = int(stk.Pop().Int64())
381 | 		case "endbfrange":
382 | 			if n < 0 {
383 | 				return errors.New("missing beginbfrange")
384 | 			}
385 | 			for i := 0; i < n; i++ {
386 | 				dst, srcHi, srcLo := stk.Pop(), stk.Pop().RawString(), stk.Pop().RawString()
387 | 				m.bfrange = append(m.bfrange, bfrange{srcLo, srcHi, dst})
388 | 			}
389 | 		case "defineresource":
390 | 			stk.Pop().Name() // category
391 | 			value := stk.Pop()
392 | 			stk.Pop().Name() // key
393 | 			stk.Push(value)
394 | 		default:
395 | 			println("interp\t", op)
396 | 		}
397 | 		return nil
398 | 	})
399 | 	if err != nil {
400 | 		return nil, err
401 | 	}
402 | 	return &m, err
403 | }
404 | 
// A matrix is a 3x3 matrix of float64 values, indexed [row][column].
type matrix [3][3]float64

// ident is the 3x3 identity matrix.
var ident = matrix{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}

// mul returns the matrix product x*y.
func (x matrix) mul(y matrix) matrix {
	var z matrix
	for i := range z {
		for j := range z[i] {
			var sum float64
			for k := range y {
				sum += x[i][k] * y[k][j]
			}
			z[i][j] = sum
		}
	}
	return z
}
420 | 
421 | // A Text represents a single piece of text drawn on a page.
422 | type Text struct {
423 | 	Font     string  // the font used
424 | 	FontSize float64 // the font size, in points (1/72 of an inch)
425 | 	X        float64 // the X coordinate, in points, increasing left to right
426 | 	Y        float64 // the Y coordinate, in points, increasing bottom to top
427 | 	W        float64 // the width of the text, in points
428 | 	S        string  // the actual UTF-8 text
429 | }
430 | 
431 | // A Rect represents a rectangle.
432 | type Rect struct {
433 | 	Min, Max Point
434 | }
435 | 
436 | // A Point represents an X, Y pair.
437 | type Point struct {
438 | 	X float64
439 | 	Y float64
440 | }
441 | 
442 | // Content describes the basic content on a page: the text and any drawn rectangles.
443 | type Content struct {
444 | 	Text []Text
445 | 	Rect []Rect
446 | }
447 | 
// A gstate holds the graphics and text state tracked while a page's
// content is interpreted.
//
// NOTE(review): the field names look like the PDF text-state parameter
// names; the meanings given below are presumed from those names and
// should be confirmed against the content-stream interpreter.
type gstate struct {
	Tc    float64 // presumably character spacing
	Tw    float64 // presumably word spacing
	Th    float64 // horizontal scale factor applied to the font size; Content starts it at 1
	Tl    float64 // presumably leading
	Tf    Font    // presumably the current font
	Tfs   float64 // font size used when building the text rendering matrix
	Tmode int     // presumably the text rendering mode
	Trise float64 // vertical offset used when building the text rendering matrix
	Tm    matrix  // text matrix
	Tlm   matrix  // presumably the text line matrix
	Trm   matrix  // presumably the text rendering matrix
	CTM   matrix  // transformation applied after Tm; Content starts it at the identity
}
462 | 
463 | // GetPlainText returns the page's all text without format.
464 | // fonts can be passed in (to improve parsing performance) or left nil
465 | func (p Page) GetPlainText(fonts map[string]*Font) (io.Reader, error) {
466 | 	strm := p.V.Key("Contents")
467 | 	var enc TextEncoding = &nopEncoder{}
468 | 
469 | 	if fonts == nil {
470 | 		fonts = make(map[string]*Font)
471 | 		for _, font := range p.Fonts() {
472 | 			f := p.Font(font)
473 | 			fonts[font] = &f
474 | 		}
475 | 	}
476 | 
477 | 	var textBuilder bytes.Buffer
478 | 	showText := func(s string) {
479 | 		for _, ch := range enc.Decode(s) {
480 | 			textBuilder.WriteRune(ch)
481 | 		}
482 | 	}
483 | 
484 | 	err := Interpret(strm, func(stk *Stack, op string) error {
485 | 		n := stk.Len()
486 | 		args := make([]Value, n)
487 | 		for i := n - 1; i >= 0; i-- {
488 | 			args[i] = stk.Pop()
489 | 		}
490 | 
491 | 		switch op {
492 | 		default:
493 | 			return nil
494 | 		case "T*": // move to start of next line
495 | 			showText("\n")
496 | 		case "Tf": // set text font and size
497 | 			if len(args) != 2 {
498 | 				return errors.New("bad TL")
499 | 			}
500 | 			if font, ok := fonts[args[0].Name()]; ok {
501 | 				enc = font.Encoder()
502 | 			} else {
503 | 				enc = &nopEncoder{}
504 | 			}
505 | 		case "\"": // set spacing, move to next line, and show text
506 | 			if len(args) != 3 {
507 | 				return errors.New("bad \" operator")
508 | 			}
509 | 			fallthrough
510 | 		case "'": // move to next line and show text
511 | 			if len(args) != 1 {
512 | 				return errors.New("bad ' operator")
513 | 			}
514 | 			fallthrough
515 | 		case "Tj": // show text
516 | 			if len(args) != 1 {
517 | 				return errors.New("bad Tj operator")
518 | 			}
519 | 			showText(args[0].RawString())
520 | 		case "TJ": // show text, allowing individual glyph positioning
521 | 			v := args[0]
522 | 			for i := 0; i < v.Len(); i++ {
523 | 				x := v.Index(i)
524 | 				if x.Kind() == String {
525 | 					showText(x.RawString())
526 | 				}
527 | 			}
528 | 		}
529 | 		return nil
530 | 	})
531 | 	if err != nil {
532 | 		return nil, err
533 | 	}
534 | 	return &textBuilder, nil
535 | }
536 | 
537 | // Content returns the page's content.
538 | func (p Page) Content() (Content, error) {
539 | 	var enc TextEncoding = &nopEncoder{}
540 | 
541 | 	var g = gstate{
542 | 		Th:  1,
543 | 		CTM: ident,
544 | 	}
545 | 
546 | 	var text []Text
547 | 	showText := func(s string) {
548 | 		n := 0
549 | 		for _, ch := range enc.Decode(s) {
550 | 			Trm := matrix{{g.Tfs * g.Th, 0, 0}, {0, g.Tfs, 0}, {0, g.Trise, 1}}.mul(g.Tm).mul(g.CTM)
551 | 			w0 := g.Tf.Width(int(s[n]))
552 | 			n++
553 | 			if ch != ' ' {
554 | 				f := g.Tf.BaseFont()
555 | 				if i := strings.Index(f, "+"); i >= 0 {
556 | 					f = f[i+1:]
557 | 				}
558 | 				text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)})
559 | 			}
560 | 			tx := w0/1000*g.Tfs + g.Tc
561 | 			if ch == ' ' {
562 | 				tx += g.Tw
563 | 			}
564 | 			tx *= g.Th
565 | 			g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
566 | 		}
567 | 	}
568 | 
569 | 	var rect []Rect
570 | 	var gstack []gstate
571 | 
572 | 	var strms []Value
573 | 	contents := p.V.Key("Contents")
574 | 	switch contents.Kind() {
575 | 	case Stream:
576 | 		strms = append(strms, contents)
577 | 	case Array:
578 | 		for i := 0; i < contents.Len(); i++ {
579 | 			strms = append(strms, contents.Index(i))
580 | 		}
581 | 	default:
582 | 		return Content{}, errors.New("expected page contents to be a stream or an array")
583 | 	}
584 | 
585 | 	for _, strm := range strms {
586 | 		err := Interpret(strm, func(stk *Stack, op string) error {
587 | 			n := stk.Len()
588 | 			args := make([]Value, n)
589 | 			for i := n - 1; i >= 0; i-- {
590 | 				args[i] = stk.Pop()
591 | 			}
592 | 			switch op {
593 | 			default:
594 | 				//fmt.Println(op, args)
595 | 				return nil
596 | 
597 | 			case "cm": // update g.CTM
598 | 				if len(args) != 6 {
599 | 					return errors.New("bad g.Tm")
600 | 				}
601 | 				var m matrix
602 | 				for i := 0; i < 6; i++ {
603 | 					m[i/2][i%2] = args[i].Float64()
604 | 				}
605 | 				m[2][2] = 1
606 | 				g.CTM = m.mul(g.CTM)
607 | 
608 | 			case "gs": // set parameters from graphics state resource
609 | 				gs := p.Resources().Key("ExtGState").Key(args[0].Name())
610 | 				font := gs.Key("Font")
611 | 				if font.Kind() == Array && font.Len() == 2 {
612 | 					//fmt.Println("FONT", font)
613 | 				}
614 | 
615 | 			case "f": // fill
616 | 			case "g": // setgray
617 | 			case "l": // lineto
618 | 			case "m": // moveto
619 | 
620 | 			case "cs": // set colorspace non-stroking
621 | 			case "scn": // set color non-stroking
622 | 
623 | 			case "re": // append rectangle to path
624 | 				if len(args) != 4 {
625 | 					return errors.New("bad re")
626 | 				}
627 | 				x, y, w, h := args[0].Float64(), args[1].Float64(), args[2].Float64(), args[3].Float64()
628 | 				rect = append(rect, Rect{Point{x, y}, Point{x + w, y + h}})
629 | 
630 | 			case "q": // save graphics state
631 | 				gstack = append(gstack, g)
632 | 
633 | 			case "Q": // restore graphics state
634 | 				n := len(gstack) - 1
635 | 				g = gstack[n]
636 | 				gstack = gstack[:n]
637 | 
638 | 			case "BT": // begin text (reset text matrix and line matrix)
639 | 				g.Tm = ident
640 | 				g.Tlm = g.Tm
641 | 
642 | 			case "ET": // end text
643 | 
644 | 			case "T*": // move to start of next line
645 | 				x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}}
646 | 				g.Tlm = x.mul(g.Tlm)
647 | 				g.Tm = g.Tlm
648 | 
649 | 			case "Tc": // set character spacing
650 | 				if len(args) != 1 {
651 | 					return errors.New("bad g.Tc")
652 | 				}
653 | 				g.Tc = args[0].Float64()
654 | 
655 | 			case "TD": // move text position and set leading
656 | 				if len(args) != 2 {
657 | 					return errors.New("bad Td")
658 | 				}
659 | 				g.Tl = -args[1].Float64()
660 | 				fallthrough
661 | 			case "Td": // move text position
662 | 				if len(args) != 2 {
663 | 					return errors.New("bad Td")
664 | 				}
665 | 				tx := args[0].Float64()
666 | 				ty := args[1].Float64()
667 | 				x := matrix{{1, 0, 0}, {0, 1, 0}, {tx, ty, 1}}
668 | 				g.Tlm = x.mul(g.Tlm)
669 | 				g.Tm = g.Tlm
670 | 
671 | 			case "Tf": // set text font and size
672 | 				if len(args) != 2 {
673 | 					return errors.New("bad TL")
674 | 				}
675 | 				f := args[0].Name()
676 | 				g.Tf = p.Font(f)
677 | 				enc = g.Tf.Encoder()
678 | 				if enc == nil {
679 | 					println("no cmap for", f)
680 | 					enc = &nopEncoder{}
681 | 				}
682 | 				g.Tfs = args[1].Float64()
683 | 
684 | 			case "\"": // set spacing, move to next line, and show text
685 | 				if len(args) != 3 {
686 | 					return errors.New("bad \" operator")
687 | 				}
688 | 				g.Tw = args[0].Float64()
689 | 				g.Tc = args[1].Float64()
690 | 				args = args[2:]
691 | 				fallthrough
692 | 			case "'": // move to next line and show text
693 | 				if len(args) != 1 {
694 | 					return errors.New("bad ' operator")
695 | 				}
696 | 				x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}}
697 | 				g.Tlm = x.mul(g.Tlm)
698 | 				g.Tm = g.Tlm
699 | 				fallthrough
700 | 			case "Tj": // show text
701 | 				if len(args) != 1 {
702 | 					return errors.New("bad Tj operator")
703 | 				}
704 | 				showText(args[0].RawString())
705 | 
706 | 			case "TJ": // show text, allowing individual glyph positioning
707 | 				v := args[0]
708 | 				for i := 0; i < v.Len(); i++ {
709 | 					x := v.Index(i)
710 | 					if x.Kind() == String {
711 | 						showText(x.RawString())
712 | 					} else {
713 | 						tx := -x.Float64() / 1000 * g.Tfs * g.Th
714 | 						g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
715 | 					}
716 | 				}
717 | 
718 | 			case "TL": // set text leading
719 | 				if len(args) != 1 {
720 | 					return errors.New("bad TL")
721 | 				}
722 | 				g.Tl = args[0].Float64()
723 | 
724 | 			case "Tm": // set text matrix and line matrix
725 | 				if len(args) != 6 {
726 | 					return errors.New("bad g.Tm")
727 | 				}
728 | 				var m matrix
729 | 				for i := 0; i < 6; i++ {
730 | 					m[i/2][i%2] = args[i].Float64()
731 | 				}
732 | 				m[2][2] = 1
733 | 				g.Tm = m
734 | 				g.Tlm = m
735 | 
736 | 			case "Tr": // set text rendering mode
737 | 				if len(args) != 1 {
738 | 					return errors.New("bad Tr")
739 | 				}
740 | 				g.Tmode = int(args[0].Int64())
741 | 
742 | 			case "Ts": // set text rise
743 | 				if len(args) != 1 {
744 | 					return errors.New("bad Ts")
745 | 				}
746 | 				g.Trise = args[0].Float64()
747 | 
748 | 			case "Tw": // set word spacing
749 | 				if len(args) != 1 {
750 | 					return errors.New("bad g.Tw")
751 | 				}
752 | 				g.Tw = args[0].Float64()
753 | 
754 | 			case "Tz": // set horizontal text scaling
755 | 				if len(args) != 1 {
756 | 					return errors.New("bad Tz")
757 | 				}
758 | 				g.Th = args[0].Float64() / 100
759 | 			}
760 | 			return nil
761 | 		})
762 | 		if err != nil {
763 | 			return Content{}, err
764 | 		}
765 | 	}
766 | 
767 | 	return Content{text, rect}, nil
768 | }
769 | 
770 | // TextVertical implements sort.Interface for sorting
771 | // a slice of Text values in vertical order, top to bottom,
772 | // and then left to right within a line.
773 | type TextVertical []Text
774 | 
775 | func (x TextVertical) Len() int      { return len(x) }
776 | func (x TextVertical) Swap(i, j int) { x[i], x[j] = x[j], x[i] }
777 | func (x TextVertical) Less(i, j int) bool {
778 | 	if x[i].Y != x[j].Y {
779 | 		return x[i].Y > x[j].Y
780 | 	}
781 | 	return x[i].X < x[j].X
782 | }
783 | 
784 | // TextHorizontal implements sort.Interface for sorting
785 | // a slice of Text values in horizontal order, left to right,
786 | // and then top to bottom within a column.
787 | type TextHorizontal []Text
788 | 
789 | func (x TextHorizontal) Len() int      { return len(x) }
790 | func (x TextHorizontal) Swap(i, j int) { x[i], x[j] = x[j], x[i] }
791 | func (x TextHorizontal) Less(i, j int) bool {
792 | 	if x[i].X != x[j].X {
793 | 		return x[i].X < x[j].X
794 | 	}
795 | 	return x[i].Y > x[j].Y
796 | }
797 | 
// An Outline is a tree describing the outline (also known as the table of contents)
// of a document. Each node holds its own title and its children in document order.
type Outline struct {
	Title string    // title for this element
	Child []Outline // child elements
}
804 | 
805 | // Outline returns the document outline.
806 | // The Outline returned is the root of the outline tree and typically has no Title itself.
807 | // That is, the children of the returned root are the top-level entries in the outline.
808 | func (r *Reader) Outline() Outline {
809 | 	return buildOutline(r.Trailer().Key("Root").Key("Outlines"))
810 | }
811 | 
812 | func buildOutline(entry Value) Outline {
813 | 	var x Outline
814 | 	x.Title = entry.Key("Title").Text()
815 | 	for child := entry.Key("First"); child.Kind() == Dict; child = child.Key("Next") {
816 | 		x.Child = append(x.Child, buildOutline(child))
817 | 	}
818 | 	return x
819 | }
820 | 


--------------------------------------------------------------------------------
/pdfpasswd/main.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2014 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // Pdfpasswd searches for the password for an encrypted PDF
  6 | // by trying all strings over a given alphabet up to a given length.
  7 | package main
  8 | 
  9 | import (
 10 | 	"flag"
 11 | 	"fmt"
 12 | 	"log"
 13 | 	"os"
 14 | 
 15 | 	"github.com/rsc/pdf"
 16 | )
 17 | 
// Command-line flags controlling the brute-force search.
var (
	// alphabet is the set of characters candidate passwords are built from.
	alphabet = flag.String("a", "0123456789", "alphabet")
	// maxLength is the length of the longest candidate password tried.
	maxLength = flag.Int("m", 4, "max length")
)
 22 | 
 23 | func usage() {
 24 | 	fmt.Fprintf(os.Stderr, "usage: pdfpasswd [-a alphabet] [-m maxlength] file\n")
 25 | 	os.Exit(2)
 26 | }
 27 | 
 28 | func main() {
 29 | 	log.SetFlags(0)
 30 | 	log.SetPrefix("pdfpasswd: ")
 31 | 
 32 | 	flag.Usage = usage
 33 | 	flag.Parse()
 34 | 	if flag.NArg() != 1 {
 35 | 		usage()
 36 | 	}
 37 | 
 38 | 	f, err := os.Open(flag.Arg(0))
 39 | 	if err != nil {
 40 | 		log.Fatal(err)
 41 | 	}
 42 | 
 43 | 	last := ""
 44 | 	alpha := *alphabet
 45 | 	ctr := make([]int, *maxLength)
 46 | 	pw := func() string {
 47 | 		inc(ctr, len(alpha)+1)
 48 | 		for !valid(ctr) {
 49 | 			inc(ctr, len(alpha)+1)
 50 | 		}
 51 | 		if done(ctr) {
 52 | 			return ""
 53 | 		}
 54 | 		buf := make([]byte, len(ctr))
 55 | 		var i int
 56 | 		for i = 0; i < len(buf); i++ {
 57 | 			if ctr[i] == 0 {
 58 | 				break
 59 | 			}
 60 | 			buf[i] = alpha[ctr[i]-1]
 61 | 		}
 62 | 		last = string(buf[:i])
 63 | 		println(last)
 64 | 		return last
 65 | 	}
 66 | 	st, err := f.Stat()
 67 | 	if err != nil {
 68 | 		log.Fatal(err)
 69 | 	}
 70 | 	_, err = pdf.NewReaderEncrypted(f, st.Size(), pw)
 71 | 	if err != nil {
 72 | 		if err == pdf.ErrInvalidPassword {
 73 | 			log.Fatal("password not found")
 74 | 		}
 75 | 		log.Fatal("reading pdf: %v", err)
 76 | 	}
 77 | 	fmt.Printf("password: %q\n", last)
 78 | }
 79 | 
 80 | func inc(ctr []int, n int) {
 81 | 	for i := 0; i < len(ctr); i++ {
 82 | 		ctr[i]++
 83 | 		if ctr[i] < n {
 84 | 			break
 85 | 		}
 86 | 		ctr[i] = 0
 87 | 	}
 88 | }
 89 | 
 90 | func done(ctr []int) bool {
 91 | 	for _, x := range ctr {
 92 | 		if x != 0 {
 93 | 			return false
 94 | 		}
 95 | 	}
 96 | 	return true
 97 | }
 98 | 
 99 | func valid(ctr []int) bool {
100 | 	i := len(ctr)
101 | 	for i > 0 && ctr[i-1] == 0 {
102 | 		i--
103 | 	}
104 | 	for i--; i >= 0; i-- {
105 | 		if ctr[i] == 0 {
106 | 			return false
107 | 		}
108 | 	}
109 | 	return true
110 | }
111 | 


--------------------------------------------------------------------------------
/ps.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2014 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package pdf
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"io"
 10 | 
 11 | 	"github.com/pkg/errors"
 12 | )
 13 | 
 14 | // A Stack represents a stack of values.
 15 | type Stack struct {
 16 | 	stack []Value
 17 | }
 18 | 
 19 | func (stk *Stack) Len() int {
 20 | 	return len(stk.stack)
 21 | }
 22 | 
 23 | func (stk *Stack) Push(v Value) {
 24 | 	stk.stack = append(stk.stack, v)
 25 | }
 26 | 
 27 | func (stk *Stack) Pop() Value {
 28 | 	n := len(stk.stack)
 29 | 	if n == 0 {
 30 | 		return Value{}
 31 | 	}
 32 | 	v := stk.stack[n-1]
 33 | 	stk.stack[n-1] = Value{}
 34 | 	stk.stack = stk.stack[:n-1]
 35 | 	return v
 36 | }
 37 | 
 38 | func newDict() Value {
 39 | 	return Value{r: nil, ptr: objptr{}, data: make(dict)}
 40 | }
 41 | 
// Interpret interprets the content in a stream as a basic PostScript program,
// pushing values onto a stack and then calling the do function to execute
// operators. The do function may push or pop values from the stack as needed
// to implement op.
//
// Interpret handles the operators "dict", "currentdict", "begin", "end", "def", and "pop" itself.
//
// Interpret is not a full-blown PostScript interpreter. Its job is to handle the
// very limited PostScript found in certain supporting file formats embedded
// in PDF files, such as cmap files that describe the mapping from font code
// points to Unicode code points.
//
// There is no support for executable blocks, among other limitations.
//
// Interpret returns the first error produced by the tokenizer, by one of the
// built-in operators, or by do; it returns nil once the stream is exhausted.
func Interpret(strm Value, do func(stk *Stack, op string) error) error {
	rd := strm.Reader()
	b := newBuffer(rd, 0)
	b.allowEOF = true
	b.allowObjptr = false
	b.allowStream = false
	var stk Stack
	// dicts is the dictionary stack maintained by "begin" and "end";
	// the last element is the current dictionary.
	var dicts []dict
Reading:
	for {
		tok := b.readToken()
		if tok == io.EOF {
			break
		}
		if err, ok := tok.(error); ok {
			return err
		}
		if kw, ok := tok.(keyword); ok {
			switch kw {
			case "null", "[", "]", "<<", ">>":
				// Not operators: leave the switch so the token is pushed back
				// and read as (the start of) an object below.
				break
			default:
				// A keyword defined in an open dictionary (innermost first)
				// evaluates to its value instead of being run as an operator.
				for i := len(dicts) - 1; i >= 0; i-- {
					if v, ok := dicts[i][name(kw)]; ok {
						stk.Push(Value{r: nil, ptr: objptr{}, data: v})
						continue Reading
					}
				}
				// Everything else is an operator for the caller to execute.
				err := do(&stk, string(kw))
				if err != nil {
					return err
				}
				continue
			case "dict":
				// Discard the operand on top of the stack and push a new empty dictionary.
				stk.Pop()
				stk.Push(Value{r: nil, ptr: objptr{}, data: make(dict)})
				continue
			case "currentdict":
				if len(dicts) == 0 {
					return errors.New("no current dictionary")
				}
				stk.Push(Value{r: nil, ptr: objptr{}, data: dicts[len(dicts)-1]})
				continue
			case "begin":
				d := stk.Pop()
				if d.Kind() != Dict {
					return errors.New("cannot begin non-dict")
				}
				dicts = append(dicts, d.data.(dict))
				continue
			case "end":
				if len(dicts) <= 0 {
					return errors.New("mismatched begin/end")
				}
				dicts = dicts[:len(dicts)-1]
				continue
			case "def":
				if len(dicts) <= 0 {
					return errors.New("def without open dict")
				}
				// Operands are popped in reverse order: value first, then key.
				val := stk.Pop()
				key, ok := stk.Pop().data.(name)
				if !ok {
					return errors.New("def of non-name")
				}
				dicts[len(dicts)-1][key] = val.data
				continue
			case "pop":
				stk.Pop()
				continue
			}
		}
		// Not an operator: re-read the token as a complete object and push it as an operand.
		b.unreadToken(tok)
		obj := b.readObject()
		stk.Push(Value{r: nil, ptr: objptr{}, data: obj})
	}
	return nil
}
134 | 
// seqReader adapts a sequential io.Reader to the ReadAt method signature,
// accepting only reads that continue exactly where the previous one ended.
type seqReader struct {
	rd     io.Reader // underlying sequential source
	offset int64     // offset the next ReadAt call must request
}

// ReadAt fills buf from the underlying reader, provided offset equals the
// number of bytes consumed so far; any other offset is rejected with an
// error and nothing is read. It reads with io.ReadFull, so a short read is
// reported as an error together with the bytes obtained.
func (r *seqReader) ReadAt(buf []byte, offset int64) (int, error) {
	if r.offset == offset {
		got, err := io.ReadFull(r.rd, buf)
		r.offset += int64(got)
		return got, err
	}
	return 0, fmt.Errorf("non-sequential read of stream")
}
148 | 


--------------------------------------------------------------------------------
/read.go:
--------------------------------------------------------------------------------
   1 | // Copyright 2014 The Go Authors.  All rights reserved.
   2 | // Use of this source code is governed by a BSD-style
   3 | // license that can be found in the LICENSE file.
   4 | 
   5 | // Package pdf implements reading of PDF files.
   6 | //
   7 | // Overview
   8 | //
   9 | // PDF is Adobe's Portable Document Format, ubiquitous on the internet.
  10 | // A PDF document is a complex data format built on a fairly simple structure.
  11 | // This package exposes the simple structure along with some wrappers to
  12 | // extract basic information. If more complex information is needed, it is
  13 | // possible to extract that information by interpreting the structure exposed
  14 | // by this package.
  15 | //
  16 | // Specifically, a PDF is a data structure built from Values, each of which has
  17 | // one of the following Kinds:
  18 | //
  19 | //	Null, for the null object.
  20 | //	Integer, for an integer.
  21 | //	Real, for a floating-point number.
  22 | //	Bool, for a boolean value.
  23 | //	Name, for a name constant (as in /Helvetica).
  24 | //	String, for a string constant.
  25 | //	Dict, for a dictionary of name-value pairs.
  26 | //	Array, for an array of values.
  27 | //	Stream, for an opaque data stream and associated header dictionary.
  28 | //
  29 | // The accessors on Value—Int64, Float64, Bool, Name, and so on—return
  30 | // a view of the data as the given type. When there is no appropriate view,
  31 | // the accessor returns a zero result. For example, the Name accessor returns
  32 | // the empty string if called on a Value v for which v.Kind() != Name.
  33 | // Returning zero values this way, especially from the Dict and Array accessors,
  34 | // which themselves return Values, makes it possible to traverse a PDF quickly
  35 | // without writing any error checking. On the other hand, it means that mistakes
  36 | // can go unreported.
  37 | //
  38 | // The basic structure of the PDF file is exposed as the graph of Values.
  39 | //
  40 | // Most richer data structures in a PDF file are dictionaries with specific interpretations
  41 | // of the name-value pairs. The Font and Page wrappers make the interpretation
  42 | // of a specific Value as the corresponding type easier. They are only helpers, though:
  43 | // they are implemented only in terms of the Value API and could be moved outside
  44 | // the package. Equally important, traversal of other PDF data structures can be implemented
  45 | // in other packages as needed.
  46 | //
  47 | package pdf
  48 | 
  49 | // BUG(rsc): The package is incomplete, although it has been used successfully on some
  50 | // large real-world PDF files.
  51 | 
  52 | // BUG(rsc): There is no support for closing open PDF files. If you drop all references to a Reader,
  53 | // the underlying reader will eventually be garbage collected.
  54 | 
  55 | // BUG(rsc): The library makes no attempt at efficiency. A value cache maintained in the Reader
  56 | // would probably help significantly.
  57 | 
  58 | // BUG(rsc): The support for reading encrypted files is weak.
  59 | 
  60 | // BUG(rsc): The Value API does not support error reporting. The intent is to allow users to
  61 | // set an error reporting callback in Reader, but that code has not been implemented.
  62 | 
  63 | import (
  64 | 	"bytes"
  65 | 	"compress/zlib"
  66 | 	"crypto/aes"
  67 | 	"crypto/cipher"
  68 | 	"crypto/md5"
  69 | 	"crypto/rc4"
  70 | 	"fmt"
  71 | 	"io"
  72 | 	"io/ioutil"
  73 | 	"os"
  74 | 	"sort"
  75 | 	"strconv"
  76 | 
  77 | 	"github.com/pkg/errors"
  78 | )
  79 | 
// A Reader is a single PDF file open for reading.
type Reader struct {
	f          io.ReaderAt // underlying file data
	end        int64       // total size of the data in f, as given to NewReader
	xref       []xref      // cross-reference table, indexed by object number
	trailer    dict        // trailer dictionary (or the xref stream's header dictionary)
	trailerptr objptr      // object pointer of the xref stream; zero for a classic xref table
	key        []byte      // NOTE(review): presumably the decryption key set up by initEncrypt; not visible here
	useAES     bool        // NOTE(review): presumably selects AES over RC4 for decryption; not visible here
}
  90 | 
// An xref is one entry of the cross-reference table, describing where a
// single object is stored. The zero xref marks an entry that has not been
// filled in.
type xref struct {
	ptr      objptr // object number and generation of the object described
	inStream bool   // set for type 2 xref-stream entries (object stored inside another stream)
	stream   objptr // when inStream is set, the stream object that contains this object
	offset   int64  // file offset of the object; for inStream entries, the third field of the xref stream record
}
  97 | 
  98 | // Open opens a file for reading.
  99 | func Open(file string) (*Reader, error) {
 100 | 	// TODO: Deal with closing file.
 101 | 	f, err := os.Open(file)
 102 | 	if err != nil {
 103 | 		return nil, err
 104 | 	}
 105 | 	fi, err := f.Stat()
 106 | 	if err != nil {
 107 | 		f.Close()
 108 | 		return nil, err
 109 | 	}
 110 | 	return NewReader(f, fi.Size())
 111 | }
 112 | 
// NewReader opens a file for reading, using the data in f with the given total size.
// It is equivalent to NewReaderEncrypted with a nil password callback, so
// only the empty password is tried on an encrypted file.
func NewReader(f io.ReaderAt, size int64) (*Reader, error) {
	return NewReaderEncrypted(f, size, nil)
}
 117 | 
 118 | // NewReaderEncrypted opens a file for reading, using the data in f with the given total size.
 119 | // If the PDF is encrypted, NewReaderEncrypted calls pw repeatedly to obtain passwords
 120 | // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt
 121 | // the file and returns an error.
 122 | func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) {
 123 | 	buf := make([]byte, 10)
 124 | 	f.ReadAt(buf, 0)
 125 | 	if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' {
 126 | 		return nil, fmt.Errorf("not a PDF file: invalid header")
 127 | 	}
 128 | 	end := size
 129 | 	const endChunk = 100
 130 | 	buf = make([]byte, endChunk)
 131 | 	f.ReadAt(buf, end-endChunk)
 132 | 	for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' {
 133 | 		buf = buf[:len(buf)-1]
 134 | 	}
 135 | 	buf = bytes.TrimRight(buf, "\r\n\t ")
 136 | 	if !bytes.HasSuffix(buf, []byte("%%EOF")) {
 137 | 		return nil, fmt.Errorf("not a PDF file: missing %%%%EOF")
 138 | 	}
 139 | 	i := findLastLine(buf, "startxref")
 140 | 	if i < 0 {
 141 | 		return nil, fmt.Errorf("malformed PDF file: missing final startxref")
 142 | 	}
 143 | 
 144 | 	r := &Reader{
 145 | 		f:   f,
 146 | 		end: end,
 147 | 	}
 148 | 	pos := end - endChunk + int64(i)
 149 | 	b := newBuffer(io.NewSectionReader(f, pos, end-pos), pos)
 150 | 	if b.readToken() != keyword("startxref") {
 151 | 		return nil, fmt.Errorf("malformed PDF file: missing startxref")
 152 | 	}
 153 | 	startxref, ok := b.readToken().(int64)
 154 | 	if !ok {
 155 | 		return nil, fmt.Errorf("malformed PDF file: startxref not followed by integer")
 156 | 	}
 157 | 	b = newBuffer(io.NewSectionReader(r.f, startxref, r.end-startxref), startxref)
 158 | 	xref, trailerptr, trailer, err := readXref(r, b)
 159 | 	if err != nil {
 160 | 		return nil, err
 161 | 	}
 162 | 	r.xref = xref
 163 | 	r.trailer = trailer
 164 | 	r.trailerptr = trailerptr
 165 | 	if trailer["Encrypt"] == nil {
 166 | 		return r, nil
 167 | 	}
 168 | 	err = r.initEncrypt("")
 169 | 	if err == nil {
 170 | 		return r, nil
 171 | 	}
 172 | 	if pw == nil || err != ErrInvalidPassword {
 173 | 		return nil, err
 174 | 	}
 175 | 	for {
 176 | 		next := pw()
 177 | 		if next == "" {
 178 | 			break
 179 | 		}
 180 | 		if r.initEncrypt(next) == nil {
 181 | 			return r, nil
 182 | 		}
 183 | 	}
 184 | 	return nil, err
 185 | }
 186 | 
// Trailer returns the file's Trailer value: the trailer dictionary found
// while reading the cross-reference data, wrapped as a Value bound to r.
func (r *Reader) Trailer() Value {
	return Value{r: r, ptr: r.trailerptr, data: r.trailer}
}
 191 | 
 192 | func readXref(r *Reader, b *buffer) ([]xref, objptr, dict, error) {
 193 | 	tok := b.readToken()
 194 | 	if tok == keyword("xref") {
 195 | 		return readXrefTable(r, b)
 196 | 	}
 197 | 	if _, ok := tok.(int64); ok {
 198 | 		b.unreadToken(tok)
 199 | 		return readXrefStream(r, b)
 200 | 	}
 201 | 	return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", tok)
 202 | }
 203 | 
// readXrefStream reads a cross-reference stream object from b, then follows
// the chain of Prev offsets to merge in older xref streams. It returns the
// combined table, the object pointer of the newest xref stream, and that
// stream's header dictionary, which serves as the trailer.
//
// Entries from the newest stream win: readXrefStreamData never overwrites a
// slot that is already filled.
func readXrefStream(r *Reader, b *buffer) ([]xref, objptr, dict, error) {
	obj1 := b.readObject()
	obj, ok := obj1.(objdef)
	if !ok {
		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", objfmt(obj1))
	}
	strmptr := obj.ptr
	strm, ok := obj.obj.(stream)
	if !ok {
		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: cross-reference table not found: %v", objfmt(obj))
	}
	if strm.hdr["Type"] != name("XRef") {
		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref stream does not have type XRef")
	}
	size, ok := strm.hdr["Size"].(int64)
	if !ok {
		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref stream missing Size")
	}
	table := make([]xref, size)

	table, err := readXrefStreamData(r, strm, table, size)
	if err != nil {
		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err)
	}

	// Walk the Prev chain; prevoff is advanced inside the loop body.
	// NOTE(review): a chain of Prev offsets that loops back on itself would never terminate.
	for prevoff := strm.hdr["Prev"]; prevoff != nil; {
		off, ok := prevoff.(int64)
		if !ok {
			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev is not integer: %v", prevoff)
		}
		b := newBuffer(io.NewSectionReader(r.f, off, r.end-off), off)
		obj1 := b.readObject()
		obj, ok := obj1.(objdef)
		if !ok {
			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj1))
		}
		prevstrm, ok := obj.obj.(stream)
		if !ok {
			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj))
		}
		prevoff = prevstrm.hdr["Prev"]
		prev := Value{r: r, ptr: objptr{}, data: prevstrm}
		if prev.Kind() != Stream {
			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream is not stream: %v", prev)
		}
		if prev.Key("Type").Name() != "XRef" {
			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream does not have type XRef")
		}
		// An older stream may not claim more objects than the newest one.
		psize := prev.Key("Size").Int64()
		if psize > size {
			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream larger than last stream")
		}
		if table, err = readXrefStreamData(r, prev.data.(stream), table, psize); err != nil {
			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: reading xref prev stream: %v", err)
		}
	}

	return table, strmptr, strm.hdr, nil
}
 263 | 
 264 | func readXrefStreamData(r *Reader, strm stream, table []xref, size int64) ([]xref, error) {
 265 | 	index, _ := strm.hdr["Index"].(array)
 266 | 	if index == nil {
 267 | 		index = array{int64(0), size}
 268 | 	}
 269 | 	if len(index)%2 != 0 {
 270 | 		return nil, fmt.Errorf("invalid Index array %v", objfmt(index))
 271 | 	}
 272 | 	ww, ok := strm.hdr["W"].(array)
 273 | 	if !ok {
 274 | 		return nil, fmt.Errorf("xref stream missing W array")
 275 | 	}
 276 | 
 277 | 	var w []int
 278 | 	for _, x := range ww {
 279 | 		i, ok := x.(int64)
 280 | 		if !ok || int64(int(i)) != i {
 281 | 			return nil, fmt.Errorf("invalid W array %v", objfmt(ww))
 282 | 		}
 283 | 		w = append(w, int(i))
 284 | 	}
 285 | 	if len(w) < 3 {
 286 | 		return nil, fmt.Errorf("invalid W array %v", objfmt(ww))
 287 | 	}
 288 | 
 289 | 	v := Value{r: r, ptr: objptr{}, data: strm}
 290 | 	wtotal := 0
 291 | 	for _, wid := range w {
 292 | 		wtotal += wid
 293 | 	}
 294 | 	buf := make([]byte, wtotal)
 295 | 	data := v.Reader()
 296 | 	for len(index) > 0 {
 297 | 		start, ok1 := index[0].(int64)
 298 | 		n, ok2 := index[1].(int64)
 299 | 		if !ok1 || !ok2 {
 300 | 			return nil, fmt.Errorf("malformed Index pair %v %v %T %T", objfmt(index[0]), objfmt(index[1]), index[0], index[1])
 301 | 		}
 302 | 		index = index[2:]
 303 | 		for i := 0; i < int(n); i++ {
 304 | 			_, err := io.ReadFull(data, buf)
 305 | 			if err != nil {
 306 | 				return nil, fmt.Errorf("error reading xref stream: %v", err)
 307 | 			}
 308 | 			v1 := decodeInt(buf[0:w[0]])
 309 | 			if w[0] == 0 {
 310 | 				v1 = 1
 311 | 			}
 312 | 			v2 := decodeInt(buf[w[0] : w[0]+w[1]])
 313 | 			v3 := decodeInt(buf[w[0]+w[1] : w[0]+w[1]+w[2]])
 314 | 			x := int(start) + i
 315 | 			for cap(table) <= x {
 316 | 				table = append(table[:cap(table)], xref{})
 317 | 			}
 318 | 			if table[x].ptr != (objptr{}) {
 319 | 				continue
 320 | 			}
 321 | 			switch v1 {
 322 | 			case 0:
 323 | 				table[x] = xref{ptr: objptr{0, 65535}}
 324 | 			case 1:
 325 | 				table[x] = xref{ptr: objptr{uint32(x), uint16(v3)}, offset: int64(v2)}
 326 | 			case 2:
 327 | 				table[x] = xref{ptr: objptr{uint32(x), 0}, inStream: true, stream: objptr{uint32(v2), 0}, offset: int64(v3)}
 328 | 			default:
 329 | 				fmt.Printf("invalid xref stream type %d: %x\n", v1, buf)
 330 | 			}
 331 | 		}
 332 | 	}
 333 | 	return table, nil
 334 | }
 335 | 
 336 | func decodeInt(b []byte) int {
 337 | 	x := 0
 338 | 	for _, c := range b {
 339 | 		x = x<<8 | int(c)
 340 | 	}
 341 | 	return x
 342 | }
 343 | 
// readXrefTable reads a classic cross-reference table (b is positioned just
// after the "xref" keyword) and its trailer dictionary, then follows the
// chain of Prev offsets to merge in older tables. It returns the combined
// table, a zero object pointer, and the newest trailer dictionary.
//
// The table is truncated to the Size entry of the newest trailer when it
// holds more entries than that.
func readXrefTable(r *Reader, b *buffer) ([]xref, objptr, dict, error) {
	var table []xref

	table, err := readXrefTableData(b, table)
	if err != nil {
		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err)
	}

	trailer, ok := b.readObject().(dict)
	if !ok {
		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref table not followed by trailer dictionary")
	}

	// Walk the Prev chain; prevoff is advanced inside the loop body.
	// NOTE(review): a chain of Prev offsets that loops back on itself would never terminate.
	for prevoff := trailer["Prev"]; prevoff != nil; {
		off, ok := prevoff.(int64)
		if !ok {
			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev is not integer: %v", prevoff)
		}
		b := newBuffer(io.NewSectionReader(r.f, off, r.end-off), off)
		tok := b.readToken()
		if tok != keyword("xref") {
			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev does not point to xref")
		}
		table, err = readXrefTableData(b, table)
		if err != nil {
			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: %v", err)
		}

		// This trailer deliberately shadows the outer one: the older trailer is
		// used only to find the next Prev link, while the newest trailer is the
		// one returned to the caller.
		trailer, ok := b.readObject().(dict)
		if !ok {
			return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref Prev table not followed by trailer dictionary")
		}
		prevoff = trailer["Prev"]
	}

	size, ok := trailer[name("Size")].(int64)
	if !ok {
		return nil, objptr{}, nil, fmt.Errorf("malformed PDF: trailer missing /Size entry")
	}

	if size < int64(len(table)) {
		table = table[:size]
	}

	return table, objptr{}, trailer, nil
}
 390 | 
 391 | func readXrefTableData(b *buffer, table []xref) ([]xref, error) {
 392 | 	for {
 393 | 		tok := b.readToken()
 394 | 		if tok == keyword("trailer") {
 395 | 			break
 396 | 		}
 397 | 		start, ok1 := tok.(int64)
 398 | 		n, ok2 := b.readToken().(int64)
 399 | 		if !ok1 || !ok2 {
 400 | 			return nil, fmt.Errorf("malformed xref table")
 401 | 		}
 402 | 		for i := 0; i < int(n); i++ {
 403 | 			off, ok1 := b.readToken().(int64)
 404 | 			gen, ok2 := b.readToken().(int64)
 405 | 			alloc, ok3 := b.readToken().(keyword)
 406 | 			if !ok1 || !ok2 || !ok3 || alloc != keyword("f") && alloc != keyword("n") {
 407 | 				return nil, fmt.Errorf("malformed xref table")
 408 | 			}
 409 | 			x := int(start) + i
 410 | 			for cap(table) <= x {
 411 | 				table = append(table[:cap(table)], xref{})
 412 | 			}
 413 | 			if len(table) <= x {
 414 | 				table = table[:x+1]
 415 | 			}
 416 | 			if alloc == "n" && table[x].offset == 0 {
 417 | 				table[x] = xref{ptr: objptr{uint32(x), uint16(gen)}, offset: int64(off)}
 418 | 			}
 419 | 		}
 420 | 	}
 421 | 	return table, nil
 422 | }
 423 | 
 424 | func findLastLine(buf []byte, s string) int {
 425 | 	bs := []byte(s)
 426 | 	max := len(buf)
 427 | 	for {
 428 | 		i := bytes.LastIndex(buf[:max], bs)
 429 | 		if i <= 0 || i+len(bs) >= len(buf) {
 430 | 			return -1
 431 | 		}
 432 | 		if (buf[i-1] == '\n' || buf[i-1] == '\r') && (buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r') {
 433 | 			return i
 434 | 		}
 435 | 		max = i
 436 | 	}
 437 | }
 438 | 
// A Value is a single PDF value, such as an integer, dictionary, or array.
// The zero Value is a PDF null (Kind() == Null, IsNull() = true).
type Value struct {
	// r is the Reader used to resolve references reached through this
	// value (see Key and Index).
	r *Reader
	// ptr is handed to Reader.resolve as the parent pointer when Key or
	// Index descends into this value.
	ptr objptr
	// data is the underlying Go value; nil represents a PDF null.
	data interface{}
	// err, when non-nil, marks the value as an error (see IsError).
	err error
}
 447 | 
// IsNull reports whether the value is a null, that is, whether it carries no
// underlying data. It is equivalent to Kind() == Null.
// A Value that only records an error has no data and so also reports true.
func (v Value) IsNull() bool {
	return v.data == nil
}
 452 | 
// IsError reports whether the value records an error, such as one produced
// when an indirect reference could not be loaded.
func (v Value) IsError() bool {
	return v.err != nil
}
 457 | 
// A ValueKind specifies the kind of data underlying a Value.
type ValueKind int

// The PDF value kinds.
// Kind maps the Go type of a Value's underlying data to one of these.
const (
	// Null is reported for a nil or unrecognized underlying value.
	Null ValueKind = iota
	// Bool is reported for an underlying bool.
	Bool
	// Integer is reported for an underlying int64.
	Integer
	// Real is reported for an underlying float64.
	Real
	// String is reported for an underlying string.
	String
	// Name is reported for an underlying name.
	Name
	// Dict is reported for an underlying dict.
	Dict
	// Array is reported for an underlying array.
	Array
	// Stream is reported for an underlying stream.
	Stream
)
 473 | 
 474 | // Kind reports the kind of value underlying v.
 475 | func (v Value) Kind() ValueKind {
 476 | 	switch v.data.(type) {
 477 | 	default:
 478 | 		return Null
 479 | 	case bool:
 480 | 		return Bool
 481 | 	case int64:
 482 | 		return Integer
 483 | 	case float64:
 484 | 		return Real
 485 | 	case string:
 486 | 		return String
 487 | 	case name:
 488 | 		return Name
 489 | 	case dict:
 490 | 		return Dict
 491 | 	case array:
 492 | 		return Array
 493 | 	case stream:
 494 | 		return Stream
 495 | 	}
 496 | }
 497 | 
// String returns a textual representation of the value v, as produced by
// objfmt for the underlying data.
// Note that String is not the accessor for values with Kind() == String.
// To access such values, see RawString, Text, and TextFromUTF16.
func (v Value) String() string {
	return objfmt(v.data)
}
 504 | 
 505 | func objfmt(x interface{}) string {
 506 | 	switch x := x.(type) {
 507 | 	default:
 508 | 		return fmt.Sprint(x)
 509 | 	case string:
 510 | 		if isPDFDocEncoded(x) {
 511 | 			return strconv.Quote(pdfDocDecode(x))
 512 | 		}
 513 | 		if isUTF16(x) {
 514 | 			return strconv.Quote(utf16Decode(x[2:]))
 515 | 		}
 516 | 		return strconv.Quote(x)
 517 | 	case name:
 518 | 		return "/" + string(x)
 519 | 	case dict:
 520 | 		var keys []string
 521 | 		for k := range x {
 522 | 			keys = append(keys, string(k))
 523 | 		}
 524 | 		sort.Strings(keys)
 525 | 		var buf bytes.Buffer
 526 | 		buf.WriteString("<<")
 527 | 		for i, k := range keys {
 528 | 			elem := x[name(k)]
 529 | 			if i > 0 {
 530 | 				buf.WriteString(" ")
 531 | 			}
 532 | 			buf.WriteString("/")
 533 | 			buf.WriteString(k)
 534 | 			buf.WriteString(" ")
 535 | 			buf.WriteString(objfmt(elem))
 536 | 		}
 537 | 		buf.WriteString(">>")
 538 | 		return buf.String()
 539 | 
 540 | 	case array:
 541 | 		var buf bytes.Buffer
 542 | 		buf.WriteString("[")
 543 | 		for i, elem := range x {
 544 | 			if i > 0 {
 545 | 				buf.WriteString(" ")
 546 | 			}
 547 | 			buf.WriteString(objfmt(elem))
 548 | 		}
 549 | 		buf.WriteString("]")
 550 | 		return buf.String()
 551 | 
 552 | 	case stream:
 553 | 		return fmt.Sprintf("%v@%d", objfmt(x.hdr), x.offset)
 554 | 
 555 | 	case objptr:
 556 | 		return fmt.Sprintf("%d %d R", x.id, x.gen)
 557 | 
 558 | 	case objdef:
 559 | 		return fmt.Sprintf("{%d %d obj}%v", x.ptr.id, x.ptr.gen, objfmt(x.obj))
 560 | 	}
 561 | }
 562 | 
 563 | // Bool returns v's boolean value.
 564 | // If v.Kind() != Bool, Bool returns false.
 565 | func (v Value) Bool() bool {
 566 | 	x, ok := v.data.(bool)
 567 | 	if !ok {
 568 | 		return false
 569 | 	}
 570 | 	return x
 571 | }
 572 | 
 573 | // Int64 returns v's int64 value.
 574 | // If v.Kind() != Int64, Int64 returns 0.
 575 | func (v Value) Int64() int64 {
 576 | 	x, ok := v.data.(int64)
 577 | 	if !ok {
 578 | 		return 0
 579 | 	}
 580 | 	return x
 581 | }
 582 | 
 583 | // Float64 returns v's float64 value, converting from integer if necessary.
 584 | // If v.Kind() != Float64 and v.Kind() != Int64, Float64 returns 0.
 585 | func (v Value) Float64() float64 {
 586 | 	x, ok := v.data.(float64)
 587 | 	if !ok {
 588 | 		x, ok := v.data.(int64)
 589 | 		if ok {
 590 | 			return float64(x)
 591 | 		}
 592 | 		return 0
 593 | 	}
 594 | 	return x
 595 | }
 596 | 
 597 | // RawString returns v's string value.
 598 | // If v.Kind() != String, RawString returns the empty string.
 599 | func (v Value) RawString() string {
 600 | 	x, ok := v.data.(string)
 601 | 	if !ok {
 602 | 		return ""
 603 | 	}
 604 | 	return x
 605 | }
 606 | 
 607 | // Text returns v's string value interpreted as a ``text string'' (defined in the PDF spec)
 608 | // and converted to UTF-8.
 609 | // If v.Kind() != String, Text returns the empty string.
 610 | func (v Value) Text() string {
 611 | 	x, ok := v.data.(string)
 612 | 	if !ok {
 613 | 		return ""
 614 | 	}
 615 | 	if isPDFDocEncoded(x) {
 616 | 		return pdfDocDecode(x)
 617 | 	}
 618 | 	if isUTF16(x) {
 619 | 		return utf16Decode(x[2:])
 620 | 	}
 621 | 	return x
 622 | }
 623 | 
 624 | // TextFromUTF16 returns v's string value interpreted as big-endian UTF-16
 625 | // and then converted to UTF-8.
 626 | // If v.Kind() != String or if the data is not valid UTF-16, TextFromUTF16 returns
 627 | // the empty string.
 628 | func (v Value) TextFromUTF16() string {
 629 | 	x, ok := v.data.(string)
 630 | 	if !ok {
 631 | 		return ""
 632 | 	}
 633 | 	if len(x)%2 == 1 {
 634 | 		return ""
 635 | 	}
 636 | 	if x == "" {
 637 | 		return ""
 638 | 	}
 639 | 	return utf16Decode(x)
 640 | }
 641 | 
 642 | // Name returns v's name value.
 643 | // If v.Kind() != Name, Name returns the empty string.
 644 | // The returned name does not include the leading slash:
 645 | // if v corresponds to the name written using the syntax /Helvetica,
 646 | // Name() == "Helvetica".
 647 | func (v Value) Name() string {
 648 | 	x, ok := v.data.(name)
 649 | 	if !ok {
 650 | 		return ""
 651 | 	}
 652 | 	return string(x)
 653 | }
 654 | 
 655 | // Key returns the value associated with the given name key in the dictionary v.
 656 | // Like the result of the Name method, the key should not include a leading slash.
 657 | // If v is a stream, Key applies to the stream's header dictionary.
 658 | // If v.Kind() != Dict and v.Kind() != Stream, Key returns a null Value.
 659 | func (v Value) Key(key string) Value {
 660 | 	if v.IsError() {
 661 | 		return v
 662 | 	}
 663 | 	x, ok := v.data.(dict)
 664 | 	if !ok {
 665 | 		strm, ok := v.data.(stream)
 666 | 		if !ok {
 667 | 			return Value{}
 668 | 		}
 669 | 		x = strm.hdr
 670 | 	}
 671 | 	return v.r.resolve(v.ptr, x[name(key)])
 672 | }
 673 | 
 674 | // Keys returns a sorted list of the keys in the dictionary v.
 675 | // If v is a stream, Keys applies to the stream's header dictionary.
 676 | // If v.Kind() != Dict and v.Kind() != Stream, Keys returns nil.
 677 | func (v Value) Keys() []string {
 678 | 	x, ok := v.data.(dict)
 679 | 	if !ok {
 680 | 		strm, ok := v.data.(stream)
 681 | 		if !ok {
 682 | 			return nil
 683 | 		}
 684 | 		x = strm.hdr
 685 | 	}
 686 | 	keys := []string{} // not nil
 687 | 	for k := range x {
 688 | 		keys = append(keys, string(k))
 689 | 	}
 690 | 	sort.Strings(keys)
 691 | 	return keys
 692 | }
 693 | 
 694 | // Index returns the i'th element in the array v.
 695 | // If v.Kind() != Array or if i is outside the array bounds,
 696 | // Index returns a null Value.
 697 | func (v Value) Index(i int) Value {
 698 | 	if v.IsError() {
 699 | 		return v
 700 | 	}
 701 | 	x, ok := v.data.(array)
 702 | 	if !ok || i < 0 || i >= len(x) {
 703 | 		return Value{}
 704 | 	}
 705 | 	return v.r.resolve(v.ptr, x[i])
 706 | }
 707 | 
 708 | // Len returns the length of the array v.
 709 | // If v.Kind() != Array, Len returns 0.
 710 | func (v Value) Len() int {
 711 | 	x, ok := v.data.(array)
 712 | 	if !ok {
 713 | 		return 0
 714 | 	}
 715 | 	return len(x)
 716 | }
 717 | 
 718 | func (r *Reader) resolve(parent objptr, x interface{}) Value {
 719 | 	if ptr, ok := x.(objptr); ok {
 720 | 		if ptr.id >= uint32(len(r.xref)) {
 721 | 			return Value{}
 722 | 		}
 723 | 		xref := r.xref[ptr.id]
 724 | 		if xref.ptr != ptr || !xref.inStream && xref.offset == 0 {
 725 | 			return Value{}
 726 | 		}
 727 | 		var obj object
 728 | 		if xref.inStream {
 729 | 			strm := r.resolve(parent, xref.stream)
 730 | 		Search:
 731 | 			for {
 732 | 				if strm.Kind() != Stream {
 733 | 					return Value{err: errors.New("not a stream")}
 734 | 				}
 735 | 				if strm.Key("Type").Name() != "ObjStm" {
 736 | 					return Value{err: errors.New("not an object stream")}
 737 | 				}
 738 | 				n := int(strm.Key("N").Int64())
 739 | 				first := strm.Key("First").Int64()
 740 | 				if first == 0 {
 741 | 					return Value{err: errors.New("missing First")}
 742 | 				}
 743 | 				b := newBuffer(strm.Reader(), 0)
 744 | 				b.allowEOF = true
 745 | 				for i := 0; i < n; i++ {
 746 | 					id, _ := b.readToken().(int64)
 747 | 					off, _ := b.readToken().(int64)
 748 | 					if uint32(id) == ptr.id {
 749 | 						b.seekForward(first + off)
 750 | 						x = b.readObject()
 751 | 						break Search
 752 | 					}
 753 | 				}
 754 | 				ext := strm.Key("Extends")
 755 | 				if ext.Kind() != Stream {
 756 | 					return Value{err: errors.New("cannot find object in stream")}
 757 | 				}
 758 | 				strm = ext
 759 | 			}
 760 | 		} else {
 761 | 			b := newBuffer(io.NewSectionReader(r.f, xref.offset, r.end-xref.offset), xref.offset)
 762 | 			b.key = r.key
 763 | 			b.useAES = r.useAES
 764 | 			obj = b.readObject()
 765 | 			def, ok := obj.(objdef)
 766 | 			if !ok {
 767 | 				return Value{err: fmt.Errorf("loading %v: found %T instead of objdef", ptr, obj)}
 768 | 			}
 769 | 			if def.ptr != ptr {
 770 | 				return Value{err: fmt.Errorf("loading %v: found %v", ptr, def.ptr)}
 771 | 			}
 772 | 			x = def.obj
 773 | 		}
 774 | 		parent = ptr
 775 | 	}
 776 | 
 777 | 	switch x := x.(type) {
 778 | 	case nil, bool, int64, float64, name, dict, array, stream:
 779 | 		return Value{r: r, ptr: parent, data: x}
 780 | 	case string:
 781 | 		return Value{r: r, ptr: parent, data: x}
 782 | 	default:
 783 | 		return Value{err: fmt.Errorf("unexpected value type %T in resolve", x)}
 784 | 	}
 785 | }
 786 | 
 787 | type errorReadCloser struct {
 788 | 	err error
 789 | }
 790 | 
 791 | func (e *errorReadCloser) Read([]byte) (int, error) {
 792 | 	return 0, e.err
 793 | }
 794 | 
 795 | func (e *errorReadCloser) Close() error {
 796 | 	return e.err
 797 | }
 798 | 
 799 | // Reader returns the data contained in the stream v.
 800 | // If v.Kind() != Stream, Reader returns a ReadCloser that
 801 | // responds to all reads with a ``stream not present'' error.
 802 | func (v Value) Reader() io.ReadCloser {
 803 | 	if v.IsError() {
 804 | 		return &errorReadCloser{errors.Wrap(v.err, "stream not present")}
 805 | 	}
 806 | 	x, ok := v.data.(stream)
 807 | 	if !ok {
 808 | 		return &errorReadCloser{fmt.Errorf("stream not present")}
 809 | 	}
 810 | 	var rd io.Reader
 811 | 	rd = io.NewSectionReader(v.r.f, x.offset, v.Key("Length").Int64())
 812 | 	if v.r.key != nil {
 813 | 		rd = decryptStream(v.r.key, v.r.useAES, x.ptr, rd)
 814 | 	}
 815 | 	filter := v.Key("Filter")
 816 | 	param := v.Key("DecodeParms")
 817 | 	switch filter.Kind() {
 818 | 	default:
 819 | 		return &errorReadCloser{fmt.Errorf("unsupported filter %v", filter)}
 820 | 	case Null:
 821 | 		// ok
 822 | 	case Name:
 823 | 		rd = applyFilter(rd, filter.Name(), param)
 824 | 	case Array:
 825 | 		for i := 0; i < filter.Len(); i++ {
 826 | 			rd = applyFilter(rd, filter.Index(i).Name(), param.Index(i))
 827 | 		}
 828 | 	}
 829 | 
 830 | 	return ioutil.NopCloser(rd)
 831 | }
 832 | 
 833 | func applyFilter(rd io.Reader, name string, param Value) io.Reader {
 834 | 	switch name {
 835 | 	default:
 836 | 		return &errorReadCloser{errors.New("unknown filter " + name)}
 837 | 	case "FlateDecode":
 838 | 		zr, err := zlib.NewReader(rd)
 839 | 		if err != nil {
 840 | 			return &errorReadCloser{err}
 841 | 		}
 842 | 		pred := param.Key("Predictor")
 843 | 		if pred.Kind() == Null {
 844 | 			return zr
 845 | 		}
 846 | 		columns := param.Key("Columns").Int64()
 847 | 		switch pred.Int64() {
 848 | 		default:
 849 | 			return &errorReadCloser{errors.Errorf("unknown predictor %v", pred)}
 850 | 		case 12:
 851 | 			return &pngUpReader{r: zr, hist: make([]byte, 1+columns), tmp: make([]byte, 1+columns)}
 852 | 		}
 853 | 	}
 854 | }
 855 | 
 856 | type pngUpReader struct {
 857 | 	r    io.Reader
 858 | 	hist []byte
 859 | 	tmp  []byte
 860 | 	pend []byte
 861 | }
 862 | 
 863 | func (r *pngUpReader) Read(b []byte) (int, error) {
 864 | 	n := 0
 865 | 	for len(b) > 0 {
 866 | 		if len(r.pend) > 0 {
 867 | 			m := copy(b, r.pend)
 868 | 			n += m
 869 | 			b = b[m:]
 870 | 			r.pend = r.pend[m:]
 871 | 			continue
 872 | 		}
 873 | 		_, err := io.ReadFull(r.r, r.tmp)
 874 | 		if err != nil {
 875 | 			return n, err
 876 | 		}
 877 | 		if r.tmp[0] != 2 {
 878 | 			return n, fmt.Errorf("malformed PNG-Up encoding")
 879 | 		}
 880 | 		for i, b := range r.tmp {
 881 | 			r.hist[i] += b
 882 | 		}
 883 | 		r.pend = r.hist[1:]
 884 | 	}
 885 | 	return n, nil
 886 | }
 887 | 
// passwordPad is the fixed 32-byte string that initEncrypt uses to pad a
// password shorter than 32 bytes and as input when computing the value it
// compares against the Encrypt dictionary's U entry.
var passwordPad = []byte{
	0x28, 0xBF, 0x4E, 0x5E, 0x4E, 0x75, 0x8A, 0x41, 0x64, 0x00, 0x4E, 0x56, 0xFF, 0xFA, 0x01, 0x08,
	0x2E, 0x2E, 0x00, 0xB6, 0xD0, 0x68, 0x3E, 0x80, 0x2F, 0x0C, 0xA9, 0xFE, 0x64, 0x53, 0x69, 0x7A,
}
 892 | 
// initEncrypt checks password against the document's Encrypt dictionary and,
// if it is accepted, stores the derived encryption key in r.key and records
// in r.useAES whether the dictionary's V entry is 4.
//
// Only the Standard filter is accepted, with V equal to 1, 2, or 4 (the
// latter subject to okayV4) and R between 2 and 4. It returns
// ErrInvalidPassword when the value computed from password does not match
// the dictionary's U entry, and a "malformed PDF" or "unsupported PDF" error
// for dictionaries it cannot handle.
func (r *Reader) initEncrypt(password string) error {
	// See PDF 32000-1:2008, §7.6.
	encrypt, _ := r.resolve(objptr{}, r.trailer["Encrypt"]).data.(dict)
	if encrypt["Filter"] != name("Standard") {
		return fmt.Errorf("unsupported PDF: encryption filter %v", objfmt(encrypt["Filter"]))
	}
	// n is the key length in bits; a missing or zero Length means 40.
	n, _ := encrypt["Length"].(int64)
	if n == 0 {
		n = 40
	}
	if n%8 != 0 || n > 128 || n < 40 {
		return fmt.Errorf("malformed PDF: %d-bit encryption key", n)
	}
	V, _ := encrypt["V"].(int64)
	if V != 1 && V != 2 && (V != 4 || !okayV4(encrypt)) {
		return fmt.Errorf("unsupported PDF: encryption version V=%d; %v", V, objfmt(encrypt))
	}

	// The first element of the trailer's ID array is part of the key material.
	ids, ok := r.trailer["ID"].(array)
	if !ok || len(ids) < 1 {
		return fmt.Errorf("malformed PDF: missing ID in trailer")
	}
	idstr, ok := ids[0].(string)
	if !ok {
		return fmt.Errorf("malformed PDF: missing ID in trailer")
	}
	ID := []byte(idstr)

	R, _ := encrypt["R"].(int64)
	if R < 2 {
		return fmt.Errorf("malformed PDF: encryption revision R=%d", R)
	}
	if R > 4 {
		return fmt.Errorf("unsupported PDF: encryption revision R=%d", R)
	}
	O, _ := encrypt["O"].(string)
	U, _ := encrypt["U"].(string)
	if len(O) != 32 || len(U) != 32 {
		return fmt.Errorf("malformed PDF: missing O= or U= encryption parameters")
	}
	p, _ := encrypt["P"].(int64)
	P := uint32(p)

	// Derive the key: MD5 over the password (truncated or padded to 32
	// bytes), O, the low 32 bits of P in little-endian order, and ID.
	// TODO: Password should be converted to Latin-1.
	pw := []byte(password)
	h := md5.New()
	if len(pw) >= 32 {
		h.Write(pw[:32])
	} else {
		h.Write(pw)
		h.Write(passwordPad[:32-len(pw)])
	}
	h.Write([]byte(O))
	h.Write([]byte{byte(P), byte(P >> 8), byte(P >> 16), byte(P >> 24)})
	h.Write([]byte(ID))
	key := h.Sum(nil)

	if R >= 3 {
		// Re-hash the first n/8 bytes of the digest 50 times, then keep
		// n/8 bytes as the key.
		for i := 0; i < 50; i++ {
			h.Reset()
			h.Write(key[:n/8])
			key = h.Sum(key[:0])
		}
		key = key[:n/8]
	} else {
		// Revision 2 always uses a 40-bit key.
		key = key[:40/8]
	}

	c, err := rc4.NewCipher(key)
	if err != nil {
		return fmt.Errorf("malformed PDF: invalid RC4 key: %v", err)
	}

	// Compute u, the value that U must start with if the password is right.
	var u []byte
	if R == 2 {
		// RC4-encrypt the padding string with the key.
		u = make([]byte, 32)
		copy(u, passwordPad)
		c.XORKeyStream(u, u)
	} else {
		// RC4-encrypt MD5(padding string, ID) with the key, then 19 more
		// times with the key XORed byte-wise with the round number.
		h.Reset()
		h.Write(passwordPad)
		h.Write([]byte(ID))
		u = h.Sum(nil)
		c.XORKeyStream(u, u)

		for i := 1; i <= 19; i++ {
			key1 := make([]byte, len(key))
			copy(key1, key)
			for j := range key1 {
				key1[j] ^= byte(i)
			}
			c, _ = rc4.NewCipher(key1)
			c.XORKeyStream(u, u)
		}
	}

	// Only a prefix comparison: for R >= 3, u is a 16-byte digest while U is
	// 32 bytes long.
	if !bytes.HasPrefix([]byte(U), u) {
		return ErrInvalidPassword
	}

	r.key = key
	r.useAES = V == 4

	return nil
}
 998 | 
// ErrInvalidPassword is returned by initEncrypt when the supplied password
// does not match the document's encryption dictionary.
var ErrInvalidPassword = fmt.Errorf("encrypted PDF: invalid password")
1000 | 
1001 | func okayV4(encrypt dict) bool {
1002 | 	cf, ok := encrypt["CF"].(dict)
1003 | 	if !ok {
1004 | 		return false
1005 | 	}
1006 | 	stmf, ok := encrypt["StmF"].(name)
1007 | 	if !ok {
1008 | 		return false
1009 | 	}
1010 | 	strf, ok := encrypt["StrF"].(name)
1011 | 	if !ok {
1012 | 		return false
1013 | 	}
1014 | 	if stmf != strf {
1015 | 		return false
1016 | 	}
1017 | 	cfparam, ok := cf[stmf].(dict)
1018 | 	if cfparam["AuthEvent"] != nil && cfparam["AuthEvent"] != name("DocOpen") {
1019 | 		return false
1020 | 	}
1021 | 	if cfparam["Length"] != nil && cfparam["Length"] != int64(16) {
1022 | 		return false
1023 | 	}
1024 | 	if cfparam["CFM"] != name("AESV2") {
1025 | 		return false
1026 | 	}
1027 | 	return true
1028 | }
1029 | 
1030 | func cryptKey(key []byte, useAES bool, ptr objptr) []byte {
1031 | 	h := md5.New()
1032 | 	h.Write(key)
1033 | 	h.Write([]byte{byte(ptr.id), byte(ptr.id >> 8), byte(ptr.id >> 16), byte(ptr.gen), byte(ptr.gen >> 8)})
1034 | 	if useAES {
1035 | 		h.Write([]byte("sAlT"))
1036 | 	}
1037 | 	return h.Sum(nil)
1038 | }
1039 | 
1040 | func decryptString(key []byte, useAES bool, ptr objptr, x string) (string, error) {
1041 | 	key = cryptKey(key, useAES, ptr)
1042 | 	if useAES {
1043 | 		return "", errors.New("AES not implemented")
1044 | 	}
1045 | 
1046 | 	c, _ := rc4.NewCipher(key)
1047 | 	data := []byte(x)
1048 | 	c.XORKeyStream(data, data)
1049 | 	return string(data), nil
1050 | }
1051 | 
1052 | func decryptStream(key []byte, useAES bool, ptr objptr, rd io.Reader) io.Reader {
1053 | 	key = cryptKey(key, useAES, ptr)
1054 | 	if useAES {
1055 | 		cb, err := aes.NewCipher(key)
1056 | 		if err != nil {
1057 | 			return &errorReadCloser{errors.New("AES: " + err.Error())}
1058 | 		}
1059 | 		iv := make([]byte, 16)
1060 | 		io.ReadFull(rd, iv)
1061 | 		cbc := cipher.NewCBCDecrypter(cb, iv)
1062 | 		rd = &cbcReader{cbc: cbc, rd: rd, buf: make([]byte, 16)}
1063 | 	} else {
1064 | 		c, _ := rc4.NewCipher(key)
1065 | 		rd = &cipher.StreamReader{S: c, R: rd}
1066 | 	}
1067 | 	return rd
1068 | }
1069 | 
// cbcReader decrypts a stream with a block cipher mode, one buffer-sized
// chunk at a time. All decrypted bytes are passed through as is; no padding
// is removed.
type cbcReader struct {
	cbc  cipher.BlockMode // decrypter applied to each chunk
	rd   io.Reader        // source of encrypted data
	buf  []byte           // holds one chunk; decrypted in place
	pend []byte           // decrypted bytes not yet returned
}

// Read returns buffered decrypted bytes if there are any; otherwise it reads
// exactly one full chunk from the source, decrypts it, and returns as much
// of it as fits in b. A source error (including a short final chunk) is
// returned with a count of 0.
func (r *cbcReader) Read(b []byte) (n int, err error) {
	if len(r.pend) > 0 {
		n = copy(b, r.pend)
		r.pend = r.pend[n:]
		return n, nil
	}

	if _, err = io.ReadFull(r.rd, r.buf); err != nil {
		return 0, err
	}
	r.cbc.CryptBlocks(r.buf, r.buf)
	n = copy(b, r.buf)
	r.pend = r.buf[n:]
	return n, nil
}
1090 | 


--------------------------------------------------------------------------------
/text.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2014 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package pdf
  6 | 
  7 | import (
  8 | 	"unicode"
  9 | 	"unicode/utf16"
 10 | )
 11 | 
// noRune marks a byte value that has no character assigned in an encoding
// table; isPDFDocEncoded rejects strings containing such bytes.
const noRune = unicode.ReplacementChar
 13 | 
 14 | func isPDFDocEncoded(s string) bool {
 15 | 	if isUTF16(s) {
 16 | 		return false
 17 | 	}
 18 | 	for i := 0; i < len(s); i++ {
 19 | 		if pdfDocEncoding[s[i]] == noRune {
 20 | 			return false
 21 | 		}
 22 | 	}
 23 | 	return true
 24 | }
 25 | 
 26 | func pdfDocDecode(s string) string {
 27 | 	for i := 0; i < len(s); i++ {
 28 | 		if s[i] >= 0x80 || pdfDocEncoding[s[i]] != rune(s[i]) {
 29 | 			goto Decode
 30 | 		}
 31 | 	}
 32 | 	return s
 33 | 
 34 | Decode:
 35 | 	r := make([]rune, len(s))
 36 | 	for i := 0; i < len(s); i++ {
 37 | 		r[i] = pdfDocEncoding[s[i]]
 38 | 	}
 39 | 	return string(r)
 40 | }
 41 | 
 42 | func isUTF16(s string) bool {
 43 | 	return len(s) >= 2 && s[0] == 0xfe && s[1] == 0xff && len(s)%2 == 0
 44 | }
 45 | 
 46 | func utf16Decode(s string) string {
 47 | 	var u []uint16
 48 | 	for i := 0; i < len(s); i += 2 {
 49 | 		u = append(u, uint16(s[i])<<8|uint16(s[i+1]))
 50 | 	}
 51 | 	return string(utf16.Decode(u))
 52 | }
 53 | 
// pdfDocEncoding maps each byte value (the array index) to the rune it
// represents in PDFDocEncoding. Entries equal to noRune mark byte values
// with no assigned character.
// See PDF 32000-1:2008, Table D.2
var pdfDocEncoding = [256]rune{
	// 0x00-0x1f
	noRune, noRune, noRune, noRune, noRune, noRune, noRune, noRune,
	noRune, 0x0009, 0x000a, noRune, noRune, 0x000d, noRune, noRune,
	noRune, noRune, noRune, noRune, noRune, noRune, noRune, noRune,
	0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
	// 0x20-0x7f: identical to ASCII, except that 0x7f is unassigned
	0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
	0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
	0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
	0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, noRune,
	// 0x80-0x9f
	0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
	0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
	0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
	0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, noRune,
	// 0xa0-0xff: same as the byte value, except 0xa0 (U+20AC) and the
	// unassigned 0xad
	0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
	0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, noRune, 0x00ae, 0x00af,
	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
	0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
	0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
	0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
	0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
	0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
	0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
	0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
	0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
}
 89 | 
// winAnsiEncoding maps each byte value (the array index) to a rune. Entries
// equal to noRune have no assigned character.
// NOTE(review): presumably the table for fonts using the PDF WinAnsiEncoding;
// no use is visible in this part of the file, so confirm against callers.
var winAnsiEncoding = [256]rune{
	// 0x00-0x7f: same as the byte value
	0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
	0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
	0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
	0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
	0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
	0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
	0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
	0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
	// 0x80-0x9f
	0x20ac, noRune, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
	0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, noRune, 0x017d, noRune,
	noRune, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
	0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, noRune, 0x017e, 0x0178,
	// 0xa0-0xff: same as the byte value
	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
	0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
	0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
	0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
	0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
	0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
	0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
	0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
	0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
	0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
}
124 | 
// macRomanEncoding maps each byte value (the array index) to a rune. Every
// byte value has an assigned rune in this table.
// NOTE(review): presumably the table for fonts using the PDF
// MacRomanEncoding; no use is visible in this part of the file, so confirm
// against callers.
var macRomanEncoding = [256]rune{
	// 0x00-0x7f: same as the byte value
	0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
	0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
	0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
	0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
	0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
	0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
	0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
	0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
	// 0x80-0xff
	0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1,
	0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8,
	0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3,
	0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc,
	0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df,
	0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8,
	0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211,
	0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8,
	0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab,
	0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153,
	0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca,
	0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02,
	0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1,
	0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4,
	0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc,
	0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,
}
159 | 


--------------------------------------------------------------------------------