├── Conversion_test.go ├── DOC 2 Text.go ├── DOCX 2 Text.go ├── Decompress.go ├── EPUB 2 Text.go ├── HTML 2 Text.go ├── LICENSE ├── MBOX.go ├── MOBI 2 Text.go ├── ODS 2 Text.go ├── ODT 2 Text.go ├── PDF 2 Image.go ├── PDF 2 Text.go ├── PPT 2 Text.go ├── PPTX 2 Text.go ├── Picture.go ├── README.md ├── RTF 2 Text.go ├── XLS 2 Text.go ├── XLSX 2 Text.go ├── ZIP.go ├── html2text ├── README.md ├── html2text.go ├── html2text_test.go └── testdata │ ├── utf8.html │ └── utf8_with_bom.xhtml ├── odf ├── Readme.md ├── meta.go ├── meta_test.go ├── ods │ ├── ods_test.go │ ├── read.go │ └── test.ods └── read.go ├── ole2 ├── README.md ├── dir.go ├── header.go ├── ole.go ├── pss.go ├── sector.go ├── stream_reader.go └── stream_reader_test.go └── xls ├── README.md ├── bigtable_test.go ├── bof.go ├── cell_range.go ├── col.go ├── comparexlsxlsx.go ├── date.go ├── doc.go ├── example_test.go ├── font.go ├── format.go ├── issue47_test.go ├── row.go ├── sst.go ├── workbook.go ├── worksheet.go ├── xf.go ├── xls.go └── xls_test.go /Conversion_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: Conversion_test.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 
4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import ( 10 | "bytes" 11 | "fmt" 12 | "io/ioutil" 13 | "os" 14 | "testing" 15 | ) 16 | 17 | func TestXLS(t *testing.T) { 18 | // open local file to extract text and output to command line 19 | file, err := os.Open("test.xls") 20 | if err != nil { 21 | return 22 | } 23 | 24 | defer file.Close() 25 | 26 | XLS2Text(file, os.Stdout, 1*1024*1024) 27 | } 28 | 29 | func TestPPTX(t *testing.T) { 30 | // open local file to extract text and output to command line 31 | file, err := os.Open("test.pptx") 32 | if err != nil { 33 | return 34 | } 35 | 36 | defer file.Close() 37 | 38 | stat, _ := file.Stat() 39 | 40 | text, _ := PPTX2Text(file, stat.Size()) 41 | fmt.Print(text) 42 | } 43 | 44 | func TestODS(t *testing.T) { 45 | // open local file to extract text and output to command line 46 | file, err := os.Open("test.ods") 47 | if err != nil { 48 | return 49 | } 50 | 51 | defer file.Close() 52 | stat, _ := file.Stat() 53 | 54 | ODS2Text(file, stat.Size(), os.Stdout, 1*1024*1024) 55 | } 56 | 57 | func TestExcelCell(t *testing.T) { 58 | file1, err := os.Open("test.xls") 59 | if err == nil { 60 | cells, _ := XLS2Cells(file1) 61 | for n, cell := range cells { 62 | fmt.Printf("%s\n", cell) 63 | if n > 20 { 64 | break 65 | } 66 | } 67 | 68 | file1.Close() 69 | } 70 | 71 | file1, err = os.Open("test.xlsx") 72 | if err == nil { 73 | stat, _ := file1.Stat() 74 | cells, _ := XLSX2Cells(file1, stat.Size(), 1000) 75 | for n, cell := range cells { 76 | fmt.Printf("%s\n", cell) 77 | if n > 20 { 78 | break 79 | } 80 | } 81 | 82 | file1.Close() 83 | } 84 | 85 | file1, err = os.Open("test.ods") 86 | if err == nil { 87 | stat, _ := file1.Stat() 88 | cells, _ := ODS2Cells(file1, stat.Size()) 89 | for n, cell := range cells { 90 | fmt.Printf("%s\n", cell) 91 | if n > 20 { 92 | break 93 | } 94 | } 95 | 96 | file1.Close() 97 | } 98 | 99 | } 100 | 101 | func TestCSV(t *testing.T) { 102 | file, err := os.Open("test.txt") 103 | if 
err != nil { 104 | return 105 | } 106 | defer file.Close() 107 | 108 | content, _ := ioutil.ReadAll(file) 109 | 110 | IsCSV(content) 111 | } 112 | 113 | func TestEPUB(t *testing.T) { 114 | // open local file to extract text and output to command line 115 | file, err := os.Open("moby-dick.epub") 116 | if err != nil { 117 | return 118 | } 119 | 120 | defer file.Close() 121 | 122 | stat, _ := file.Stat() 123 | 124 | text, _ := EPUB2Text(file, stat.Size(), 1000) 125 | fmt.Print(text) 126 | } 127 | 128 | func TestMOBI(t *testing.T) { 129 | // open local file to extract text and output to command line 130 | file, err := os.Open("windows-1252.mobi") 131 | if err != nil { 132 | return 133 | } 134 | 135 | defer file.Close() 136 | 137 | text, _ := Mobi2Text(file) 138 | fmt.Print(text) 139 | } 140 | 141 | func TestPDFImage(t *testing.T) { 142 | // open local file to extract images 143 | file, err := os.Open("test.pdf") 144 | if err != nil { 145 | return 146 | } 147 | 148 | defer file.Close() 149 | 150 | images, _ := PDFExtractImages(file) 151 | fmt.Print(len(images)) 152 | } 153 | 154 | func TestPD2Text(t *testing.T) { 155 | file, err := os.Open("1.pdf") 156 | if err != nil { 157 | return 158 | } 159 | 160 | defer file.Close() 161 | 162 | buffer := bytes.NewBuffer(make([]byte, 0, 2*1024)) 163 | PDFListContentStreams(file, buffer, 2*1024) 164 | 165 | fmt.Println(buffer.String()) 166 | } 167 | 168 | func TestODTText(t *testing.T) { 169 | file, err := os.Open("Test\\file-sample_500kB.odt") 170 | if err != nil { 171 | return 172 | } 173 | 174 | defer file.Close() 175 | stat, _ := file.Stat() 176 | 177 | buffer := bytes.NewBuffer(make([]byte, 0, 2*1024)) 178 | 179 | ODT2Text(file, stat.Size(), buffer, 2*1024) 180 | 181 | fmt.Println(buffer.String()) 182 | } 183 | 184 | // TestXLSX extracts text from an XLSX file. 
185 | // Memory usage: 100 rows = 52 MB, 500 rows = 200 MB, 1000 rows = 400 MB, 2000/5000/10000/-1 rows = 700 MB 186 | func TestXLSX(t *testing.T) { 187 | file, err := os.Open("Test\\971bd55b-5cbd-43d2-899e-d4a2a7d0a883.xlsx") 188 | if err != nil { 189 | return 190 | } 191 | 192 | defer file.Close() 193 | stat, _ := file.Stat() 194 | 195 | buffer := bytes.NewBuffer(make([]byte, 0, 2*1024)) 196 | 197 | XLSX2Text(file, stat.Size(), buffer, 2*1024, -1) 198 | 199 | fmt.Println(buffer.String()) 200 | } 201 | -------------------------------------------------------------------------------- /DOC 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: DOC 2 Text.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | This code is forked from https://github.com/EndFirstCorp/doc2txt and extracts text from DOC files, the legacy binary Word files. 7 | */ 8 | 9 | package fileconversion 10 | 11 | import ( 12 | "bytes" 13 | "encoding/binary" 14 | "errors" 15 | "io" 16 | "unicode/utf16" 17 | "unicode/utf8" 18 | 19 | "github.com/mattetti/filebuffer" 20 | "github.com/richardlehane/mscfb" 21 | ) 22 | 23 | // ---- file doc.go ---- 24 | // There were a few changes in this file to actually support Unicode which the old code was not. 
25 | 26 | var ( 27 | errTable = errors.New("cannot find table stream") 28 | errDocEmpty = errors.New("WordDocument not found") 29 | errDocShort = errors.New("wordDoc block too short") 30 | errInvalidArgument = errors.New("invalid table and/or fib") 31 | ) 32 | 33 | type allReader interface { 34 | io.Closer 35 | io.ReaderAt 36 | io.ReadSeeker 37 | } 38 | 39 | func wrapError(e error) error { 40 | return errors.New("Error processing file: " + e.Error()) 41 | } 42 | 43 | // DOC2Text converts a standard io.Reader from a Microsoft Word .doc binary file and returns a reader (actually a bytes.Buffer) which will output the plain text found in the .doc file 44 | func DOC2Text(r io.Reader) (io.Reader, error) { 45 | ra, ok := r.(io.ReaderAt) 46 | if !ok { 47 | ra, _, err := toMemoryBuffer(r) 48 | if err != nil { 49 | return nil, wrapError(err) 50 | } 51 | defer ra.Close() 52 | } 53 | 54 | d, err := mscfb.New(ra) 55 | if err != nil { 56 | return nil, wrapError(err) 57 | } 58 | 59 | wordDoc, table0, table1 := getWordDocAndTables(d) 60 | fib, err := getFib(wordDoc) 61 | if err != nil { 62 | return nil, wrapError(err) 63 | } 64 | 65 | table := getActiveTable(table0, table1, fib) 66 | if table == nil { 67 | return nil, wrapError(errTable) 68 | } 69 | 70 | clx, err := getClx(table, fib) 71 | if err != nil { 72 | return nil, wrapError(err) 73 | } 74 | 75 | return getText(wordDoc, clx) 76 | } 77 | 78 | func toMemoryBuffer(r io.Reader) (allReader, int64, error) { 79 | var b bytes.Buffer 80 | size, err := b.ReadFrom(r) 81 | if err != nil { 82 | return nil, 0, err 83 | } 84 | fb := filebuffer.New(b.Bytes()) 85 | return fb, size, nil 86 | } 87 | 88 | func getText(wordDoc *mscfb.File, clx *clx) (io.Reader, error) { 89 | var buf bytes.Buffer 90 | for i := 0; i < len(clx.pcdt.PlcPcd.aPcd); i++ { 91 | pcd := clx.pcdt.PlcPcd.aPcd[i] 92 | cp := clx.pcdt.PlcPcd.aCP[i] 93 | cpNext := clx.pcdt.PlcPcd.aCP[i+1] 94 | 95 | var start, end int 96 | // 
https://msdn.microsoft.com/ko-kr/library/office/gg615596(v=office.14).aspx 97 | // Read the value of the Pcd.Fc.fCompressed field at bit 46 of the current Pcd structure. If 0, the Pcd structure refers to a 16-bit Unicode character. If 1, it refers to an 8-bit ANSI character. 98 | if pcd.fc.fCompressed { 99 | start = pcd.fc.fc / 2 100 | end = start + cpNext - cp 101 | } else { 102 | // -> 16-bit Unicode characters 103 | start = pcd.fc.fc 104 | end = start + 2*(cpNext-cp) 105 | } 106 | 107 | b := make([]byte, end-start) 108 | _, err := wordDoc.ReadAt(b, int64(start)) // read all the characters 109 | if err != nil { 110 | return nil, err 111 | } 112 | translateText(b, &buf, pcd.fc.fCompressed) 113 | } 114 | return &buf, nil 115 | } 116 | 117 | // translateText translates the buffer into text. fCompressed = 0 for 16-bit Unicode, 1 = 8-bit ANSI characters. 118 | func translateText(b []byte, buf *bytes.Buffer, fCompressed bool) { 119 | u16s := make([]uint16, 1) 120 | b8buf := make([]byte, 4) 121 | 122 | fieldLevel := 0 123 | var isFieldChar bool 124 | for cIndex := range b { 125 | // Convert to rune 126 | var char rune 127 | if fCompressed { 128 | // ANSI, 1 byte 129 | char = rune(b[cIndex]) 130 | } else { 131 | // 16-bit Unicode: skip every second byte 132 | if cIndex%2 != 0 { 133 | continue 134 | } else if (cIndex + 1) >= len(b) { // make sure there are at least 2 bytes for Unicode decoding 135 | continue 136 | } 137 | 138 | // convert from UTF16 to UTF8 139 | u16s[0] = uint16(b[cIndex]) + (uint16(b[cIndex+1]) << 8) 140 | r := utf16.Decode(u16s) 141 | if len(r) != 1 { 142 | //fmt.Printf("Invalid rune %v\n", r) 143 | continue 144 | } 145 | char = r[0] 146 | } 147 | 148 | // Handle special field characters (section 2.8.25) 149 | if char == 0x13 { 150 | isFieldChar = true 151 | fieldLevel++ 152 | continue 153 | } else if char == 0x14 { 154 | isFieldChar = false 155 | continue 156 | } else if char == 0x15 { 157 | isFieldChar = false 158 | continue 159 | } else if 
isFieldChar { 160 | continue 161 | } 162 | 163 | if char == 7 { // table column separator 164 | buf.WriteByte(' ') 165 | continue 166 | } else if char < 32 && char != 9 && char != 10 && char != 13 { // skip non-printable ASCII characters 167 | //buf.Write([]byte(fmt.Sprintf("|%#x|", char))) 168 | continue 169 | } 170 | 171 | if fCompressed { // compressed, so replace compressed characters 172 | buf.Write(replaceCompressed(byte(char))) 173 | } else { 174 | // encode the rune to UTF-8 175 | n := utf8.EncodeRune(b8buf, char) 176 | buf.Write(b8buf[:n]) 177 | } 178 | } 179 | } 180 | 181 | func replaceCompressed(char byte) []byte { 182 | var v uint16 183 | switch char { 184 | case 0x82: 185 | v = 0x201A 186 | case 0x83: 187 | v = 0x0192 188 | case 0x84: 189 | v = 0x201E 190 | case 0x85: 191 | v = 0x2026 192 | case 0x86: 193 | v = 0x2020 194 | case 0x87: 195 | v = 0x2021 196 | case 0x88: 197 | v = 0x02C6 198 | case 0x89: 199 | v = 0x2030 200 | case 0x8A: 201 | v = 0x0160 202 | case 0x8B: 203 | v = 0x2039 204 | case 0x8C: 205 | v = 0x0152 206 | case 0x91: 207 | v = 0x2018 208 | case 0x92: 209 | v = 0x2019 210 | case 0x93: 211 | v = 0x201C 212 | case 0x94: 213 | v = 0x201D 214 | case 0x95: 215 | v = 0x2022 216 | case 0x96: 217 | v = 0x2013 218 | case 0x97: 219 | v = 0x2014 220 | case 0x98: 221 | v = 0x02DC 222 | case 0x99: 223 | v = 0x2122 224 | case 0x9A: 225 | v = 0x0161 226 | case 0x9B: 227 | v = 0x203A 228 | case 0x9C: 229 | v = 0x0153 230 | case 0x9F: 231 | v = 0x0178 232 | default: 233 | return []byte{char} 234 | } 235 | out := make([]byte, 2) 236 | binary.LittleEndian.PutUint16(out, v) 237 | return out 238 | } 239 | 240 | func getWordDocAndTables(r *mscfb.Reader) (*mscfb.File, *mscfb.File, *mscfb.File) { 241 | var wordDoc, table0, table1 *mscfb.File 242 | for i := 0; i < len(r.File); i++ { 243 | stream := r.File[i] 244 | 245 | switch stream.Name { 246 | case "WordDocument": 247 | wordDoc = stream 248 | case "0Table": 249 | table0 = stream 250 | case "1Table": 251 | 
table1 = stream 252 | } 253 | } 254 | return wordDoc, table0, table1 255 | } 256 | 257 | func getActiveTable(table0 *mscfb.File, table1 *mscfb.File, f *fib) *mscfb.File { 258 | if f.base.fWhichTblStm == 0 { 259 | return table0 260 | } 261 | return table1 262 | } 263 | 264 | // ---- file fib.go ---- 265 | 266 | var ( 267 | errFibInvalid = errors.New("file information block validation failed") 268 | ) 269 | 270 | type fib struct { 271 | base fibBase 272 | csw int 273 | fibRgW fibRgW 274 | cslw int 275 | fibRgLw fibRgLw 276 | cbRgFcLcb int 277 | fibRgFcLcb fibRgFcLcb 278 | } 279 | 280 | type fibBase struct { 281 | fWhichTblStm int 282 | } 283 | 284 | type fibRgW struct { 285 | } 286 | 287 | type fibRgLw struct { 288 | ccpText int 289 | ccpFtn int 290 | ccpHdd int 291 | ccpMcr int 292 | ccpAtn int 293 | ccpEdn int 294 | ccpTxbx int 295 | ccpHdrTxbx int 296 | cpLength int 297 | } 298 | 299 | type fibRgFcLcb struct { 300 | fcPlcfFldMom int 301 | lcbPlcfFldMom int 302 | fcPlcfFldHdr int 303 | lcbPlcfFldHdr int 304 | fcPlcfFldFtn int 305 | lcbPlcfFldFtn int 306 | fcPlcfFldAtn int 307 | lcbPlcfFldAtn int 308 | fcClx int 309 | lcbClx int 310 | } 311 | 312 | // parse File Information Block (section 2.5.1) 313 | func getFib(wordDoc *mscfb.File) (*fib, error) { 314 | if wordDoc == nil { 315 | return nil, errDocEmpty 316 | } 317 | 318 | b := make([]byte, 898) // get FIB block up to FibRgFcLcb97 319 | _, err := wordDoc.ReadAt(b, 0) 320 | if err != nil { 321 | return nil, err 322 | } 323 | 324 | fibBase := getFibBase(b[0:32]) 325 | 326 | fibRgW, csw, err := getFibRgW(b, 32) 327 | if err != nil { 328 | return nil, err 329 | } 330 | 331 | fibRgLw, cslw, err := getFibRgLw(b, 34+csw) 332 | if err != nil { 333 | return nil, err 334 | } 335 | 336 | fibRgFcLcb, cbRgFcLcb, err := getFibRgFcLcb(b, 34+csw+2+cslw) 337 | 338 | return &fib{base: *fibBase, csw: csw, cslw: cslw, fibRgW: *fibRgW, fibRgLw: *fibRgLw, fibRgFcLcb: *fibRgFcLcb, cbRgFcLcb: cbRgFcLcb}, err 339 | } 340 | 341 | // parse 
FibBase (section 2.5.2) 342 | func getFibBase(fib []byte) *fibBase { 343 | byt := fib[11] // fWhichTblStm is 2nd highest bit in this byte 344 | fWhichTblStm := int(byt >> 1 & 1) // set which table (0Table or 1Table) is the table stream 345 | return &fibBase{fWhichTblStm: fWhichTblStm} 346 | } 347 | 348 | func getFibRgW(fib []byte, start int) (*fibRgW, int, error) { 349 | if start+2 >= len(fib) { // must be big enough for csw 350 | return &fibRgW{}, 0, errFibInvalid 351 | } 352 | 353 | csw := int(binary.LittleEndian.Uint16(fib[start:start+2])) * 2 // in bytes 354 | return &fibRgW{}, csw, nil 355 | } 356 | 357 | // parse FibRgLw (section 2.5.4) 358 | func getFibRgLw(fib []byte, start int) (*fibRgLw, int, error) { 359 | fibRgLwStart := start + 2 // skip cslw 360 | if fibRgLwStart+88 >= len(fib) { // expect 88 bytes in fibRgLw 361 | return &fibRgLw{}, 0, errFibInvalid 362 | } 363 | 364 | cslw := getInt16(fib, start) * 4 // in bytes 365 | ccpText := getInt(fib, fibRgLwStart+3*4) 366 | ccpFtn := getInt(fib, fibRgLwStart+4*4) 367 | ccpHdd := getInt(fib, fibRgLwStart+5*4) 368 | ccpMcr := getInt(fib, fibRgLwStart+6*4) 369 | ccpAtn := getInt(fib, fibRgLwStart+7*4) 370 | ccpEdn := getInt(fib, fibRgLwStart+8*4) 371 | ccpTxbx := getInt(fib, fibRgLwStart+9*4) 372 | ccpHdrTxbx := getInt(fib, fibRgLwStart+10*4) 373 | 374 | // calculate cpLength. 
Used in PlcPcd verification (see section 2.8.35) 375 | var cpLength int 376 | if ccpFtn != 0 || ccpHdd != 0 || ccpMcr != 0 || ccpAtn != 0 || ccpEdn != 0 || ccpTxbx != 0 || ccpHdrTxbx != 0 { 377 | cpLength = ccpFtn + ccpHdd + ccpMcr + ccpAtn + ccpEdn + ccpTxbx + ccpHdrTxbx + ccpText + 1 378 | } else { 379 | cpLength = ccpText 380 | } 381 | return &fibRgLw{ccpText: ccpText, ccpFtn: ccpFtn, ccpHdd: ccpHdd, ccpMcr: ccpMcr, ccpAtn: ccpAtn, 382 | ccpEdn: ccpEdn, ccpTxbx: ccpTxbx, ccpHdrTxbx: ccpHdrTxbx, cpLength: cpLength}, cslw, nil 383 | } 384 | 385 | // parse FibRgFcLcb (section 2.5.5) 386 | func getFibRgFcLcb(fib []byte, start int) (*fibRgFcLcb, int, error) { 387 | fibRgFcLcbStart := start + 2 // skip cbRgFcLcb 388 | if fibRgFcLcbStart+186*4 < len(fib) { // expect 186+ values in FibRgFcLcb 389 | return &fibRgFcLcb{}, 0, errFibInvalid 390 | } 391 | 392 | cbRgFcLcb := getInt16(fib, start) 393 | fcPlcfFldMom := getInt(fib, fibRgFcLcbStart+32*4) 394 | lcbPlcfFldMom := getInt(fib, fibRgFcLcbStart+33*4) 395 | fcPlcfFldHdr := getInt(fib, fibRgFcLcbStart+34*4) 396 | lcbPlcfFldHdr := getInt(fib, fibRgFcLcbStart+35*4) 397 | fcPlcfFldFtn := getInt(fib, fibRgFcLcbStart+36*4) 398 | lcbPlcfFldFtn := getInt(fib, fibRgFcLcbStart+37*4) 399 | fcPlcfFldAtn := getInt(fib, fibRgFcLcbStart+38*4) 400 | lcbPlcfFldAtn := getInt(fib, fibRgFcLcbStart+39*4) 401 | fcClx := getInt(fib, fibRgFcLcbStart+66*4) 402 | lcbClx := getInt(fib, fibRgFcLcbStart+67*4) 403 | return &fibRgFcLcb{fcPlcfFldMom: fcPlcfFldMom, lcbPlcfFldMom: lcbPlcfFldMom, fcPlcfFldHdr: fcPlcfFldHdr, lcbPlcfFldHdr: lcbPlcfFldHdr, 404 | fcPlcfFldFtn: fcPlcfFldFtn, lcbPlcfFldFtn: lcbPlcfFldFtn, fcPlcfFldAtn: fcPlcfFldAtn, lcbPlcfFldAtn: lcbPlcfFldAtn, 405 | fcClx: fcClx, lcbClx: lcbClx}, cbRgFcLcb, nil 406 | } 407 | 408 | func getInt16(buf []byte, start int) int { 409 | return int(binary.LittleEndian.Uint16(buf[start : start+2])) 410 | } 411 | func getInt(buf []byte, start int) int { 412 | return 
int(binary.LittleEndian.Uint32(buf[start : start+4])) 413 | } 414 | 415 | // ---- file clx.go ---- 416 | 417 | var ( 418 | errInvalidPrc = errors.New("Invalid Prc structure") 419 | errInvalidClx = errors.New("expected last aCP value to equal fib.cpLength (2.8.35)") 420 | errInvalidPcdt = errors.New("expected clxt to be equal 0x02") 421 | ) 422 | 423 | type clx struct { 424 | pcdt pcdt 425 | } 426 | 427 | type pcdt struct { 428 | lcb int 429 | PlcPcd plcPcd 430 | } 431 | 432 | type plcPcd struct { 433 | aCP []int 434 | aPcd []pcd 435 | } 436 | 437 | type pcd struct { 438 | fc fcCompressed 439 | } 440 | 441 | type fcCompressed struct { 442 | fc int 443 | fCompressed bool 444 | } 445 | 446 | // read Clx (section 2.9.38) 447 | func getClx(table *mscfb.File, fib *fib) (*clx, error) { 448 | if table == nil || fib == nil { 449 | return nil, errInvalidArgument 450 | } 451 | b, err := readClx(table, fib) 452 | if err != nil { 453 | return nil, err 454 | } 455 | 456 | pcdtOffset, err := getPrcArrayEnd(b) 457 | if err != nil { 458 | return nil, err 459 | } 460 | 461 | pcdt, err := getPcdt(b, pcdtOffset) 462 | if err != nil { 463 | return nil, err 464 | } 465 | 466 | if pcdt.PlcPcd.aCP[len(pcdt.PlcPcd.aCP)-1] != fib.fibRgLw.cpLength { 467 | return nil, errInvalidClx 468 | } 469 | 470 | return &clx{pcdt: *pcdt}, nil 471 | } 472 | 473 | func readClx(table *mscfb.File, fib *fib) ([]byte, error) { 474 | b := make([]byte, fib.fibRgFcLcb.lcbClx) 475 | _, err := table.ReadAt(b, int64(fib.fibRgFcLcb.fcClx)) 476 | if err != nil { 477 | return nil, err 478 | } 479 | return b, nil 480 | } 481 | 482 | // read Pcdt from Clx (section 2.9.178) 483 | func getPcdt(clx []byte, pcdtOffset int) (*pcdt, error) { 484 | const pcdSize = 8 485 | if pcdtOffset < 0 || pcdtOffset+5 >= len(clx) { 486 | return nil, errInvalidPcdt 487 | } 488 | if clx[pcdtOffset] != 0x02 { // clxt must be 0x02 or invalid 489 | return nil, errInvalidPcdt 490 | } 491 | lcb := int(binary.LittleEndian.Uint32(clx[pcdtOffset+1 : 
pcdtOffset+5])) // skip clxt, get lcb 492 | plcPcdOffset := pcdtOffset + 5 // skip clxt and lcb 493 | numPcds := (lcb - 4) / (4 + pcdSize) // see 2.2.2 in the spec for equation 494 | numCps := numPcds + 1 // always 1 more cp than pcds 495 | 496 | cps := make([]int, numCps) 497 | for i := 0; i < numCps; i++ { 498 | cpOffset := plcPcdOffset + i*4 499 | if cpOffset < 0 || cpOffset+4 >= len(clx) { 500 | return nil, errInvalidPcdt 501 | } 502 | cps[i] = int(binary.LittleEndian.Uint32(clx[cpOffset : cpOffset+4])) 503 | } 504 | 505 | pcdStart := plcPcdOffset + 4*numCps 506 | pcds := make([]pcd, numPcds) 507 | for i := 0; i < numPcds; i++ { 508 | pcdOffset := pcdStart + i*pcdSize 509 | if pcdOffset < 0 || pcdOffset+pcdSize >= len(clx) { 510 | return nil, errInvalidPcdt 511 | } 512 | pcds[i] = *parsePcd(clx[pcdOffset : pcdOffset+pcdSize]) 513 | } 514 | return &pcdt{lcb: lcb, PlcPcd: plcPcd{aCP: cps, aPcd: pcds}}, nil 515 | } 516 | 517 | // find end of RgPrc array (section 2.9.38) 518 | func getPrcArrayEnd(clx []byte) (int, error) { 519 | prcOffset := 0 520 | count := 0 521 | for { 522 | clxt := clx[prcOffset] 523 | if clxt != 0x01 { // this is not a Prc, so exit 524 | return prcOffset, nil 525 | } 526 | prcDataCbGrpprl := binary.LittleEndian.Uint16(clx[prcOffset+1 : prcOffset+3]) // skip the clxt and read 2 bytes 527 | prcOffset += 1 + 2 + int(prcDataCbGrpprl) // skip clxt, cbGrpprl, and GrpPrl 528 | 529 | if count > 10000 || prcDataCbGrpprl <= 0 || prcOffset+3 > len(clx) { // ensure no infinite loop 530 | return 0, errInvalidPrc 531 | } 532 | count++ 533 | } 534 | } 535 | 536 | // parse Pcd (section 2.9.177) 537 | func parsePcd(pcdData []byte) *pcd { 538 | return &pcd{fc: *parseFcCompressed(pcdData[2:6])} 539 | } 540 | 541 | // parse FcCompressed (section 2.9.73) 542 | func parseFcCompressed(fcData []byte) *fcCompressed { 543 | fCompressed := fcData[3]&64 == 64 // check fcompressed value (second bit from lestmost of the last byte in fcdata) 544 | fcData[3] = fcData[3] & 63 
// clear the fcompressed value from data 545 | fc := binary.LittleEndian.Uint32(fcData) // word doc generally uses little endian order (1.3.7) 546 | return &fcCompressed{fc: int(fc), fCompressed: fCompressed} 547 | } 548 | 549 | // IsFileDOC checks if the data indicates a DOC file 550 | // DOC has multiple signature according to https://filesignatures.net/index.php?search=doc&mode=EXT, D0 CF 11 E0 A1 B1 1A E1 551 | func IsFileDOC(data []byte) bool { 552 | return bytes.HasPrefix(data, []byte{0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1}) 553 | } 554 | -------------------------------------------------------------------------------- /DOCX 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: DOCX 2 Text.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | This code is forked from https://github.com/guylaor/goword and extracts text from DOCX files. 7 | */ 8 | 9 | package fileconversion 10 | 11 | import ( 12 | "archive/zip" 13 | "bytes" 14 | "encoding/xml" 15 | "fmt" 16 | "io" 17 | "io/ioutil" 18 | "strings" 19 | ) 20 | 21 | // models.go 22 | 23 | // WordDocument is a full word doc 24 | type WordDocument struct { 25 | Paragraphs []WordParagraph 26 | } 27 | 28 | // WordParagraph is a single paragraph 29 | type WordParagraph struct { 30 | Style WordStyle `xml:"pPr>pStyle"` 31 | Rows []WordRow `xml:"r"` 32 | } 33 | 34 | // WordStyle ... 35 | type WordStyle struct { 36 | Val string `xml:"val,attr"` 37 | } 38 | 39 | // WordRow ... 40 | type WordRow struct { 41 | Text string `xml:"t"` 42 | } 43 | 44 | // AsText returns all text in the document 45 | func (w WordDocument) AsText() string { 46 | text := "" 47 | for _, v := range w.Paragraphs { 48 | for _, rv := range v.Rows { 49 | text += rv.Text 50 | } 51 | text += "\n" 52 | } 53 | return text 54 | } 55 | 56 | // goword.go 57 | 58 | // DOCX2Text extracts text of a Word document 59 | // Size is the full size of the input file. 
60 | func DOCX2Text(file io.ReaderAt, size int64) (string, error) { 61 | 62 | doc, err := openWordFile(file, size) 63 | if err != nil { 64 | return "", err 65 | } 66 | 67 | docx, err := WordParse(doc) 68 | if err != nil { 69 | return "", err 70 | } 71 | 72 | return docx.AsText(), nil 73 | } 74 | 75 | // WordParse parses a word file 76 | func WordParse(doc string) (WordDocument, error) { 77 | 78 | docx := WordDocument{} 79 | r := strings.NewReader(string(doc)) 80 | decoder := xml.NewDecoder(r) 81 | 82 | for { 83 | t, _ := decoder.Token() 84 | if t == nil { 85 | break 86 | } 87 | switch se := t.(type) { 88 | case xml.StartElement: 89 | if se.Name.Local == "p" { 90 | var p WordParagraph 91 | decoder.DecodeElement(&p, &se) 92 | docx.Paragraphs = append(docx.Paragraphs, p) 93 | } 94 | } 95 | } 96 | return docx, nil 97 | } 98 | 99 | func openWordFile(file io.ReaderAt, size int64) (string, error) { 100 | 101 | // Open a zip archive for reading. word files are zip archives 102 | r, err := zip.NewReader(file, size) 103 | if err != nil { 104 | return "", err 105 | } 106 | 107 | // Iterate through the files in the archive, 108 | // find document.xml 109 | for _, f := range r.File { 110 | 111 | //fmt.Printf("Contents of %s:\n", f.Name) 112 | rc, err := f.Open() 113 | if err != nil { 114 | return "", err 115 | } 116 | defer rc.Close() 117 | if f.Name == "word/document.xml" { 118 | doc, err := ioutil.ReadAll(rc) 119 | if err != nil { 120 | return "", err 121 | } 122 | return fmt.Sprintf("%s", doc), nil 123 | } 124 | } 125 | 126 | return "", nil 127 | } 128 | 129 | // IsFileDOCX checks if the data indicates a DOCX file 130 | // DOCX has a signature of 50 4B 03 04 131 | func IsFileDOCX(data []byte) bool { 132 | return bytes.HasPrefix(data, []byte{0x50, 0x4B, 0x03, 0x04}) 133 | } 134 | -------------------------------------------------------------------------------- /Decompress.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: 
Decompress.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import ( 10 | "archive/tar" 11 | "archive/zip" 12 | "bytes" 13 | "compress/bzip2" 14 | "compress/gzip" 15 | "io" 16 | "io/ioutil" 17 | "time" 18 | 19 | "github.com/nwaples/rardecode" 20 | "github.com/saracen/go7z" 21 | "github.com/ulikunitz/xz" 22 | ) 23 | 24 | // DecompressFile decompresses data. It supports: GZ, BZ, BZ2, XZ 25 | func DecompressFile(data []byte) (decompressed []byte, valid bool) { 26 | // Try GZ 27 | if gr, err := gzip.NewReader(bytes.NewBuffer(data)); err == nil { 28 | defer gr.Close() 29 | decompressed, err = ioutil.ReadAll(gr) 30 | if err == nil { 31 | return decompressed, true 32 | } 33 | } 34 | 35 | // BZ, BZ2 36 | br := bzip2.NewReader(bytes.NewBuffer(data)) 37 | decompressed, err := ioutil.ReadAll(br) 38 | if err == nil { 39 | return decompressed, true 40 | } 41 | 42 | // XZ 43 | if xr, err := xz.NewReader(bytes.NewBuffer(data)); err == nil { 44 | decompressed, err = ioutil.ReadAll(xr) 45 | if err == nil { 46 | return decompressed, true 47 | } 48 | } 49 | 50 | return nil, false 51 | } 52 | 53 | // ContainerExtractFiles extracts files from supported containers: ZIP, RAR, 7Z, TAR 54 | func ContainerExtractFiles(data []byte, callback func(name string, size int64, date time.Time, data []byte)) { 55 | 56 | // ZIP 57 | if r, err := zip.NewReader(bytes.NewReader(data), int64(len(data))); err == nil { 58 | for _, f := range r.File { 59 | fileReader, err := f.Open() 60 | if err != nil { 61 | continue 62 | } 63 | 64 | data2, err := ioutil.ReadAll(fileReader) 65 | fileReader.Close() 66 | if err != nil { 67 | // If the file is encrypted with a password, this fails with error "4" here. 
68 | continue 69 | } 70 | 71 | callback(f.Name, int64(f.UncompressedSize64), f.Modified, data2) 72 | } 73 | 74 | return 75 | } 76 | 77 | // RAR 78 | if rc, err := rardecode.NewReader(bytes.NewReader(data), ""); err == nil { 79 | for { 80 | hdr, err := rc.Next() 81 | if err == io.EOF || err != nil { // break if end of archive or other error returned 82 | break 83 | } else if err == nil && !hdr.IsDir { 84 | if data2, err := ioutil.ReadAll(rc); err == nil { 85 | callback(hdr.Name, hdr.UnPackedSize, hdr.CreationTime, data2) 86 | } 87 | } 88 | } 89 | } 90 | 91 | // 7Z 92 | if sz, err := go7z.NewReader(bytes.NewReader(data), int64(len(data))); err == nil { 93 | for { 94 | hdr, err := sz.Next() 95 | if err == io.EOF || err != nil { // break if end of archive or other error returned 96 | break // End of archive 97 | } else if err == nil && !hdr.IsEmptyFile { 98 | if data2, err := ioutil.ReadAll(sz); err == nil { 99 | callback(hdr.Name, int64(len(data2)), hdr.CreatedAt, data2) 100 | } 101 | } 102 | } 103 | } else if err == go7z.ErrDecompressorNotFound { 104 | // May happen if it's 7Z, but decompressor not available (like 7zAES). 105 | return 106 | } 107 | 108 | // TAR 109 | tr := tar.NewReader(bytes.NewReader(data)) 110 | // Iterate through the files in the archive. 
111 | for { 112 | hdr, err := tr.Next() 113 | if err == io.EOF { 114 | // end of tar archive 115 | break 116 | } 117 | if err != nil { 118 | // other error 119 | break 120 | } 121 | switch hdr.Typeflag { 122 | case tar.TypeDir: 123 | // directories are ignored 124 | case tar.TypeReg, tar.TypeRegA: 125 | // file 126 | data2, err := ioutil.ReadAll(tr) 127 | if err != nil { 128 | continue 129 | } 130 | 131 | callback(hdr.Name, hdr.Size, hdr.ModTime, data2) 132 | } 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /EPUB 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: EPUB 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | EPUB files are ZIP based and contain the content as HTML files. 7 | 8 | Tested but did not work: 9 | * https://github.com/n3integration/epub could not read 2 sample files. Also no NewReader function available. 10 | 11 | This one was tested and works: 12 | * https://github.com/taylorskalyo/goreader/tree/master/epub 13 | 14 | Sample files via https://github.com/IDPF/epub3-samples/releases. 15 | */ 16 | 17 | package fileconversion 18 | 19 | import ( 20 | "io" 21 | 22 | "github.com/taylorskalyo/goreader/epub" 23 | ) 24 | 25 | // EPUB2Text converts an EPUB ebook to text 26 | func EPUB2Text(file io.ReaderAt, size int64, limit int64) (string, error) { 27 | text := "" 28 | 29 | rc, err := epub.NewReader(file, size) 30 | if err != nil { 31 | return "", nil 32 | } 33 | 34 | // The rootfile (content.opf) lists all of the contents of an epub file. 35 | // There may be multiple rootfiles, although typically there is only one. 36 | book := rc.Rootfiles[0] 37 | 38 | // Print book title. 39 | title := "Title: " + book.Title + "\n\n" 40 | limit -= int64(len(title)) 41 | if limit <= 0 { 42 | return title, nil 43 | } 44 | 45 | // List the IDs of files in the book's spine. 
46 | for _, item := range book.Spine.Itemrefs { 47 | // item.ID was observed to be in one book: cover,titlepage,brief-toc,xpreface_001,xintroduction_001,xepigraph_001,xchapter_001 48 | reader2, err := item.Open() 49 | if err != nil { 50 | continue 51 | } 52 | 53 | itemText, _ := HTML2Text(reader2) 54 | 55 | // check max length 56 | if limit <= int64(len(itemText)) { 57 | itemText = itemText[:limit] 58 | return title + text, nil 59 | } 60 | 61 | text += itemText 62 | limit -= int64(len(itemText)) 63 | } 64 | 65 | if text == "" { 66 | return "", nil 67 | } 68 | 69 | return title + text, nil 70 | } 71 | -------------------------------------------------------------------------------- /HTML 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: HTML 2 Text.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import ( 10 | "io" 11 | "net/url" 12 | "path" 13 | "strings" 14 | 15 | "github.com/IntelligenceX/fileconversion/html2text" 16 | "github.com/PuerkitoBio/goquery" 17 | "github.com/ssor/bom" 18 | "golang.org/x/net/html" 19 | "golang.org/x/net/html/charset" 20 | ) 21 | 22 | // HTML2Text extracts the text from the HTML 23 | func HTML2Text(reader io.Reader) (pageText string, err error) { 24 | // The charset.NewReader ensures that foreign encodings are properly decoded to UTF-8. 25 | // It will make both heuristic checks as well as look for the HTML meta charset tag. 26 | reader, err = charset.NewReader(reader, "") 27 | if err != nil { 28 | return "", err 29 | } 30 | 31 | // The html2text is a forked improved version that converts HTML to human-friendly text. 32 | return html2text.FromReader(reader) 33 | } 34 | 35 | // HTML2TextAndLinks extracts the text from the HTML and all links from and tags of a HTML 36 | // If the base URL is provided, relative links will be converted to absolute ones. 
37 | func HTML2TextAndLinks(reader io.Reader, baseURL string) (pageText string, links []string, err error) { 38 | // The charset.NewReader ensures that foreign encodings are properly decoded to UTF-8. 39 | // It will make both heuristic checks as well as look for the HTML meta charset tag. 40 | reader, err = charset.NewReader(reader, "") 41 | if err != nil { 42 | return "", nil, err 43 | } 44 | 45 | // code from html2text.FromReader to parse the doc 46 | newReader, err := bom.NewReaderWithoutBom(reader) 47 | if err != nil { 48 | return "", nil, err 49 | } 50 | doc, err := html.Parse(newReader) 51 | if err != nil { 52 | return "", nil, err 53 | } 54 | 55 | // get the text 56 | pageText, err = html2text.FromHTMLNode(doc) 57 | if err != nil { 58 | return pageText, nil, err 59 | } 60 | 61 | // get the links 62 | docQ := goquery.NewDocumentFromNode(doc) 63 | docQ.Url, _ = url.Parse(baseURL) 64 | links = processLinks(docQ) 65 | 66 | return pageText, links, err 67 | } 68 | 69 | // ---- below 2 functions are forks from gocrawl/worker.go ---- 70 | 71 | func handleBaseTag(root *url.URL, baseHref string, aHref string) string { 72 | resolvedBase, err := root.Parse(baseHref) 73 | if err != nil { 74 | return "" 75 | } 76 | 77 | parsedURL, err := url.Parse(aHref) 78 | if err != nil { 79 | return "" 80 | } 81 | // If a[href] starts with a /, it overrides the base[href] 82 | if parsedURL.Host == "" && !strings.HasPrefix(aHref, "/") { 83 | aHref = path.Join(resolvedBase.Path, aHref) 84 | } 85 | 86 | resolvedURL, err := resolvedBase.Parse(aHref) 87 | if err != nil { 88 | return "" 89 | } 90 | return resolvedURL.String() 91 | } 92 | 93 | // Scrape the document's content to gather all links 94 | func processLinks(doc *goquery.Document) (result []string) { 95 | // process links via tags 96 | baseURL, _ := doc.Find("base[href]").Attr("href") 97 | urls := doc.Find("a[href]").Map(func(_ int, s *goquery.Selection) string { 98 | val, _ := s.Attr("href") 99 | if baseURL != "" { 100 | val = 
handleBaseTag(doc.Url, baseURL, val) 101 | } 102 | return val 103 | }) 104 | 105 | // all image references via tag 106 | imgURLs := doc.Find("img[src]").Map(func(_ int, s *goquery.Selection) string { 107 | val, _ := s.Attr("src") 108 | if baseURL != "" { 109 | val = handleBaseTag(doc.Url, baseURL, val) 110 | } 111 | return val 112 | }) 113 | urls = append(urls, imgURLs...) 114 | 115 | // form submission links
116 | formURLs := doc.Find("form[action]").Map(func(_ int, s *goquery.Selection) string { 117 | val, _ := s.Attr("action") 118 | if baseURL != "" { 119 | val = handleBaseTag(doc.Url, baseURL, val) 120 | } 121 | return val 122 | }) 123 | urls = append(urls, formURLs...) 124 | 125 | // parse all found URLs 126 | for _, s := range urls { 127 | // If href starts with "#", then it points to this same exact URL, ignore (will fail to parse anyway) 128 | if len(s) > 0 && !strings.HasPrefix(s, "#") { 129 | if parsed, e := url.Parse(s); e == nil { 130 | parsed = doc.Url.ResolveReference(parsed) 131 | 132 | result = append(result, parsed.String()) 133 | //fmt.Printf("%s\n", parsed.String()) 134 | } else { 135 | //w.logFunc(LogIgnored, "ignore on unparsable policy %s: %s", s, e.Error()) 136 | } 137 | } 138 | } 139 | return 140 | } 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <https://unlicense.org>
--------------------------------------------------------------------------------
/MBOX.go:
--------------------------------------------------------------------------------
/*
File Name: MBOX.go
Copyright: 2019 Kleissner Investments s.r.o.
Author: Peter Kleissner

Support for email files in the MBOX format.
*/

package fileconversion

// Check out https://github.com/blabber/mbox ????
--------------------------------------------------------------------------------
/MOBI 2 Text.go:
--------------------------------------------------------------------------------
/*
File Name: MOBI 2 Text.go
Copyright: 2019 Kleissner Investments s.r.o.
Author: Peter Kleissner

Mobi files use HTML tags.

Did not work:
* https://github.com/766b/mobi is only a writer and does not have a useful reader
* https://github.com/peterbn/mobi a fork of above one.

Works:
* https://github.com/neofight/mobi code basically works, just an in-memory open function had to be forked.
14 | 15 | */ 16 | 17 | package fileconversion 18 | 19 | import ( 20 | "bytes" 21 | "encoding/binary" 22 | "errors" 23 | "fmt" 24 | "io" 25 | "strconv" 26 | "strings" 27 | "unicode/utf8" 28 | 29 | html "github.com/levigross/exp-html" 30 | "github.com/neofight/mobi/convert" 31 | "github.com/neofight/mobi/headers" 32 | ) 33 | 34 | // Mobi2Text converts a MOBI ebook to text 35 | func Mobi2Text(file io.ReadSeeker) (string, error) { 36 | 37 | book, _ := mobiOpen(file) 38 | markupText, _ := book.Markup() 39 | 40 | text, _ := HTML2Text(strings.NewReader(markupText)) 41 | 42 | return text, nil 43 | } 44 | 45 | // below code is forked from https://github.com/neofight/mobi MOBIFile.go 46 | 47 | type mobiBook struct { 48 | file io.ReadSeeker 49 | pdbHeader *headers.PDB 50 | palmDOCHeader *headers.PalmDOC 51 | mobiHeader *headers.MOBI 52 | exthHeader *headers.EXTH 53 | } 54 | 55 | func mobiOpen(file io.ReadSeeker) (*mobiBook, error) { 56 | 57 | var book mobiBook 58 | 59 | var err error 60 | 61 | book.file = file 62 | book.pdbHeader, err = headers.ReadPDB(book.file) 63 | 64 | if err != nil { 65 | return nil, fmt.Errorf("unable to read PDB header: %v", err) 66 | } 67 | 68 | book.palmDOCHeader, err = headers.ReadPalmDOC(book.file) 69 | 70 | if err != nil { 71 | return nil, fmt.Errorf("unable to read PalmDOC header: %v", err) 72 | } 73 | 74 | book.mobiHeader, err = headers.ReadMOBI(book.file) 75 | 76 | if err != nil { 77 | return nil, fmt.Errorf("unable to read MOBI header: %v", err) 78 | } 79 | 80 | if book.mobiHeader.EXTHHeaderPresent { 81 | 82 | book.exthHeader, err = headers.ReadEXTH(book.file) 83 | 84 | if err != nil { 85 | return nil, fmt.Errorf("unable to read EXTH header: %v", err) 86 | } 87 | } 88 | 89 | return &book, nil 90 | } 91 | 92 | func (mobiFile mobiBook) Cover() ([]byte, error) { 93 | 94 | for _, r := range mobiFile.exthHeader.Records { 95 | 96 | if r.RecordType == 201 { 97 | coverIndex := mobiFile.mobiHeader.FirstImageIndex + convert.FromUint32(r.RecordData) 98 | 
99 | record := mobiFile.pdbHeader.Records[coverIndex] 100 | nextRecord := mobiFile.pdbHeader.Records[coverIndex+1] 101 | 102 | coverOffset := record.RecordDataOffset 103 | coverSize := nextRecord.RecordDataOffset - coverOffset 104 | 105 | _, err := mobiFile.file.Seek(int64(coverOffset), 0) 106 | 107 | if err != nil { 108 | return nil, fmt.Errorf("unable to find cover: %v", err) 109 | } 110 | 111 | cover := make([]byte, coverSize) 112 | 113 | err = binary.Read(mobiFile.file, binary.BigEndian, &cover) 114 | 115 | if err != nil { 116 | return nil, fmt.Errorf("unable to read cover: %v", err) 117 | } 118 | 119 | return cover, nil 120 | } 121 | } 122 | 123 | return nil, nil 124 | } 125 | 126 | func (mobiFile mobiBook) Markup() (string, error) { 127 | 128 | startIndex := mobiFile.mobiHeader.FirstContentIndex 129 | endIndex := mobiFile.mobiHeader.FirstNonBookIndex - 1 130 | 131 | if endIndex > len(mobiFile.pdbHeader.Records)-2 { 132 | endIndex = len(mobiFile.pdbHeader.Records) - 2 133 | } 134 | 135 | if endIndex < 0 || startIndex < 0 || startIndex >= len(mobiFile.pdbHeader.Records) { 136 | return "", fmt.Errorf("Invalid header") 137 | } 138 | 139 | text := make([]byte, 0) 140 | 141 | for index := startIndex; index <= endIndex; index++ { 142 | 143 | record := mobiFile.pdbHeader.Records[index] 144 | nextRecord := mobiFile.pdbHeader.Records[index+1] 145 | 146 | recordOffset := record.RecordDataOffset 147 | recordSize := nextRecord.RecordDataOffset - recordOffset 148 | 149 | _, err := mobiFile.file.Seek(int64(recordOffset), 0) 150 | 151 | if err != nil { 152 | return "", fmt.Errorf("unable to find text: %v", err) 153 | } 154 | 155 | recordData := make([]byte, recordSize) 156 | 157 | err = binary.Read(mobiFile.file, binary.BigEndian, &recordData) 158 | 159 | if err != nil { 160 | return "", fmt.Errorf("unable to read text: %v", err) 161 | } 162 | 163 | recordText := fromLZ77(recordData) 164 | 165 | text = append(text, recordText...) 
166 | } 167 | 168 | text = text[:mobiFile.palmDOCHeader.TextLength] 169 | 170 | if !utf8.Valid(text) { 171 | return "", errors.New("unable to decompress text") 172 | } 173 | 174 | return string(text), nil 175 | } 176 | 177 | func (mobiFile mobiBook) Text() (string, error) { 178 | 179 | markup, err := mobiFile.Markup() 180 | 181 | if err != nil { 182 | return "", fmt.Errorf("unable to read markup: %v", err) 183 | } 184 | 185 | pos, err := getTOCPosition(markup) 186 | 187 | if err != nil { 188 | return "", fmt.Errorf("unable to locate TOC: %v", err) 189 | } 190 | 191 | bookmarks, err := parseTOC(markup[pos:]) 192 | 193 | text := make([]string, 0) 194 | 195 | for i := range bookmarks { 196 | 197 | start := bookmarks[i] 198 | var end int 199 | 200 | if i < len(bookmarks)-1 { 201 | end = bookmarks[i+1] 202 | } else { 203 | end = pos 204 | } 205 | 206 | paragraphs, err := parseChapter(markup[start:end]) 207 | 208 | if err != nil { 209 | return "", fmt.Errorf("unable to parse chapter: %v", err) 210 | } 211 | 212 | text = append(text, paragraphs...) 
213 | } 214 | 215 | return strings.Join(text, "\n\n"), nil 216 | } 217 | 218 | func getTOCPosition(markup string) (int, error) { 219 | 220 | htmlReader := strings.NewReader(markup) 221 | 222 | tokenizer := html.NewTokenizer(htmlReader) 223 | 224 | for { 225 | tokenType := tokenizer.Next() 226 | 227 | switch { 228 | case tokenType == html.ErrorToken: 229 | return 0, fmt.Errorf("unable to find reference element") 230 | case tokenType == html.SelfClosingTagToken: 231 | token := tokenizer.Token() 232 | 233 | if token.Data == "reference" { 234 | filepos, err := attr(token, "filepos") 235 | 236 | if err != nil { 237 | return 0, errors.New("filepos attribute missing") 238 | } 239 | 240 | pos, err := strconv.Atoi(filepos) 241 | 242 | if err != nil { 243 | return 0, errors.New("filepos attribute invalid") 244 | } 245 | 246 | return pos, nil 247 | } 248 | } 249 | } 250 | } 251 | 252 | func parseTOC(markup string) ([]int, error) { 253 | 254 | toc := make([]int, 0) 255 | 256 | htmlReader := strings.NewReader(markup) 257 | 258 | tokenizer := html.NewTokenizer(htmlReader) 259 | 260 | for { 261 | tokenType := tokenizer.Next() 262 | 263 | switch { 264 | case tokenType == html.ErrorToken: 265 | return toc[1:], nil 266 | case tokenType == html.StartTagToken: 267 | token := tokenizer.Token() 268 | 269 | if token.Data == "a" { 270 | filepos, err := attr(token, "filepos") 271 | 272 | if err != nil { 273 | continue 274 | } 275 | 276 | pos, err := strconv.Atoi(filepos) 277 | 278 | if err != nil { 279 | return nil, errors.New("filepos attribute invalid") 280 | } 281 | 282 | toc = append(toc, pos) 283 | } 284 | } 285 | } 286 | } 287 | 288 | func parseChapter(markup string) ([]string, error) { 289 | 290 | paragraphs := make([]string, 0) 291 | 292 | htmlReader := strings.NewReader(markup) 293 | 294 | tokenizer := html.NewTokenizer(htmlReader) 295 | 296 | for { 297 | tokenType := tokenizer.Next() 298 | 299 | switch { 300 | case tokenType == html.ErrorToken: 301 | return paragraphs, nil 302 | 
case tokenType == html.TextToken: 303 | token := tokenizer.Token() 304 | 305 | if len(strings.TrimSpace(token.Data)) > 0 { 306 | paragraphs = append(paragraphs, strings.TrimSpace(token.Data)) 307 | } 308 | } 309 | } 310 | } 311 | 312 | func attr(t html.Token, name string) (string, error) { 313 | for _, a := range t.Attr { 314 | if a.Key == name { 315 | return a.Val, nil 316 | } 317 | } 318 | 319 | return "", fmt.Errorf("attribute %v not found", name) 320 | } 321 | 322 | // fromLZ77 is forked from conversion.go because of index out of range panic 323 | func fromLZ77(text []byte) []byte { 324 | 325 | var reader = bytes.NewReader(text) 326 | 327 | var buffer [4096]byte 328 | var pos int 329 | 330 | for { 331 | if pos == 4096 { 332 | break 333 | } 334 | 335 | c, err := reader.ReadByte() 336 | 337 | if err == io.EOF { 338 | break 339 | } 340 | 341 | switch { 342 | 343 | // 0x00: "1 literal" copy that byte unmodified to the decompressed stream. 344 | case c == 0x00: 345 | buffer[pos] = c 346 | pos++ 347 | 348 | // 0x09 to 0x7f: "1 literal" copy that byte unmodified to the decompressed stream. 349 | case c >= 0x09 && c <= 0x7f: 350 | buffer[pos] = c 351 | pos++ 352 | 353 | // 0x01 to 0x08: "literals": the byte is interpreted as a count from 1 to 8, and that many literals are copied 354 | // unmodified from the compressed stream to the decompressed stream. 355 | case c >= 0x01 && c <= 0x08: 356 | length := int(c) 357 | for i := 0; i < length; i++ { 358 | c, err = reader.ReadByte() 359 | buffer[pos] = c 360 | pos++ 361 | } 362 | 363 | // 0x80 to 0xbf: "length, distance" pair: the 2 leftmost bits of this byte ('10') are discarded, and the 364 | // following 6 bits are combined with the 8 bits of the next byte to make a 14 bit "distance, length" item. 365 | // Those 14 bits are broken into 11 bits of distance backwards from the current location in the uncompressed 366 | // text, and 3 bits of length to copy from that point (copying n+3 bytes, 3 to 10 bytes). 
367 | case c >= 0x80 && c <= 0xbf: 368 | c2, _ := reader.ReadByte() 369 | 370 | distance := (int(c&0x3F)<<8 | int(c2)) >> 3 371 | length := int(c2&0x07) + 3 372 | 373 | start := pos - distance 374 | 375 | for i := 0; i < length; i++ { 376 | // check if index is in range 377 | if start+i >= len(buffer) || start+i < 0 { 378 | return buffer[:pos] 379 | } 380 | 381 | c = buffer[start+i] 382 | buffer[pos] = c 383 | pos++ 384 | } 385 | 386 | // 0xc0 to 0xff: "byte pair": this byte is decoded into 2 characters: a space character, and a letter formed 387 | // from this byte XORed with 0x80. 388 | case c >= 0xc0: 389 | buffer[pos] = ' ' 390 | pos++ 391 | buffer[pos] = c ^ 0x80 392 | pos++ 393 | } 394 | } 395 | 396 | return buffer[:pos] 397 | } 398 | 399 | // IsFileMOBI checks if the data indicates a MOBI file 400 | func IsFileMOBI(data []byte) bool { 401 | // Mobi files have a header and there is the signature "BOOKMOBI" or "TEXtREAd". 402 | // There are many more more potential signatures https://sno.phy.queensu.ca/~phil/exiftool/TagNames/Palm.html 403 | 404 | // Fork from code here http://will.tip.dhappy.org/lib/calibre/dedrm/mobidedrm.py 405 | // if self.header[0x3C:0x3C+8] != 'BOOKMOBI' and self.header[0x3C:0x3C+8] != 'TEXtREAd': 406 | // raise DrmException(u"Invalid file format") 407 | 408 | if len(data) < 0x3C+8 { 409 | return false 410 | } 411 | 412 | signature := data[0x3C : 0x3C+8] 413 | 414 | return bytes.Equal(signature, []byte("BOOKMOBI")) || bytes.Equal(signature, []byte("TEXtREAd")) 415 | } 416 | -------------------------------------------------------------------------------- /ODS 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: ODS 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | Code for parsing Open Document Spreadsheet files. ZIP-compressed XML-based file format. 
7 | */ 8 | 9 | package fileconversion 10 | 11 | import ( 12 | "io" 13 | 14 | "github.com/IntelligenceX/fileconversion/odf/ods" 15 | ) 16 | 17 | // ODS2Text extracts text of an OpenDocument Spreadsheet 18 | // Size is the full size of the input file. 19 | func ODS2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error) { 20 | 21 | var doc ods.Doc 22 | 23 | f, err := ods.NewReader(file, size) 24 | if err != nil { 25 | return 0, err 26 | } 27 | defer f.Close() 28 | if err := f.ParseContent(&doc); err != nil { 29 | return 0, err 30 | } 31 | 32 | for n, sheet := range doc.Table { 33 | rows := sheet.Strings() 34 | if err = writeOutput(writer, []byte(xlGenerateSheetTitle(sheet.Name, n, int(len(rows)))), &written, &limit); err != nil || limit == 0 { 35 | return written, err 36 | } 37 | 38 | for _, row := range rows { 39 | 40 | rowText := "" 41 | 42 | // go through all columns 43 | for m, text := range row { 44 | if text != "" { 45 | text = cleanCell(text) 46 | 47 | if m > 0 { 48 | rowText += ", " 49 | } 50 | rowText += text 51 | } 52 | } 53 | 54 | rowText += "\n" 55 | 56 | if err = writeOutput(writer, []byte(rowText), &written, &limit); err != nil || limit == 0 { 57 | return written, err 58 | } 59 | } 60 | } 61 | 62 | return written, nil 63 | } 64 | 65 | // ODS2Cells converts an ODS file to individual cells 66 | // Size is the full size of the input file. 
67 | func ODS2Cells(file io.ReaderAt, size int64) (cells []string, err error) { 68 | 69 | var doc ods.Doc 70 | 71 | f, err := ods.NewReader(file, size) 72 | if err != nil { 73 | return nil, err 74 | } 75 | defer f.Close() 76 | if err := f.ParseContent(&doc); err != nil { 77 | return nil, err 78 | } 79 | 80 | for _, sheet := range doc.Table { 81 | for _, row := range sheet.Strings() { 82 | for _, text := range row { 83 | if text != "" { 84 | text = cleanCell(text) 85 | cells = append(cells, text) 86 | } 87 | } 88 | } 89 | } 90 | 91 | return 92 | } 93 | -------------------------------------------------------------------------------- /ODT 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: ODT 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | Fork from https://github.com/lu4p/cat/blob/master/odtxt/odtreader.go. 7 | The extract discards any formatting. The output is one large string without new-lines at the current time. 8 | */ 9 | 10 | package fileconversion 11 | 12 | import ( 13 | "archive/zip" 14 | "errors" 15 | "io" 16 | "io/ioutil" 17 | 18 | "github.com/IntelligenceX/fileconversion/html2text" 19 | ) 20 | 21 | // ODT2Text extracts text of an OpenDocument Text file 22 | // Size is the full size of the input file. 
23 | func ODT2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error) { 24 | f, err := odtNewReader(file, size) 25 | if err != nil { 26 | return 0, err 27 | } 28 | 29 | text, err := f.GetTxt() 30 | if err != nil { 31 | return 0, err 32 | } 33 | 34 | err = writeOutput(writer, []byte(text), &written, &limit) 35 | 36 | return 37 | } 38 | 39 | //odt zip struct 40 | type odt struct { 41 | zipFileReader *zip.Reader 42 | Files []*zip.File 43 | FilesContent map[string][]byte 44 | Content string 45 | } 46 | 47 | func odtNewReader(file io.ReaderAt, size int64) (*odt, error) { 48 | reader, err := zip.NewReader(file, size) 49 | if err != nil { 50 | return nil, err 51 | } 52 | 53 | odtDoc := odt{ 54 | zipFileReader: reader, 55 | Files: reader.File, 56 | FilesContent: map[string][]byte{}, 57 | } 58 | 59 | for _, f := range odtDoc.Files { 60 | contents, _ := odtDoc.retrieveFileContents(f.Name) 61 | odtDoc.FilesContent[f.Name] = contents 62 | } 63 | 64 | return &odtDoc, nil 65 | } 66 | 67 | //Read all files contents 68 | func (d *odt) retrieveFileContents(filename string) ([]byte, error) { 69 | var file *zip.File 70 | for _, f := range d.Files { 71 | if f.Name == filename { 72 | file = f 73 | break 74 | } 75 | } 76 | 77 | if file == nil { 78 | return nil, errors.New(filename + " file not found") 79 | } 80 | 81 | reader, err := file.Open() 82 | if err != nil { 83 | return nil, err 84 | } 85 | return ioutil.ReadAll(reader) 86 | } 87 | 88 | func (d *odt) GetTxt() (content string, err error) { 89 | xmlData := d.FilesContent["content.xml"] 90 | return xml2Text(xmlData) 91 | //content, err = d.listP(xmlData) 92 | } 93 | 94 | /* 95 | // listP for w:p tag value 96 | func (d *odt) listP(data []byte) (string, error) { 97 | v := new(odtQuery) 98 | err := xml.Unmarshal(data, &v) 99 | if err != nil { 100 | return "", err 101 | } 102 | var result string 103 | for _, text := range v.Body.Text { 104 | for _, line := range text.P { 105 | if line == "" { 106 | 
continue 107 | } 108 | result += line + "\n" 109 | } 110 | } 111 | return result, nil 112 | } 113 | 114 | type odtQuery struct { 115 | XMLName xml.Name `xml:"document-content"` 116 | Body odtBody `xml:"body"` 117 | } 118 | type odtBody struct { 119 | Text []odtText `xml:"text"` 120 | } 121 | type odtText struct { 122 | P []string `xml:"p"` 123 | } 124 | */ 125 | 126 | // xml2Text extracts any text from XML data. 127 | // Note that any formatting will be lost. The output is one large string without new-lines. 128 | func xml2Text(data []byte) (string, error) { 129 | return html2text.FromString(string(data)) 130 | } 131 | -------------------------------------------------------------------------------- /PDF 2 Image.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: PDF 2 Image.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import ( 10 | "image" 11 | "io" 12 | "strconv" 13 | 14 | pdfcontent "github.com/unidoc/unipdf/contentstream" 15 | pdfcore "github.com/unidoc/unipdf/core" 16 | pdf "github.com/unidoc/unipdf/model" 17 | ) 18 | 19 | var xObjectImages = 0 20 | var inlineImages = 0 21 | 22 | // ImageResult contains an extracted image 23 | type ImageResult struct { 24 | Image image.Image 25 | Name string 26 | } 27 | 28 | // PDFExtractImages extracts all images from a PDF file 29 | func PDFExtractImages(input io.ReadSeeker) (images []ImageResult, err error) { 30 | 31 | pdfReader, err := pdf.NewPdfReader(input) 32 | if err != nil { 33 | return nil, err 34 | } 35 | 36 | isEncrypted, err := pdfReader.IsEncrypted() 37 | if err != nil { 38 | return nil, err 39 | } 40 | 41 | // Try decrypting with an empty one. 42 | if isEncrypted { 43 | auth, err := pdfReader.Decrypt([]byte("")) 44 | if err != nil { 45 | // Encrypted and we cannot do anything about it. 
46 | return nil, err 47 | } 48 | if !auth { 49 | //fmt.Println("Need to decrypt with password") 50 | return nil, nil 51 | } 52 | } 53 | 54 | numPages, err := pdfReader.GetNumPages() 55 | if err != nil { 56 | return nil, err 57 | } 58 | //fmt.Printf("PDF Num Pages: %d\n", numPages) 59 | 60 | for i := 0; i < numPages; i++ { 61 | //fmt.Printf("-----\nPage %d:\n", i+1) 62 | 63 | page, err := pdfReader.GetPage(i + 1) 64 | if err != nil { 65 | return nil, err 66 | } 67 | 68 | // List images on the page. 69 | rgbImages, err := extractImagesOnPage(page) 70 | if err != nil { 71 | return nil, err 72 | } 73 | _ = rgbImages 74 | 75 | for idx, img := range rgbImages { 76 | fname := "p" + strconv.Itoa(i+1) + "_" + strconv.Itoa(idx) + ".jpg" 77 | 78 | gimg, err := img.ToGoImage() 79 | if err != nil { 80 | return nil, err 81 | } 82 | 83 | images = append(images, ImageResult{Image: gimg, Name: fname}) 84 | } 85 | } 86 | 87 | return images, nil 88 | } 89 | 90 | func extractImagesOnPage(page *pdf.PdfPage) ([]*pdf.Image, error) { 91 | contents, err := page.GetAllContentStreams() 92 | if err != nil { 93 | return nil, err 94 | } 95 | 96 | return extractImagesInContentStream(contents, page.Resources) 97 | } 98 | 99 | func extractImagesInContentStream(contents string, resources *pdf.PdfPageResources) ([]*pdf.Image, error) { 100 | rgbImages := []*pdf.Image{} 101 | cstreamParser := pdfcontent.NewContentStreamParser(contents) 102 | operations, err := cstreamParser.Parse() 103 | if err != nil { 104 | return nil, err 105 | } 106 | 107 | processedXObjects := map[string]bool{} 108 | 109 | // Range through all the content stream operations. 110 | for _, op := range *operations { 111 | if op.Operand == "BI" && len(op.Params) == 1 { 112 | // BI: Inline image. 
113 | 114 | iimg, ok := op.Params[0].(*pdfcontent.ContentStreamInlineImage) 115 | if !ok { 116 | continue 117 | } 118 | 119 | img, err := iimg.ToImage(resources) 120 | if err != nil { 121 | return nil, err 122 | } 123 | 124 | cs, err := iimg.GetColorSpace(resources) 125 | if err != nil { 126 | return nil, err 127 | } 128 | if cs == nil { 129 | // Default if not specified? 130 | cs = pdf.NewPdfColorspaceDeviceGray() 131 | } 132 | //fmt.Printf("Cs: %T\n", cs) 133 | 134 | rgbImg, err := cs.ImageToRGB(*img) 135 | if err != nil { 136 | return nil, err 137 | } 138 | 139 | rgbImages = append(rgbImages, &rgbImg) 140 | inlineImages++ 141 | } else if op.Operand == "Do" && len(op.Params) == 1 { 142 | // Do: XObject. 143 | name := op.Params[0].(*pdfcore.PdfObjectName) 144 | 145 | // Only process each one once. 146 | _, has := processedXObjects[string(*name)] 147 | if has { 148 | continue 149 | } 150 | processedXObjects[string(*name)] = true 151 | 152 | _, xtype := resources.GetXObjectByName(*name) 153 | if xtype == pdf.XObjectTypeImage { 154 | //fmt.Printf(" XObject Image: %s\n", *name) 155 | 156 | ximg, err := resources.GetXObjectImageByName(*name) 157 | if err != nil { 158 | return nil, err 159 | } 160 | 161 | img, err := ximg.ToImage() 162 | if err != nil { 163 | return nil, err 164 | } 165 | 166 | rgbImg, err := ximg.ColorSpace.ImageToRGB(*img) 167 | if err != nil { 168 | return nil, err 169 | } 170 | rgbImages = append(rgbImages, &rgbImg) 171 | xObjectImages++ 172 | } else if xtype == pdf.XObjectTypeForm { 173 | // Go through the XObject Form content stream. 
174 | xform, err := resources.GetXObjectFormByName(*name) 175 | if err != nil { 176 | return nil, err 177 | } 178 | 179 | formContent, err := xform.GetContentStream() 180 | if err != nil { 181 | return nil, err 182 | } 183 | 184 | // Process the content stream in the Form object too: 185 | formResources := xform.Resources 186 | if formResources == nil { 187 | formResources = resources 188 | } 189 | 190 | // Process the content stream in the Form object too: 191 | formRgbImages, err := extractImagesInContentStream(string(formContent), formResources) 192 | if err != nil { 193 | return nil, err 194 | } 195 | rgbImages = append(rgbImages, formRgbImages...) 196 | } 197 | } 198 | } 199 | 200 | return rgbImages, nil 201 | } 202 | -------------------------------------------------------------------------------- /PDF 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: PDF 2 Text.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | This code uses the commercial library UniDoc https://unidoc.io/ to extract text from PDFs. 7 | */ 8 | 9 | package fileconversion 10 | 11 | import ( 12 | "io" 13 | "strconv" 14 | "strings" 15 | "time" 16 | 17 | "github.com/unidoc/unipdf/core" 18 | "github.com/unidoc/unipdf/extractor" 19 | pdf "github.com/unidoc/unipdf/model" 20 | 21 | "github.com/unidoc/unipdf/common/license" 22 | ) 23 | 24 | // InitPDFLicense initializes the PDF license 25 | func InitPDFLicense(key, name string) { 26 | // load the unidoc license (v3) 27 | license.SetLicenseKey(key, name) 28 | } 29 | 30 | // PDFListContentStreams writes all text streams in a PDF to the writer 31 | // It returns the number of characters attempted written (excluding "Page N" and new-lines) and an error, if any. It can be used to determine whether any text was extracted. 32 | // The parameter size is the max amount of bytes (not characters) to write out. 
33 | func PDFListContentStreams(f io.ReadSeeker, w io.Writer, size int64) (written int64, err error) { 34 | 35 | pdfReader, err := pdf.NewPdfReader(f) 36 | if err != nil { 37 | return 0, err 38 | } 39 | 40 | isEncrypted, err := pdfReader.IsEncrypted() 41 | if err != nil { 42 | return 0, err 43 | } 44 | 45 | if isEncrypted { 46 | _, err = pdfReader.Decrypt([]byte("")) 47 | if err != nil { 48 | return 0, err 49 | } 50 | } 51 | 52 | numPages, err := pdfReader.GetNumPages() 53 | if err != nil { 54 | return 0, err 55 | } 56 | 57 | for i := 0; i < numPages && size > 0; i++ { 58 | pageNum := i + 1 59 | 60 | page, err := pdfReader.GetPage(pageNum) 61 | if err != nil { 62 | return written, err 63 | } 64 | 65 | ex, err := extractor.New(page) 66 | if err != nil { 67 | return written, err 68 | } 69 | 70 | txt, err := ex.ExtractText() 71 | if err != nil { 72 | return written, err 73 | } 74 | 75 | // use the extracted text 76 | txtNL := "" 77 | if written > 0 { 78 | txtNL += "\n\n" 79 | } 80 | 81 | textB := []byte(txtNL + "---- Page " + strconv.Itoa(pageNum) + " ----\n") 82 | 83 | // empty page? skip if so. 84 | txt = strings.TrimSpace(txt) 85 | if len(txt) == 0 { 86 | continue 87 | } 88 | 89 | textB = append(textB, []byte(txt)...) 
90 | if int64(len(textB)) > size { 91 | textB = textB[:size] 92 | } 93 | 94 | if _, err = w.Write(textB); err != nil { 95 | return written, err 96 | } 97 | 98 | size -= int64(len(textB)) 99 | written += int64(len(txt)) 100 | } 101 | 102 | return written, nil 103 | } 104 | 105 | // PDFGetCreationDate tries to get the creation date 106 | func PDFGetCreationDate(f io.ReadSeeker) (date time.Time, valid bool) { 107 | // Below code is forked from https://github.com/unidoc/unidoc-examples/blob/master/pdf/metadata/pdf_metadata_get_docinfo.go 108 | pdfReader, err := pdf.NewPdfReader(f) 109 | if err != nil { 110 | return date, false 111 | } 112 | 113 | trailerDict, err := pdfReader.GetTrailer() 114 | if err != nil || trailerDict == nil { 115 | return date, false 116 | } 117 | 118 | var infoDict *core.PdfObjectDictionary 119 | 120 | infoObj := trailerDict.Get("Info") 121 | switch t := infoObj.(type) { 122 | case *core.PdfObjectReference: 123 | infoRef := t 124 | infoObj, err = pdfReader.GetIndirectObjectByNumber(int(infoRef.ObjectNumber)) 125 | infoObj = core.TraceToDirectObject(infoObj) 126 | if err != nil { 127 | return date, false 128 | } 129 | infoDict, _ = infoObj.(*core.PdfObjectDictionary) 130 | case *core.PdfObjectDictionary: 131 | infoDict = t 132 | } 133 | 134 | if infoDict == nil { 135 | return date, false 136 | } 137 | 138 | if str, has := infoDict.Get("CreationDate").(*core.PdfObjectString); has { 139 | creationDateA := strings.TrimPrefix(str.String(), "D:") 140 | 141 | time1, err := time.Parse("20060102150405-07'00'", creationDateA) 142 | if err == nil { 143 | return time1.UTC(), true 144 | } 145 | } 146 | 147 | return date, false 148 | } 149 | -------------------------------------------------------------------------------- /PPT 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: PPT 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 
4 | Author: Peter Kleissner 5 | 6 | Placeholder file until PPT conversion code 2 text is available. 7 | */ 8 | 9 | package fileconversion 10 | 11 | import "bytes" 12 | 13 | // IsFilePPT checks if the data indicates a PPT file 14 | // PPT has multiple signature according to https://www.filesignatures.net/index.php?page=search&search=PPT&mode=EXT, D0 CF 11 E0 A1 B1 1A E1. This overlaps with others (including DOC ans XLS). 15 | func IsFilePPT(data []byte) bool { 16 | return bytes.HasPrefix(data, []byte{0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1}) 17 | } 18 | -------------------------------------------------------------------------------- /PPTX 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: PPTX 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | This code is a fork from https://github.com/mr-tim/rol-o-decks/blob/master/indexer/indexer.go. 7 | */ 8 | 9 | package fileconversion 10 | 11 | import ( 12 | "archive/zip" 13 | "bytes" 14 | "io" 15 | "sort" 16 | "strconv" 17 | "strings" 18 | 19 | "gopkg.in/xmlpath.v2" 20 | ) 21 | 22 | // PPTXDocument is a PPTX document loaded into memory 23 | type PPTXDocument struct { 24 | Slides []PPTXSlide 25 | } 26 | 27 | // PPTXSlide is a single PPTX slide 28 | type PPTXSlide struct { 29 | SlideNumber int 30 | //ThumbnailBase64 string 31 | TextContent string 32 | } 33 | 34 | // SlideNumberSorter is used for sorting 35 | type SlideNumberSorter []PPTXSlide 36 | 37 | func (a SlideNumberSorter) Len() int { return len(a) } 38 | func (a SlideNumberSorter) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 39 | func (a SlideNumberSorter) Less(i, j int) bool { return a[i].SlideNumber < a[j].SlideNumber } 40 | 41 | // PPTX2Text extracts text of a PowerPoint document 42 | // Size is the full size of the input file. 
43 | func PPTX2Text(file io.ReaderAt, size int64) (string, error) { 44 | 45 | r, err := zip.NewReader(file, size) 46 | if err != nil { 47 | return "", err 48 | } 49 | 50 | doc := parsePPTXDocument(r) 51 | 52 | return doc.AsText(), nil 53 | } 54 | 55 | // IsFilePPTX checks if the data indicates a PPTX file 56 | // PPTX has a signature of 50 4B 03 04 57 | // Warning: This collides with ZIP, DOCX and other zip-based files. 58 | func IsFilePPTX(data []byte) bool { 59 | return bytes.HasPrefix(data, []byte{0x50, 0x4B, 0x03, 0x04}) 60 | } 61 | 62 | func extractSlideContent(f *zip.File) string { 63 | p := xmlpath.MustCompile("//t") 64 | zr, _ := f.Open() 65 | defer zr.Close() 66 | root, _ := xmlpath.Parse(zr) 67 | i := p.Iter(root) 68 | content := make([]string, 0) 69 | for i.Next() { 70 | n := i.Node() 71 | content = append(content, n.String()) 72 | } 73 | textContent := strings.Join(content, "\n") 74 | return textContent 75 | } 76 | 77 | func parsePPTXDocument(r *zip.Reader) (doc PPTXDocument) { 78 | 79 | for _, f := range r.File { 80 | if strings.HasPrefix(f.Name, "ppt/slides/") && !strings.HasPrefix(f.Name, "ppt/slides/_rels") { 81 | slideNumberStr := strings.TrimSuffix(strings.TrimPrefix(strings.ToLower(f.Name), "ppt/slides/slide"), ".xml") 82 | slideNumber, _ := strconv.Atoi(slideNumberStr) 83 | 84 | // grab the text content 85 | doc.Slides = append(doc.Slides, PPTXSlide{ 86 | SlideNumber: slideNumber, 87 | TextContent: extractSlideContent(f), 88 | //ThumbnailBase64: generateThumbnail(fileToIndex, slideNumber), 89 | }) 90 | } 91 | } 92 | 93 | sort.Sort(SlideNumberSorter(doc.Slides)) 94 | 95 | return doc 96 | } 97 | 98 | // AsText returns the text on all slides 99 | func (doc PPTXDocument) AsText() (text string) { 100 | 101 | for n, slide := range doc.Slides { 102 | if slide.TextContent == "" { // skip empty slides 103 | continue 104 | } 105 | 106 | if n > 0 && text != "" { 107 | text += "\n\n" 108 | } 109 | 110 | text += "Slide " + strconv.Itoa(n+1) + ":\n" 111 | 
text += slide.TextContent 112 | } 113 | 114 | return text 115 | } 116 | -------------------------------------------------------------------------------- /Picture.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: Picture.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import ( 10 | "bytes" 11 | "image" 12 | _ "image/gif" // automatic registration 13 | "image/jpeg" 14 | _ "image/png" // REQUIRED! automatic registration of PNG decoding for image.Decode 15 | 16 | _ "golang.org/x/image/bmp" // Required for BMP decoding 17 | _ "golang.org/x/image/tiff" // Required for TIFF decoding 18 | 19 | "github.com/nfnt/resize" 20 | ) 21 | 22 | // IsExcessiveLargePicture checks if the picture has reasonable width and height, preventing potential DoS when decoding it 23 | // This protects against this problem: If the image claims to be large (in terms of width & height), jpeg.Decode may use a lot of memory, see https://github.com/golang/go/issues/10532. 24 | func IsExcessiveLargePicture(Picture []byte) (excessive bool, err error) { 25 | config, _, err := image.DecodeConfig(bytes.NewBuffer(Picture)) 26 | if err != nil { 27 | return false, err 28 | } 29 | 30 | return config.Width > 7680 || config.Height > 4320, nil 31 | } 32 | 33 | // CompressJPEG compresses a JPEG picture according to the input 34 | // Warning: If the image claims to be large (in terms of width & height), this may use a lot of memory. Use IsExcessiveLargePicture first. 
35 | func CompressJPEG(Picture []byte, quality int) (compressed []byte) { 36 | if quality == 100 { // nothing todo on perfect quality 37 | return Picture 38 | } 39 | 40 | image, err := jpeg.Decode(bytes.NewBuffer(Picture)) 41 | if err != nil { 42 | return Picture 43 | } 44 | 45 | target := bytes.NewBuffer(make([]byte, 0, len(Picture))) 46 | 47 | err = jpeg.Encode(target, image, &jpeg.Options{Quality: quality}) 48 | if err != nil { 49 | return Picture 50 | } 51 | 52 | return target.Bytes() 53 | } 54 | 55 | // ResizeCompressPicture scales a picture down and compresses it. It accepts GIF, JPEG, PNG as input but output will always be JPEG. 56 | // Quality specifies the output JPEG quality 0-100. Anything below 75 will noticably reduce the picture quality. 57 | // Warning: If the image claims to be large (in terms of width & height), this may use a lot of memory. Use IsExcessiveLargePicture first. 58 | // Scaling a picture down is optional and only done if MaxWidth and MaxHeight are not 0. Even without rescaling, this function is useful to convert a picture into JPEG. 
59 | func ResizeCompressPicture(Picture []byte, Quality int, MaxWidth, MaxHeight uint) (compressed []byte, err error) { 60 | 61 | // decode the image 62 | img, _, err := image.Decode(bytes.NewBuffer(Picture)) 63 | if err != nil { // discard images that can't be decoded 64 | return nil, err 65 | } 66 | 67 | // resize if required 68 | if MaxWidth != 0 && MaxHeight != 0 { 69 | img = resize.Thumbnail(MaxWidth, MaxHeight, img, resize.Lanczos3) 70 | } 71 | 72 | // encode as JPEG with the specified quality 73 | target := bytes.NewBuffer(make([]byte, 0, len(Picture))) 74 | 75 | err = jpeg.Encode(target, img, &jpeg.Options{Quality: Quality}) 76 | if err != nil { 77 | return nil, err 78 | } 79 | 80 | return target.Bytes(), nil 81 | } 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fileconversion 2 | 3 | This is a Go library to convert various file formats into plaintext and provide related useful functions. 4 | 5 | This library is used for https://intelx.io and was successfully tested over 184 million individual files. It is partly written from scratch, partly forked from open source and partly a rewrite of existing code. Many existing libraries lack stability and functionality and this libraries solves that. 6 | 7 | We welcome any contributions - please open issues for any feature requests, bugs, and other related issues. 

It supports the following file formats for plaintext conversion:

* Word: DOC, DOCX, RTF, ODT
* Excel: XLS, XLSX, ODS
* PowerPoint: PPTX
* PDF
* Ebook: EPUB, MOBI
* Website: HTML

Functions for compressed and container files:

* Decompress files: GZ, BZ, BZ2, XZ
* Extract files from containers: ZIP, RAR, 7Z, TAR

Picture related functions:

* Check if pictures are excessively large
* Compress (and convert) pictures to JPEG: GIF, JPEG, PNG, BMP, TIFF
* Resize and compress pictures
* Extract pictures from PDF files

To download this library:

```
go get -u github.com/IntelligenceX/fileconversion
```

And then use it like:

```go
package main

import (
	"bytes"
	"fmt"
	"os"

	"github.com/IntelligenceX/fileconversion"
)

const sizeLimit = 2 * 1024 * 1024 // 2 MB

func main() {
	// extract text from an XLSX file
	file, err := os.Open("Test.xlsx")
	if err != nil {
		fmt.Printf("Error opening file: %s\n", err)
		return
	}

	defer file.Close()
	stat, _ := file.Stat()

	buffer := bytes.NewBuffer(make([]byte, 0, sizeLimit))

	fileconversion.XLSX2Text(file, stat.Size(), buffer, sizeLimit, -1)

	fmt.Println(buffer.String())
}
```


## Functions

The package exports the following functions:

```go
DOCX2Text(file io.ReaderAt, size int64) (string, error)
EPUB2Text(file io.ReaderAt, size int64, limit int64) (string, error)
HTML2Text(reader io.Reader) (pageText string, err error)
HTML2TextAndLinks(reader io.Reader, baseURL string) (pageText string, links []string, err error)
Mobi2Text(file io.ReadSeeker) (string, error)
ODS2Text(file io.ReaderAt,
size int64, writer io.Writer, limit int64) (written int64, err error) 83 | ODT2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error) 84 | PDFListContentStreams(f io.ReadSeeker, w io.Writer, size int64) (written int64, err error) 85 | PPTX2Text(file io.ReaderAt, size int64) (string, error) 86 | RTF2Text(inputRtf string) string 87 | XLS2Text(reader io.ReadSeeker, writer io.Writer, size int64) (written int64, err error) 88 | XLSX2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64, rowLimit int) (written int64, err error) 89 | ``` 90 | 91 | Picture functions: 92 | 93 | ```go 94 | IsExcessiveLargePicture(Picture []byte) (excessive bool, err error) 95 | CompressJPEG(Picture []byte, quality int) (compressed []byte) 96 | ResizeCompressPicture(Picture []byte, Quality int, MaxWidth, MaxHeight uint) 97 | PDFExtractImages(input io.ReadSeeker) (images []ImageResult, err error) 98 | ``` 99 | 100 | Compression and container file functions: 101 | 102 | ```go 103 | DecompressFile(data []byte) (decompressed []byte, valid bool) 104 | ContainerExtractFiles(data []byte, callback func(name string, size int64, date time.Time, data []byte)) 105 | ``` 106 | 107 | ## Dependencies 108 | 109 | This library uses other go packages. 
Run the following command to download them: 110 | 111 | ``` 112 | go get -u github.com/nwaples/rardecode 113 | go get -u github.com/saracen/go7z 114 | go get -u github.com/ulikunitz/xz 115 | go get -u github.com/mattetti/filebuffer 116 | go get -u github.com/richardlehane/mscfb 117 | go get -u github.com/taylorskalyo/goreader/epub 118 | go get -u github.com/PuerkitoBio/goquery 119 | go get -u github.com/ssor/bom 120 | go get -u github.com/levigross/exp-html 121 | go get -u github.com/neofight/mobi/convert 122 | go get -u github.com/neofight/mobi/headers 123 | go get -u github.com/unidoc/unipdf 124 | go get -u github.com/nfnt/resize 125 | go get -u github.com/tealeg/xlsx 126 | go get -u gopkg.in/xmlpath.v2 127 | ``` 128 | 129 | ## Tests 130 | 131 | There are no functional tests. The only test functions are used manually for debugging. 132 | 133 | ## Forks 134 | 135 | Other packages were tested and either found insufficient, or unstable. Many of the below listed packages were found to be unstable, cause crashes, as well as exhaust memory due to bad programming, bad input sanitizing and bad memory management. 136 | 137 | * `html2text` is forked from https://github.com/jaytaylor/html2text 138 | * `odf` is forked from https://github.com/knieriem/odf 139 | * `ole2` is forked and partly rewritten from https://github.com/extrame/ole2 140 | * `xls` is forked from https://github.com/sergeilem/xls which is a fork from https://github.com/extrame/xls 141 | * `doc` is forked from https://github.com/EndFirstCorp/doc2txt 142 | * `docx` is forked from https://github.com/guylaor/goword 143 | * `mobi` is forked from https://github.com/neofight/mobi 144 | * `odt` is forked from https://github.com/lu4p/cat 145 | * `pptx` is forked from https://github.com/mr-tim/rol-o-decks 146 | * `rtf` is forked from https://github.com/J45k4/rtf-go 147 | 148 | ## License 149 | 150 | This is free and unencumbered software released into the public domain. 
151 | 152 | Note that this package includes, or consists partly of forks or rewrite of existing open source code. Use at your own risk. Intelligence X does not provide any warranty for this library or any parts of it. 153 | -------------------------------------------------------------------------------- /RTF 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: RTF 2 Text.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | This code is forked from https://github.com/J45k4/rtf-go and extracts text from RTF files. 7 | It contains an important fix for a bug that was triggered with 06ffe2e7-06b6-41d6-9905-3a225fd55537 with an "index out of range" crash. 8 | It contains another fix to properly decode foreign encodings. 9 | 10 | Warning: rtfRegex.FindAllStringSubmatch may use excessive memory! Example System ID that causes problems: 02cf9199-2cda-4fa1-b830-060c67417d2d. 11 | 12 | An alternative solution is https://github.com/EndFirstCorp/rtf2txt, but it was found to output everything as one long line without LFs. 
13 | */ 14 | 15 | package fileconversion 16 | 17 | import ( 18 | "bytes" 19 | "regexp" 20 | "strconv" 21 | "strings" 22 | 23 | "golang.org/x/text/encoding" 24 | "golang.org/x/text/encoding/charmap" 25 | ) 26 | 27 | var destinations = map[string]bool{ 28 | "aftncn": true, 29 | "aftnsep": true, 30 | "aftnsepc": true, 31 | "annotation": true, 32 | "atnauthor": true, 33 | "atndate": true, 34 | "atnicn": true, 35 | "atnid": true, 36 | "atnparent": true, 37 | "atnref": true, 38 | "atntime": true, 39 | "atrfend": true, 40 | "atrfstart": true, 41 | "author": true, 42 | "background": true, 43 | "bkmkend": true, 44 | "bkmkstart": true, 45 | "blipuid": true, 46 | "buptim": true, 47 | "category": true, 48 | "colorschememapping": true, 49 | "colortbl": true, 50 | "comment": true, 51 | "company": true, 52 | "creatim": true, 53 | "datafield": true, 54 | "datastore": true, 55 | "defchp": true, 56 | "defpap": true, 57 | "do": true, 58 | "doccomm": true, 59 | "docvar": true, 60 | "dptxbxtext": true, 61 | "ebcend": true, 62 | "ebcstart": true, 63 | "factoidname": true, 64 | "falt": true, 65 | "fchars": true, 66 | "ffdeftext": true, 67 | "ffentrymcr": true, 68 | "ffexitmcr": true, 69 | "ffformat": true, 70 | "ffhelptext": true, 71 | "ffl": true, 72 | "ffname": true, 73 | "ffstattext": true, 74 | "field": true, 75 | "file": true, 76 | "filetbl": true, 77 | "fldinst": true, 78 | "fldrslt": true, 79 | "fldtype": true, 80 | "fname": true, 81 | "fontemb": true, 82 | "fontfile": true, 83 | "fonttbl": true, 84 | "footer": true, 85 | "footerf": true, 86 | "footerl": true, 87 | "footerr": true, 88 | "footnote": true, 89 | "formfield": true, 90 | "ftncn": true, 91 | "ftnsep": true, 92 | "ftnsepc": true, 93 | "g": true, 94 | "generator": true, 95 | "gridtbl": true, 96 | "header": true, 97 | "headerf": true, 98 | "headerl": true, 99 | "headerr": true, 100 | "hl": true, 101 | "hlfr": true, 102 | "hlinkbase": true, 103 | "hlloc": true, 104 | "hlsrc": true, 105 | "hsv": true, 106 | "htmltag": true, 
107 | "info": true, 108 | "keycode": true, 109 | "keywords": true, 110 | "latentstyles": true, 111 | "lchars": true, 112 | "levelnumbers": true, 113 | "leveltext": true, 114 | "lfolevel": true, 115 | "linkval": true, 116 | "list": true, 117 | "listlevel": true, 118 | "listname": true, 119 | "listoverride": true, 120 | "listoverridetable": true, 121 | "listpicture": true, 122 | "liststylename": true, 123 | "listtable": true, 124 | "listtext": true, 125 | "lsdlockedexcept": true, 126 | "macc": true, 127 | "maccPr": true, 128 | "mailmerge": true, 129 | "maln": true, 130 | "malnScr": true, 131 | "manager": true, 132 | "margPr": true, 133 | "mbar": true, 134 | "mbarPr": true, 135 | "mbaseJc": true, 136 | "mbegChr": true, 137 | "mborderBox": true, 138 | "mborderBoxPr": true, 139 | "mbox": true, 140 | "mboxPr": true, 141 | "mchr": true, 142 | "mcount": true, 143 | "mctrlPr": true, 144 | "md": true, 145 | "mdeg": true, 146 | "mdegHide": true, 147 | "mden": true, 148 | "mdiff": true, 149 | "mdPr": true, 150 | "me": true, 151 | "mendChr": true, 152 | "meqArr": true, 153 | "meqArrPr": true, 154 | "mf": true, 155 | "mfName": true, 156 | "mfPr": true, 157 | "mfunc": true, 158 | "mfuncPr": true, 159 | "mgroupChr": true, 160 | "mgroupChrPr": true, 161 | "mgrow": true, 162 | "mhideBot": true, 163 | "mhideLeft": true, 164 | "mhideRight": true, 165 | "mhideTop": true, 166 | "mhtmltag": true, 167 | "mlim": true, 168 | "mlimloc": true, 169 | "mlimlow": true, 170 | "mlimlowPr": true, 171 | "mlimupp": true, 172 | "mlimuppPr": true, 173 | "mm": true, 174 | "mmaddfieldname": true, 175 | "mmath": true, 176 | "mmathPict": true, 177 | "mmathPr": true, 178 | "mmaxdist": true, 179 | "mmc": true, 180 | "mmcJc": true, 181 | "mmconnectstr": true, 182 | "mmconnectstrdata": true, 183 | "mmcPr": true, 184 | "mmcs": true, 185 | "mmdatasource": true, 186 | "mmheadersource": true, 187 | "mmmailsubject": true, 188 | "mmodso": true, 189 | "mmodsofilter": true, 190 | "mmodsofldmpdata": true, 191 | 
"mmodsomappedname": true, 192 | "mmodsoname": true, 193 | "mmodsorecipdata": true, 194 | "mmodsosort": true, 195 | "mmodsosrc": true, 196 | "mmodsotable": true, 197 | "mmodsoudl": true, 198 | "mmodsoudldata": true, 199 | "mmodsouniquetag": true, 200 | "mmPr": true, 201 | "mmquery": true, 202 | "mmr": true, 203 | "mnary": true, 204 | "mnaryPr": true, 205 | "mnoBreak": true, 206 | "mnum": true, 207 | "mobjDist": true, 208 | "moMath": true, 209 | "moMathPara": true, 210 | "moMathParaPr": true, 211 | "mopEmu": true, 212 | "mphant": true, 213 | "mphantPr": true, 214 | "mplcHide": true, 215 | "mpos": true, 216 | "mr": true, 217 | "mrad": true, 218 | "mradPr": true, 219 | "mrPr": true, 220 | "msepChr": true, 221 | "mshow": true, 222 | "mshp": true, 223 | "msPre": true, 224 | "msPrePr": true, 225 | "msSub": true, 226 | "msSubPr": true, 227 | "msSubSup": true, 228 | "msSubSupPr": true, 229 | "msSup": true, 230 | "msSupPr": true, 231 | "mstrikeBLTR": true, 232 | "mstrikeH": true, 233 | "mstrikeTLBR": true, 234 | "mstrikeV": true, 235 | "msub": true, 236 | "msubHide": true, 237 | "msup": true, 238 | "msupHide": true, 239 | "mtransp": true, 240 | "mtype": true, 241 | "mvertJc": true, 242 | "mvfmf": true, 243 | "mvfml": true, 244 | "mvtof": true, 245 | "mvtol": true, 246 | "mzeroAsc": true, 247 | "mzeroDesc": true, 248 | "mzeroWid": true, 249 | "nesttableprops": true, 250 | "nextfile": true, 251 | "nonesttables": true, 252 | "objalias": true, 253 | "objclass": true, 254 | "objdata": true, 255 | "object": true, 256 | "objname": true, 257 | "objsect": true, 258 | "objtime": true, 259 | "oldcprops": true, 260 | "oldpprops": true, 261 | "oldsprops": true, 262 | "oldtprops": true, 263 | "oleclsid": true, 264 | "operator": true, 265 | "panose": true, 266 | "password": true, 267 | "passwordhash": true, 268 | "pgp": true, 269 | "pgptbl": true, 270 | "picprop": true, 271 | "pict": true, 272 | "pn": true, 273 | "pnseclvl": true, 274 | "pntext": true, 275 | "pntxta": true, 276 | "pntxtb": 
true, 277 | "printim": true, 278 | "private": true, 279 | "propname": true, 280 | "protend": true, 281 | "protstart": true, 282 | "protusertbl": true, 283 | "pxe": true, 284 | "result": true, 285 | "revtbl": true, 286 | "revtim": true, 287 | "rsidtbl": true, 288 | "rxe": true, 289 | "shp": true, 290 | "shpgrp": true, 291 | "shpinst": true, 292 | "shppict": true, 293 | "shprslt": true, 294 | "shptxt": true, 295 | "sn": true, 296 | "sp": true, 297 | "staticval": true, 298 | "stylesheet": true, 299 | "subject": true, 300 | "sv": true, 301 | "svb": true, 302 | "tc": true, 303 | "template": true, 304 | "themedata": true, 305 | "title": true, 306 | "txe": true, 307 | "ud": true, 308 | "upr": true, 309 | "userprops": true, 310 | "wgrffmtfilter": true, 311 | "windowcaption": true, 312 | "writereservation": true, 313 | "writereservhash": true, 314 | "xe": true, 315 | "xform": true, 316 | "xmlattrname": true, 317 | "xmlattrvalue": true, 318 | "xmlclose": true, 319 | "xmlname": true, 320 | "xmlnstbl": true, 321 | "xmlopen": true, 322 | } 323 | 324 | var specialCharacters = map[string]string{ 325 | "par": "\n", 326 | "sect": "\n\n", 327 | "page": "\n\n", 328 | "line": "\n", 329 | "tab": "\t", 330 | "emdash": "\u2014", 331 | "endash": "\u2013", 332 | "emspace": "\u2003", 333 | "enspace": "\u2002", 334 | "qmspace": "\u2005", 335 | "bullet": "\u2022", 336 | "lquote": "\u2018", 337 | "rquote": "\u2019", 338 | "ldblquote": "\u201C", 339 | "rdblquote": "\u201D", 340 | } 341 | 342 | var charmaps = map[string]*charmap.Charmap{ 343 | "437": charmap.CodePage437, 344 | // "708": nil, 345 | // "709": nil, 346 | // "710": nil, 347 | // "711": nil, 348 | // "720": nil, 349 | // "819": nil, 350 | "850": charmap.CodePage850, 351 | "852": charmap.CodePage852, 352 | "860": charmap.CodePage860, 353 | "862": charmap.CodePage862, 354 | "863": charmap.CodePage863, 355 | // "864": nil, 356 | "865": charmap.CodePage865, 357 | "866": charmap.CodePage866, 358 | // "874": nil, 359 | // "932": nil, 360 | 
// "936": nil, 361 | // "949": nil, 362 | // "950": nil, 363 | "1250": charmap.Windows1250, 364 | "1251": charmap.Windows1251, 365 | "1252": charmap.Windows1252, 366 | "1253": charmap.Windows1253, 367 | "1254": charmap.Windows1254, 368 | "1255": charmap.Windows1255, 369 | "1256": charmap.Windows1256, 370 | "1257": charmap.Windows1257, 371 | "1258": charmap.Windows1258, 372 | // "1361": nil, 373 | } 374 | 375 | var rtfRegex = regexp.MustCompile( 376 | "(?i)" + 377 | `\\([a-z]{1,32})(-?\d{1,10})?[ ]?` + 378 | `|\\'([0-9a-f]{2})` + 379 | `|\\([^a-z])` + 380 | `|([{}])` + 381 | `|[\r\n]+` + 382 | `|(.)`) 383 | 384 | type stackEntry struct { 385 | NumberOfCharactersToSkip int 386 | Ignorable bool 387 | } 388 | 389 | func newStackEntry(numberOfCharactersToSkip int, ignorable bool) stackEntry { 390 | return stackEntry{ 391 | NumberOfCharactersToSkip: numberOfCharactersToSkip, 392 | Ignorable: ignorable, 393 | } 394 | } 395 | 396 | // RTF2Text removes rtf characters from string and returns the new string. 
func RTF2Text(inputRtf string) string {
	// Parser state:
	//   charMap/decoder — active code page selected by \ansicpgN (nil = pass bytes through)
	//   stack           — saved (ucskip, ignorable) state per open {...} group
	//   ignorable       — inside a destination group whose content is not document text
	//   ucskip          — characters to skip after a \uN unicode escape (set by \ucN)
	//   curskip         — countdown of characters still to be skipped
	var charMap *charmap.Charmap
	var decoder *encoding.Decoder
	var stack []stackEntry
	var ignorable bool
	ucskip := 1
	curskip := 0

	// Warning (see file header): FindAllStringSubmatch may use excessive memory on some inputs.
	matches := rtfRegex.FindAllStringSubmatch(inputRtf, -1)
	var returnBuffer bytes.Buffer

	for _, match := range matches {
		// Submatch groups of rtfRegex; at most one is non-empty per match.
		word := match[1]      // control word, e.g. \par
		arg := match[2]       // optional numeric argument of the control word
		hex := match[3]       // \'hh hex-escaped byte
		character := match[4] // escaped non-alpha character, e.g. \~ or \{
		brace := match[5]     // group delimiter { or }
		tchar := match[6]     // plain text character

		switch {
		case tchar != "":
			if curskip > 0 {
				curskip--
			} else if !ignorable {
				// Decode through the active code page when one was selected.
				if charMap == nil || decoder == nil {
					returnBuffer.WriteString(tchar)
				} else {
					tcharDec, err := decoder.String(tchar)
					if err == nil {
						returnBuffer.WriteString(tcharDec)
					}
				}
			}
		case brace != "":
			curskip = 0
			if brace == "{" {
				// Entering a group: remember the current state to restore on "}".
				stack = append(
					stack, newStackEntry(ucskip, ignorable))
			} else if brace == "}" {
				// There was a crash here with item 06ffe2e7-06b6-41d6-9905-3a225fd55537
				// It's fixed by checking l == 0 and handling it as special case
				if l := len(stack); l > 0 {
					entry := stack[l-1]
					stack = stack[:l-1]
					ucskip = entry.NumberOfCharactersToSkip
					ignorable = entry.Ignorable
				}
			}
		case character != "":
			curskip = 0
			if character == "~" {
				// \~ is a non-breaking space.
				if !ignorable {
					returnBuffer.WriteString("\xA0")
				}
			} else if strings.Contains("{}\\", character) {
				// Escaped literal brace or backslash.
				if !ignorable {
					returnBuffer.WriteString(character)
				}
			} else if character == "*" {
				// \* marks the enclosing destination as ignorable.
				ignorable = true
			}
		case word != "":
			curskip = 0
			if destinations[word] {
				// Known non-text destination: suppress its content.
				ignorable = true
			} else if ignorable {
				// Inside an ignorable destination: drop control words.
			} else if specialCharacters[word] != "" {
				returnBuffer.WriteString(
					specialCharacters[word])
			} else if word == "ansicpg" {
				// Select the code page for subsequent text/hex bytes.
				var ok bool
				if charMap, ok = charmaps[arg]; ok {
					decoder = charMap.NewDecoder()
				} else {
					// encoding not supported, continue anyway
				}
			} else if word == "uc" {
				// \ucN: number of fallback characters following each \uN escape.
				i, _ := strconv.Atoi(arg)
				ucskip = i
			} else if word == "u" {
				// \uN: unicode code point; negative values are signed 16-bit.
				c, _ := strconv.Atoi(arg)
				if c < 0 {
					c += 0x10000
				}
				returnBuffer.WriteRune(rune(c))
				curskip = ucskip
			}
		case hex != "":
			if curskip > 0 {
				curskip--
			} else if !ignorable {
				// \'hh byte: decode through the active code page when one is set.
				c, _ := strconv.ParseInt(hex, 16, 0)
				if charMap == nil {
					returnBuffer.WriteRune(rune(c))
				} else {
					returnBuffer.WriteRune(
						charMap.DecodeByte(byte(c)))
				}
			}
		}
	}
	return returnBuffer.String()
}

// IsFileRTF checks if the data indicates a RTF file
// RTF has a signature of 7B 5C 72 74 66 31, or in string "{\rtf1"
func IsFileRTF(data []byte) bool {
	return bytes.HasPrefix(data, []byte{0x7B, 0x5C, 0x72, 0x74, 0x66, 0x31})
}

-------------------------------------------------------------------------------- /XLS 2 Text.go: --------------------------------------------------------------------------------

/*
File Name:  XLS 2 Text.go
Copyright:  2019 Kleissner Investments s.r.o.
Author:     Peter Kleissner

The code originally used https://github.com/extrame/xls, which revealed multiple bugs that crashed for certain Excel files.
Now it forks the xls package and the underlying ole2 package. This fork also fixes excessive memory usage issues.
*/

package fileconversion

import (
	"bytes"
	"fmt"
	"io"
	"strings"

	"github.com/IntelligenceX/fileconversion/xls"
)

// XLS2Text extracts text from an Excel sheet. It returns bytes written.
// The parameter size is the max amount of bytes (not characters) to write out.
// The whole Excel file is required even for partial text extraction.
This function returns no error with 0 bytes written in case of corrupted or invalid file. 24 | func XLS2Text(reader io.ReadSeeker, writer io.Writer, size int64) (written int64, err error) { 25 | 26 | xlFile, err := xls.OpenReader(reader, "utf-8") 27 | if err != nil || xlFile == nil { 28 | return 0, err 29 | } 30 | 31 | for n := 0; n < xlFile.NumSheets(); n++ { 32 | if sheet1 := xlFile.GetSheet(n); sheet1 != nil { 33 | if err = writeOutput(writer, []byte(xlGenerateSheetTitle(sheet1.Name, n, int(sheet1.MaxRow))), &written, &size); err != nil || size == 0 { 34 | return written, err 35 | } 36 | 37 | for m := 0; m <= int(sheet1.MaxRow); m++ { 38 | row1 := sheet1.Row(m) 39 | if row1 == nil { 40 | continue 41 | } 42 | 43 | rowText := "" 44 | 45 | // go through all columns 46 | for c := row1.FirstCol(); c < row1.LastCol(); c++ { 47 | if text := row1.Col(c); text != "" { 48 | text = cleanCell(text) 49 | 50 | if c > row1.FirstCol() { 51 | rowText += ", " 52 | } 53 | rowText += text 54 | } 55 | } 56 | 57 | rowText += "\n" 58 | 59 | if err = writeOutput(writer, []byte(rowText), &written, &size); err != nil || size == 0 { 60 | return written, err 61 | } 62 | } 63 | } 64 | } 65 | 66 | return written, nil 67 | } 68 | 69 | // cleanCell returns a cleaned cell text without new-lines 70 | func cleanCell(text string) string { 71 | text = strings.ReplaceAll(text, "\n", " ") 72 | text = strings.ReplaceAll(text, "\r", "") 73 | text = strings.TrimSpace(text) 74 | 75 | return text 76 | } 77 | 78 | func xlGenerateSheetTitle(name string, number, rows int) (title string) { 79 | if number > 0 { 80 | title += "\n" 81 | } 82 | 83 | title += fmt.Sprintf("Sheet \"%s\" (%d rows):\n", name, rows) 84 | 85 | return title 86 | } 87 | 88 | func writeOutput(writer io.Writer, output []byte, alreadyWritten *int64, size *int64) (err error) { 89 | 90 | if int64(len(output)) > *size { 91 | output = output[:*size] 92 | } 93 | 94 | *size -= int64(len(output)) 95 | 96 | writtenOut, err := writer.Write(output) 97 
| *alreadyWritten += int64(writtenOut) 98 | 99 | return err 100 | } 101 | 102 | // IsFileXLS checks if the data indicates a XLS file 103 | // XLS has a signature of D0 CF 11 E0 A1 B1 1A E1 104 | func IsFileXLS(data []byte) bool { 105 | return bytes.HasPrefix(data, []byte{0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1}) 106 | } 107 | 108 | // XLS2Cells converts an XLS file to individual cells 109 | func XLS2Cells(reader io.ReadSeeker) (cells []string, err error) { 110 | 111 | xlFile, err := xls.OpenReader(reader, "utf-8") 112 | if err != nil || xlFile == nil { 113 | return nil, err 114 | } 115 | 116 | for n := 0; n < xlFile.NumSheets(); n++ { 117 | if sheet1 := xlFile.GetSheet(n); sheet1 != nil { 118 | for m := 0; m <= int(sheet1.MaxRow); m++ { 119 | row1 := sheet1.Row(m) 120 | if row1 == nil { 121 | continue 122 | } 123 | 124 | for c := row1.FirstCol(); c < row1.LastCol(); c++ { 125 | if text := row1.Col(c); text != "" { 126 | text = cleanCell(text) 127 | cells = append(cells, text) 128 | } 129 | } 130 | } 131 | } 132 | } 133 | 134 | return 135 | } 136 | -------------------------------------------------------------------------------- /XLSX 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: XLSX 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | * https://github.com/tealeg/xlsx is used in production. 7 | Some files used more than 1 GB of memory, even though the file itself is only 9 MB. Example 971bd55b-5cbd-43d2-899e-d4a2a7d0a883. 8 | The underlying issue was how it decoded the worksheet XML into large structures. There was no easy fix for that. 9 | 10 | * https://github.com/unidoc/unioffice is available as inactive implementation below, although it was found to also use lots of RAM. 11 | 12 | * https://github.com/360EntSecGroup-Skylar/excelize was not tested in detail, but seems very similar to "tealeg/xlsx". 
 * https://github.com/szyhf/go-excel is faster and uses smaller resources than "tealeg/xlsx", but lacks quality when extracting cells and misses many.

*/

package fileconversion

import (
	"bytes"
	"io"

	"github.com/tealeg/xlsx"
)

// IsFileXLSX checks if the data indicates a XLSX file
// XLSX has a signature of 50 4B 03 04
// Warning: This collides with ZIP, DOCX and other zip-based files.
func IsFileXLSX(data []byte) bool {
	return bytes.HasPrefix(data, []byte{0x50, 0x4B, 0x03, 0x04})
}

// XLSX2Text extracts text of an Excel sheet
// Size is the full size of the input file. Limit is the output limit in bytes.
// rowLimit defines how many rows per sheet to extract. -1 means unlimited. This exists as protection against some XLSX files that may use excessive amount of memory.
func XLSX2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64, rowLimit int) (written int64, err error) {
	var xlFile *xlsx.File

	// Open with or without the per-sheet row cap depending on rowLimit.
	if rowLimit == -1 {
		xlFile, err = xlsx.OpenReaderAt(file, size)
	} else {
		xlFile, err = xlsx.OpenReaderAtWithRowLimit(file, size, rowLimit)
	}
	if err != nil {
		return 0, err
	}

	for n, sheet := range xlFile.Sheets {
		// Sheet title first; writeOutput truncates to the remaining limit, and
		// limit == 0 means the output budget is exhausted.
		if err = writeOutput(writer, []byte(xlGenerateSheetTitle(sheet.Name, n, int(sheet.MaxRow))), &written, &limit); err != nil || limit == 0 {
			return written, err
		}

		for _, row := range sheet.Rows {

			rowText := ""

			// go through all columns
			for m, cell := range row.Cells {
				text := cell.String()
				if text != "" {
					text = cleanCell(text)

					if m > 0 {
						rowText += ", "
					}
					rowText += text
				}
			}

			rowText += "\n"

			if err = writeOutput(writer, []byte(rowText), &written, &limit); err != nil || limit == 0 {
				return written, err
			}
		}
	}

	return written, nil
}

// XLSX2Cells converts an XLSX file to individual cells
// Size is the full size of the input file.
// rowLimit defines how many rows per sheet to extract. -1 means unlimited. This exists as protection against some XLSX files that may use excessive amount of memory.
func XLSX2Cells(file io.ReaderAt, size int64, rowLimit int) (cells []string, err error) {
	var xlFile *xlsx.File

	if rowLimit == -1 {
		xlFile, err = xlsx.OpenReaderAt(file, size)
	} else {
		xlFile, err = xlsx.OpenReaderAtWithRowLimit(file, size, rowLimit)
	}
	if err != nil {
		return nil, err
	}

	// Flatten every non-empty cell of every sheet into one list, in document order.
	for _, sheet := range xlFile.Sheets {
		for _, row := range sheet.Rows {
			for _, cell := range row.Cells {
				if text := cell.String(); text != "" {
					text = cleanCell(text)
					cells = append(cells, text)
				}
			}
		}
	}

	return
}

// alternative implementation using https://github.com/unidoc/unioffice, not required

/*
// XLSX2Cells2 converts an XLS file to individual cells
func XLSX2Cells2(file io.ReaderAt, size int64) (cells []string, err error) {

	xlFile, err := spreadsheet.Read(file, size)
	if err != nil || xlFile == nil {
		return nil, err
	}
	defer xlFile.Close()

	for _, sheet := range xlFile.Sheets() {
		for _, row := range sheet.Rows() {
			for _, cell := range row.Cells() {
				if text := cell.GetString(); text != "" {
					text = cleanCell(text)
					cells = append(cells, text)
				}
			}
		}
	}

	return
}

// XLSX2Text2 extracts text from an Excel sheet. It returns bytes written.
// The parameter limit is the max amount of bytes (not characters) to write out.
// The whole Excel file is required even for partial text extraction. This function returns no error with 0 bytes written in case of corrupted or invalid file.
140 | func XLSX2Text2(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error) { 141 | 142 | xlFile, err := spreadsheet.Read(file, size) 143 | if err != nil || xlFile == nil { 144 | return 0, err 145 | } 146 | defer xlFile.Close() 147 | 148 | for n, sheet := range xlFile.Sheets() { 149 | rows := sheet.Rows() 150 | if err = writeOutput(writer, []byte(xlGenerateSheetTitle(sheet.Name(), n, len(rows))), &written, &limit); err != nil || limit == 0 { 151 | return written, err 152 | } 153 | 154 | for _, row := range sheet.Rows() { 155 | rowText := "" 156 | 157 | for n, cell := range row.Cells() { 158 | text := cell.GetString() 159 | text = cleanCell(text) 160 | 161 | if n > 1 { 162 | rowText += ", " 163 | } 164 | rowText += text 165 | } 166 | 167 | rowText += "\n" 168 | 169 | if err = writeOutput(writer, []byte(rowText), &written, &size); err != nil || size == 0 { 170 | return written, err 171 | } 172 | } 173 | } 174 | 175 | return written, nil 176 | } 177 | */ 178 | 179 | // implementation using https://github.com/szyhf/go-excel 180 | 181 | /* 182 | // XLSX2Text extracts text of an Excel sheet 183 | // Size is the full size of the input file. 
184 | func XLSX2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error) { 185 | 186 | conn := excel.NewConnecter() 187 | err = conn.OpenReader(file, size) 188 | if err != nil { 189 | return 0, err 190 | } 191 | defer conn.Close() 192 | 193 | for n, sheetName := range conn.GetSheetNames() { 194 | if err = writeOutput(writer, []byte(xlGenerateSheetTitle(sheetName, n, 100)), &written, &limit); err != nil || limit == 0 { 195 | return written, err 196 | } 197 | 198 | rd, err := conn.NewReader(sheetName) 199 | if err != nil { 200 | continue 201 | } 202 | 203 | for rd.Next() { 204 | var rowCells []string 205 | err1 := rd.Read(&rowCells) 206 | 207 | rowText := "" 208 | 209 | // go through all columns 210 | for m, cell := range rowCells { 211 | if text := cell; text != "" { 212 | text = cleanCell(text) 213 | 214 | if m > 0 { 215 | rowText += ", " 216 | } 217 | rowText += text 218 | } 219 | } 220 | 221 | rowText += "\n" 222 | 223 | if err = writeOutput(writer, []byte(rowText), &written, &limit); err != nil || limit == 0 { 224 | return written, err 225 | } 226 | 227 | if err1 == io.EOF { 228 | break 229 | } 230 | } 231 | } 232 | 233 | return written, nil 234 | } 235 | 236 | // XLSX2Cells converts an XLSX file to individual cells 237 | // Size is the full size of the input file. 
238 | func XLSX2Cells(file io.ReaderAt, size int64) (cells []string, err error) { 239 | 240 | conn := excel.NewConnecter() 241 | err = conn.OpenReader(file, size) 242 | if err != nil { 243 | return nil, err 244 | } 245 | defer conn.Close() 246 | 247 | loopSheet: 248 | for _, sheetName := range conn.GetSheetNames() { 249 | rd, err := conn.NewReader(sheetName) 250 | if err != nil { 251 | continue 252 | } 253 | 254 | for rd.Next() { 255 | var rowCells []string 256 | if err := rd.Read(&rowCells); err != nil { 257 | continue loopSheet 258 | } 259 | 260 | for _, cell := range rowCells { 261 | if text := cell; text != "" { 262 | text = cleanCell(text) 263 | cells = append(cells, text) 264 | } 265 | } 266 | } 267 | } 268 | 269 | return 270 | } 271 | */ 272 | -------------------------------------------------------------------------------- /ZIP.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: ZIP.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import "bytes" 10 | 11 | // IsFileZIP checks if the data indicates a ZIP file. 12 | // Many file formats like DOCX, XLSX, PPTX and APK are actual ZIP files. 13 | // Signature 50 4B 03 04 14 | func IsFileZIP(data []byte) bool { 15 | return bytes.HasPrefix(data, []byte{0x50, 0x4B, 0x03, 0x04}) 16 | } 17 | -------------------------------------------------------------------------------- /html2text/README.md: -------------------------------------------------------------------------------- 1 | # html2text 2 | 3 | Forked from https://github.com/jaytaylor/html2text. 4 | 5 | ### Converts HTML into text of the markdown-flavored variety 6 | 7 | 8 | ## Introduction 9 | 10 | Ensure your emails are readable by all! 11 | 12 | Turns HTML into raw text, useful for sending fancy HTML emails with an equivalently nicely formatted TXT document as a fallback (e.g. 
for people who don't allow HTML emails or have other display issues). 13 | 14 | html2text is a simple golang package for rendering HTML into plaintext. 15 | 16 | There are still lots of improvements to be had, but FWIW this has worked fine for my [basic] HTML-2-text needs. 17 | 18 | It requires go 1.x or newer ;) 19 | 20 | 21 | ## Download the package 22 | 23 | ```bash 24 | go get jaytaylor.com/html2text 25 | ``` 26 | 27 | ## Example usage 28 | 29 | ```go 30 | package main 31 | 32 | import ( 33 | "fmt" 34 | 35 | "jaytaylor.com/html2text" 36 | ) 37 | 38 | func main() { 39 | inputHTML := ` 40 | 41 | 42 | My Mega Service 43 | 44 | 45 | 46 | 47 | 48 | 51 | 52 |

Welcome to your new account on my service!

53 | 54 |

55 | Here is some more information: 56 | 57 |

62 |

63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 |
Header 1Header 2
Footer 1Footer 2
Row 1 Col 1Row 1 Col 2
Row 2 Col 1Row 2 Col 2
76 | 77 | ` 78 | 79 | text, err := html2text.FromString(inputHTML, html2text.Options{PrettyTables: true}) 80 | if err != nil { 81 | panic(err) 82 | } 83 | fmt.Println(text) 84 | } 85 | ``` 86 | 87 | Output: 88 | ``` 89 | Mega Service ( http://jaytaylor.com/ ) 90 | 91 | ****************************************** 92 | Welcome to your new account on my service! 93 | ****************************************** 94 | 95 | Here is some more information: 96 | 97 | * Link 1: Example.com ( https://example.com ) 98 | * Link 2: Example2.com ( https://example2.com ) 99 | * Something else 100 | 101 | +-------------+-------------+ 102 | | HEADER 1 | HEADER 2 | 103 | +-------------+-------------+ 104 | | Row 1 Col 1 | Row 1 Col 2 | 105 | | Row 2 Col 1 | Row 2 Col 2 | 106 | +-------------+-------------+ 107 | | FOOTER 1 | FOOTER 2 | 108 | +-------------+-------------+ 109 | ``` 110 | 111 | 112 | ## Unit-tests 113 | 114 | Running the unit-tests is straightforward and standard: 115 | 116 | ```bash 117 | go test 118 | ``` 119 | 120 | -------------------------------------------------------------------------------- /html2text/html2text.go: -------------------------------------------------------------------------------- 1 | package html2text 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "regexp" 7 | "strings" 8 | "unicode" 9 | 10 | "github.com/olekukonko/tablewriter" 11 | "github.com/ssor/bom" 12 | "golang.org/x/net/html" 13 | "golang.org/x/net/html/atom" 14 | ) 15 | 16 | // Options provide toggles and overrides to control specific rendering behaviors. 17 | type Options struct { 18 | PrettyTables bool // Turns on pretty ASCII rendering for table elements. 19 | PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements. 
20 | OmitLinks bool // Turns on omitting links 21 | } 22 | 23 | // PrettyTablesOptions overrides tablewriter behaviors 24 | type PrettyTablesOptions struct { 25 | AutoFormatHeader bool 26 | AutoWrapText bool 27 | ReflowDuringAutoWrap bool 28 | ColWidth int 29 | ColumnSeparator string 30 | RowSeparator string 31 | CenterSeparator string 32 | HeaderAlignment int 33 | FooterAlignment int 34 | Alignment int 35 | ColumnAlignment []int 36 | NewLine string 37 | HeaderLine bool 38 | RowLine bool 39 | AutoMergeCells bool 40 | Borders tablewriter.Border 41 | } 42 | 43 | // NewPrettyTablesOptions creates PrettyTablesOptions with default settings 44 | func NewPrettyTablesOptions() *PrettyTablesOptions { 45 | return &PrettyTablesOptions{ 46 | AutoFormatHeader: true, 47 | AutoWrapText: true, 48 | ReflowDuringAutoWrap: true, 49 | ColWidth: tablewriter.MAX_ROW_WIDTH, 50 | ColumnSeparator: tablewriter.COLUMN, 51 | RowSeparator: tablewriter.ROW, 52 | CenterSeparator: tablewriter.CENTER, 53 | HeaderAlignment: tablewriter.ALIGN_DEFAULT, 54 | FooterAlignment: tablewriter.ALIGN_DEFAULT, 55 | Alignment: tablewriter.ALIGN_DEFAULT, 56 | ColumnAlignment: []int{}, 57 | NewLine: tablewriter.NEWLINE, 58 | HeaderLine: true, 59 | RowLine: false, 60 | AutoMergeCells: false, 61 | Borders: tablewriter.Border{Left: true, Right: true, Bottom: true, Top: true}, 62 | } 63 | } 64 | 65 | // FromHTMLNode renders text output from a pre-parsed HTML document. 
66 | func FromHTMLNode(doc *html.Node, o ...Options) (string, error) { 67 | var options Options 68 | if len(o) > 0 { 69 | options = o[0] 70 | } 71 | 72 | ctx := textifyTraverseContext{ 73 | buf: bytes.Buffer{}, 74 | options: options, 75 | } 76 | if err := ctx.traverse(doc); err != nil { 77 | return "", err 78 | } 79 | 80 | text := strings.TrimSpace(newlineRe.ReplaceAllString( 81 | strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"), 82 | ) 83 | return text, nil 84 | } 85 | 86 | // FromReader renders text output after parsing HTML for the specified 87 | // io.Reader. 88 | func FromReader(reader io.Reader, options ...Options) (string, error) { 89 | newReader, err := bom.NewReaderWithoutBom(reader) 90 | if err != nil { 91 | return "", err 92 | } 93 | doc, err := html.Parse(newReader) 94 | if err != nil { 95 | return "", err 96 | } 97 | return FromHTMLNode(doc, options...) 98 | } 99 | 100 | // FromString parses HTML from the input string, then renders the text form. 101 | func FromString(input string, options ...Options) (string, error) { 102 | bs := bom.CleanBom([]byte(input)) 103 | text, err := FromReader(bytes.NewReader(bs), options...) 104 | if err != nil { 105 | return "", err 106 | } 107 | return text, nil 108 | } 109 | 110 | var ( 111 | spacingRe = regexp.MustCompile(`[ \r\n\t]+`) 112 | newlineRe = regexp.MustCompile(`\n\n+`) 113 | ) 114 | 115 | // traverseTableCtx holds text-related context. 116 | type textifyTraverseContext struct { 117 | buf bytes.Buffer 118 | 119 | prefix string 120 | tableCtx tableTraverseContext 121 | options Options 122 | endsWithSpace bool 123 | justClosedDiv bool 124 | blockquoteLevel int 125 | lineLength int 126 | isPre bool 127 | isVirtualBQ bool // virtual blockquote 128 | } 129 | 130 | // tableTraverseContext holds table ASCII-form related context. 
131 | type tableTraverseContext struct { 132 | header []string 133 | body [][]string 134 | footer []string 135 | tmpRow int 136 | isInFooter bool 137 | } 138 | 139 | func (tableCtx *tableTraverseContext) init() { 140 | tableCtx.body = [][]string{} 141 | tableCtx.header = []string{} 142 | tableCtx.footer = []string{} 143 | tableCtx.isInFooter = false 144 | tableCtx.tmpRow = 0 145 | } 146 | 147 | func (ctx *textifyTraverseContext) handleElement(node *html.Node) error { 148 | ctx.justClosedDiv = false 149 | 150 | switch node.DataAtom { 151 | case atom.Br: 152 | return ctx.emit("\n") 153 | 154 | case atom.H1, atom.H2, atom.H3: 155 | subCtx := textifyTraverseContext{} 156 | if err := subCtx.traverseChildren(node); err != nil { 157 | return err 158 | } 159 | 160 | str := subCtx.buf.String() 161 | dividerLen := 0 162 | for _, line := range strings.Split(str, "\n") { 163 | if lineLen := len([]rune(line)); lineLen-1 > dividerLen { 164 | dividerLen = lineLen - 1 165 | } 166 | } 167 | var divider string 168 | if node.DataAtom == atom.H1 { 169 | divider = strings.Repeat("*", dividerLen) 170 | } else { 171 | divider = strings.Repeat("-", dividerLen) 172 | } 173 | 174 | if node.DataAtom == atom.H3 { 175 | return ctx.emit("\n\n" + str + "\n" + divider + "\n\n") 176 | } 177 | return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n") 178 | 179 | case atom.Blockquote: 180 | if ctx.buf.Len() == 0 && !ctx.isVirtualBQ { // do not apply blockquote if full html is blockquote 181 | return ctx.traverseChildren(node) 182 | } 183 | 184 | ctx.blockquoteLevel++ 185 | ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " " 186 | // following lines are disabled, otherwise it outputs 2x empty '>' lines 187 | //if err := ctx.emit("\n"); err != nil { 188 | // return err 189 | //} 190 | //if ctx.blockquoteLevel == 1 { 191 | // if err := ctx.emit("\n"); err != nil { 192 | // return err 193 | // } 194 | //} 195 | if err := ctx.traverseChildren(node); err != nil { 196 | return err 
197 | } 198 | ctx.blockquoteLevel-- 199 | ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) 200 | if ctx.blockquoteLevel > 0 { 201 | ctx.prefix += " " 202 | } 203 | // to remove the last "> " (or multiple on levels) added we would have to make some magic with the ctx.buf 204 | return ctx.emit("\n\n") 205 | 206 | case atom.Div: 207 | // hack
as blockquote 208 | for _, attr := range node.Attr { 209 | if attr.Key == "class" && attr.Val == "quote_container" { 210 | node.DataAtom = atom.Blockquote 211 | ctx.isVirtualBQ = true 212 | err := ctx.handleElement(node) 213 | ctx.isVirtualBQ = false 214 | return err 215 | } 216 | } 217 | 218 | if ctx.lineLength > 0 { 219 | if err := ctx.emit("\n"); err != nil { 220 | return err 221 | } 222 | } 223 | if err := ctx.traverseChildren(node); err != nil { 224 | return err 225 | } 226 | var err error 227 | if !ctx.justClosedDiv { 228 | err = ctx.emit("\n") 229 | } 230 | ctx.justClosedDiv = true 231 | return err 232 | 233 | case atom.Li: 234 | if err := ctx.emit("* "); err != nil { 235 | return err 236 | } 237 | 238 | if err := ctx.traverseChildren(node); err != nil { 239 | return err 240 | } 241 | 242 | return ctx.emit("\n") 243 | 244 | case atom.B, atom.Strong: 245 | subCtx := textifyTraverseContext{} 246 | subCtx.endsWithSpace = true 247 | if err := subCtx.traverseChildren(node); err != nil { 248 | return err 249 | } 250 | str := subCtx.buf.String() 251 | return ctx.emit("*" + str + "*") 252 | 253 | case atom.A: 254 | linkText := "" 255 | // For simple link element content with single text node only, peek at the link text. 256 | if node.FirstChild != nil && node.FirstChild.NextSibling == nil && node.FirstChild.Type == html.TextNode { 257 | linkText = node.FirstChild.Data 258 | } 259 | 260 | // If image is the only child, take its alt text as the link text. 
261 | if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img { 262 | if altText := getAttrVal(img, "alt"); altText != "" { 263 | if err := ctx.emit(altText); err != nil { 264 | return err 265 | } 266 | } 267 | } else if err := ctx.traverseChildren(node); err != nil { 268 | return err 269 | } 270 | 271 | hrefLink := "" 272 | if attrVal := getAttrVal(node, "href"); attrVal != "" { 273 | attrVal = ctx.normalizeHrefLink(attrVal) 274 | // Don't print link href if it matches link element content or if the link is empty. 275 | if !ctx.options.OmitLinks && attrVal != "" && linkText != attrVal { 276 | hrefLink = "( " + attrVal + " )" 277 | } 278 | } 279 | 280 | return ctx.emit(hrefLink) 281 | 282 | case atom.P, atom.Ul: 283 | return ctx.paragraphHandler(node) 284 | 285 | case atom.Table, atom.Tfoot, atom.Th, atom.Tr, atom.Td: 286 | if ctx.options.PrettyTables { 287 | return ctx.handleTableElement(node) 288 | } else if node.DataAtom == atom.Table { 289 | return ctx.paragraphHandler(node) 290 | } 291 | return ctx.traverseChildren(node) 292 | 293 | case atom.Pre, atom.Code: 294 | ctx.isPre = true 295 | err := ctx.traverseChildren(node) 296 | ctx.isPre = false 297 | return err 298 | 299 | case atom.Style, atom.Script, atom.Head: 300 | // Ignore the subtree. 301 | return nil 302 | 303 | case atom.Noscript: 304 | // Because of bug https://github.com/golang/go/issues/16318 we have to remove the whole content in