├── Conversion_test.go
├── DOC 2 Text.go
├── DOCX 2 Text.go
├── Decompress.go
├── EPUB 2 Text.go
├── HTML 2 Text.go
├── LICENSE
├── MBOX.go
├── MOBI 2 Text.go
├── ODS 2 Text.go
├── ODT 2 Text.go
├── PDF 2 Image.go
├── PDF 2 Text.go
├── PPT 2 Text.go
├── PPTX 2 Text.go
├── Picture.go
├── README.md
├── RTF 2 Text.go
├── XLS 2 Text.go
├── XLSX 2 Text.go
├── ZIP.go
├── html2text
├── README.md
├── html2text.go
├── html2text_test.go
└── testdata
│ ├── utf8.html
│ └── utf8_with_bom.xhtml
├── odf
├── Readme.md
├── meta.go
├── meta_test.go
├── ods
│ ├── ods_test.go
│ ├── read.go
│ └── test.ods
└── read.go
├── ole2
├── README.md
├── dir.go
├── header.go
├── ole.go
├── pss.go
├── sector.go
├── stream_reader.go
└── stream_reader_test.go
└── xls
├── README.md
├── bigtable_test.go
├── bof.go
├── cell_range.go
├── col.go
├── comparexlsxlsx.go
├── date.go
├── doc.go
├── example_test.go
├── font.go
├── format.go
├── issue47_test.go
├── row.go
├── sst.go
├── workbook.go
├── worksheet.go
├── xf.go
├── xls.go
└── xls_test.go
/Conversion_test.go:
--------------------------------------------------------------------------------
1 | /*
2 | File Name: Conversion_test.go
3 | Copyright: 2019 Kleissner Investments s.r.o.
4 | Author: Peter Kleissner
5 | */
6 |
7 | package fileconversion
8 |
9 | import (
10 | "bytes"
11 | "fmt"
12 | "io/ioutil"
13 | "os"
14 | "testing"
15 | )
16 |
17 | func TestXLS(t *testing.T) {
18 | // open local file to extract text and output to command line
19 | file, err := os.Open("test.xls")
20 | if err != nil {
21 | return
22 | }
23 |
24 | defer file.Close()
25 |
26 | XLS2Text(file, os.Stdout, 1*1024*1024)
27 | }
28 |
29 | func TestPPTX(t *testing.T) {
30 | // open local file to extract text and output to command line
31 | file, err := os.Open("test.pptx")
32 | if err != nil {
33 | return
34 | }
35 |
36 | defer file.Close()
37 |
38 | stat, _ := file.Stat()
39 |
40 | text, _ := PPTX2Text(file, stat.Size())
41 | fmt.Print(text)
42 | }
43 |
44 | func TestODS(t *testing.T) {
45 | // open local file to extract text and output to command line
46 | file, err := os.Open("test.ods")
47 | if err != nil {
48 | return
49 | }
50 |
51 | defer file.Close()
52 | stat, _ := file.Stat()
53 |
54 | ODS2Text(file, stat.Size(), os.Stdout, 1*1024*1024)
55 | }
56 |
57 | func TestExcelCell(t *testing.T) {
58 | file1, err := os.Open("test.xls")
59 | if err == nil {
60 | cells, _ := XLS2Cells(file1)
61 | for n, cell := range cells {
62 | fmt.Printf("%s\n", cell)
63 | if n > 20 {
64 | break
65 | }
66 | }
67 |
68 | file1.Close()
69 | }
70 |
71 | file1, err = os.Open("test.xlsx")
72 | if err == nil {
73 | stat, _ := file1.Stat()
74 | cells, _ := XLSX2Cells(file1, stat.Size(), 1000)
75 | for n, cell := range cells {
76 | fmt.Printf("%s\n", cell)
77 | if n > 20 {
78 | break
79 | }
80 | }
81 |
82 | file1.Close()
83 | }
84 |
85 | file1, err = os.Open("test.ods")
86 | if err == nil {
87 | stat, _ := file1.Stat()
88 | cells, _ := ODS2Cells(file1, stat.Size())
89 | for n, cell := range cells {
90 | fmt.Printf("%s\n", cell)
91 | if n > 20 {
92 | break
93 | }
94 | }
95 |
96 | file1.Close()
97 | }
98 |
99 | }
100 |
101 | func TestCSV(t *testing.T) {
102 | file, err := os.Open("test.txt")
103 | if err != nil {
104 | return
105 | }
106 | defer file.Close()
107 |
108 | content, _ := ioutil.ReadAll(file)
109 |
110 | IsCSV(content)
111 | }
112 |
113 | func TestEPUB(t *testing.T) {
114 | // open local file to extract text and output to command line
115 | file, err := os.Open("moby-dick.epub")
116 | if err != nil {
117 | return
118 | }
119 |
120 | defer file.Close()
121 |
122 | stat, _ := file.Stat()
123 |
124 | text, _ := EPUB2Text(file, stat.Size(), 1000)
125 | fmt.Print(text)
126 | }
127 |
128 | func TestMOBI(t *testing.T) {
129 | // open local file to extract text and output to command line
130 | file, err := os.Open("windows-1252.mobi")
131 | if err != nil {
132 | return
133 | }
134 |
135 | defer file.Close()
136 |
137 | text, _ := Mobi2Text(file)
138 | fmt.Print(text)
139 | }
140 |
141 | func TestPDFImage(t *testing.T) {
142 | // open local file to extract images
143 | file, err := os.Open("test.pdf")
144 | if err != nil {
145 | return
146 | }
147 |
148 | defer file.Close()
149 |
150 | images, _ := PDFExtractImages(file)
151 | fmt.Print(len(images))
152 | }
153 |
154 | func TestPD2Text(t *testing.T) {
155 | file, err := os.Open("1.pdf")
156 | if err != nil {
157 | return
158 | }
159 |
160 | defer file.Close()
161 |
162 | buffer := bytes.NewBuffer(make([]byte, 0, 2*1024))
163 | PDFListContentStreams(file, buffer, 2*1024)
164 |
165 | fmt.Println(buffer.String())
166 | }
167 |
168 | func TestODTText(t *testing.T) {
169 | file, err := os.Open("Test\\file-sample_500kB.odt")
170 | if err != nil {
171 | return
172 | }
173 |
174 | defer file.Close()
175 | stat, _ := file.Stat()
176 |
177 | buffer := bytes.NewBuffer(make([]byte, 0, 2*1024))
178 |
179 | ODT2Text(file, stat.Size(), buffer, 2*1024)
180 |
181 | fmt.Println(buffer.String())
182 | }
183 |
184 | // TestXLSX extracts text from an XLSX file.
185 | // Memory usage: 100 rows = 52 MB, 500 rows = 200 MB, 1000 rows = 400 MB, 2000/5000/10000/-1 rows = 700 MB
186 | func TestXLSX(t *testing.T) {
187 | file, err := os.Open("Test\\971bd55b-5cbd-43d2-899e-d4a2a7d0a883.xlsx")
188 | if err != nil {
189 | return
190 | }
191 |
192 | defer file.Close()
193 | stat, _ := file.Stat()
194 |
195 | buffer := bytes.NewBuffer(make([]byte, 0, 2*1024))
196 |
197 | XLSX2Text(file, stat.Size(), buffer, 2*1024, -1)
198 |
199 | fmt.Println(buffer.String())
200 | }
201 |
--------------------------------------------------------------------------------
/DOC 2 Text.go:
--------------------------------------------------------------------------------
1 | /*
2 | File Name: DOC 2 Text.go
3 | Copyright: 2018 Kleissner Investments s.r.o.
4 | Author: Peter Kleissner
5 |
6 | This code is forked from https://github.com/EndFirstCorp/doc2txt and extracts text from DOC files, the legacy binary Word files.
7 | */
8 |
9 | package fileconversion
10 |
11 | import (
12 | "bytes"
13 | "encoding/binary"
14 | "errors"
15 | "io"
16 | "unicode/utf16"
17 | "unicode/utf8"
18 |
19 | "github.com/mattetti/filebuffer"
20 | "github.com/richardlehane/mscfb"
21 | )
22 |
23 | // ---- file doc.go ----
24 | // There were a few changes in this file to actually support Unicode which the old code was not.
25 |
26 | var (
27 | errTable = errors.New("cannot find table stream")
28 | errDocEmpty = errors.New("WordDocument not found")
29 | errDocShort = errors.New("wordDoc block too short")
30 | errInvalidArgument = errors.New("invalid table and/or fib")
31 | )
32 |
33 | type allReader interface {
34 | io.Closer
35 | io.ReaderAt
36 | io.ReadSeeker
37 | }
38 |
39 | func wrapError(e error) error {
40 | return errors.New("Error processing file: " + e.Error())
41 | }
42 |
43 | // DOC2Text converts a standard io.Reader from a Microsoft Word .doc binary file and returns a reader (actually a bytes.Buffer) which will output the plain text found in the .doc file
44 | func DOC2Text(r io.Reader) (io.Reader, error) {
45 | ra, ok := r.(io.ReaderAt)
46 | if !ok {
47 | ra, _, err := toMemoryBuffer(r)
48 | if err != nil {
49 | return nil, wrapError(err)
50 | }
51 | defer ra.Close()
52 | }
53 |
54 | d, err := mscfb.New(ra)
55 | if err != nil {
56 | return nil, wrapError(err)
57 | }
58 |
59 | wordDoc, table0, table1 := getWordDocAndTables(d)
60 | fib, err := getFib(wordDoc)
61 | if err != nil {
62 | return nil, wrapError(err)
63 | }
64 |
65 | table := getActiveTable(table0, table1, fib)
66 | if table == nil {
67 | return nil, wrapError(errTable)
68 | }
69 |
70 | clx, err := getClx(table, fib)
71 | if err != nil {
72 | return nil, wrapError(err)
73 | }
74 |
75 | return getText(wordDoc, clx)
76 | }
77 |
78 | func toMemoryBuffer(r io.Reader) (allReader, int64, error) {
79 | var b bytes.Buffer
80 | size, err := b.ReadFrom(r)
81 | if err != nil {
82 | return nil, 0, err
83 | }
84 | fb := filebuffer.New(b.Bytes())
85 | return fb, size, nil
86 | }
87 |
88 | func getText(wordDoc *mscfb.File, clx *clx) (io.Reader, error) {
89 | var buf bytes.Buffer
90 | for i := 0; i < len(clx.pcdt.PlcPcd.aPcd); i++ {
91 | pcd := clx.pcdt.PlcPcd.aPcd[i]
92 | cp := clx.pcdt.PlcPcd.aCP[i]
93 | cpNext := clx.pcdt.PlcPcd.aCP[i+1]
94 |
95 | var start, end int
96 | // https://msdn.microsoft.com/ko-kr/library/office/gg615596(v=office.14).aspx
97 | // Read the value of the Pcd.Fc.fCompressed field at bit 46 of the current Pcd structure. If 0, the Pcd structure refers to a 16-bit Unicode character. If 1, it refers to an 8-bit ANSI character.
98 | if pcd.fc.fCompressed {
99 | start = pcd.fc.fc / 2
100 | end = start + cpNext - cp
101 | } else {
102 | // -> 16-bit Unicode characters
103 | start = pcd.fc.fc
104 | end = start + 2*(cpNext-cp)
105 | }
106 |
107 | b := make([]byte, end-start)
108 | _, err := wordDoc.ReadAt(b, int64(start)) // read all the characters
109 | if err != nil {
110 | return nil, err
111 | }
112 | translateText(b, &buf, pcd.fc.fCompressed)
113 | }
114 | return &buf, nil
115 | }
116 |
117 | // translateText translates the buffer into text. fCompressed = 0 for 16-bit Unicode, 1 = 8-bit ANSI characters.
118 | func translateText(b []byte, buf *bytes.Buffer, fCompressed bool) {
119 | u16s := make([]uint16, 1)
120 | b8buf := make([]byte, 4)
121 |
122 | fieldLevel := 0
123 | var isFieldChar bool
124 | for cIndex := range b {
125 | // Convert to rune
126 | var char rune
127 | if fCompressed {
128 | // ANSI, 1 byte
129 | char = rune(b[cIndex])
130 | } else {
131 | // 16-bit Unicode: skip every second byte
132 | if cIndex%2 != 0 {
133 | continue
134 | } else if (cIndex + 1) >= len(b) { // make sure there are at least 2 bytes for Unicode decoding
135 | continue
136 | }
137 |
138 | // convert from UTF16 to UTF8
139 | u16s[0] = uint16(b[cIndex]) + (uint16(b[cIndex+1]) << 8)
140 | r := utf16.Decode(u16s)
141 | if len(r) != 1 {
142 | //fmt.Printf("Invalid rune %v\n", r)
143 | continue
144 | }
145 | char = r[0]
146 | }
147 |
148 | // Handle special field characters (section 2.8.25)
149 | if char == 0x13 {
150 | isFieldChar = true
151 | fieldLevel++
152 | continue
153 | } else if char == 0x14 {
154 | isFieldChar = false
155 | continue
156 | } else if char == 0x15 {
157 | isFieldChar = false
158 | continue
159 | } else if isFieldChar {
160 | continue
161 | }
162 |
163 | if char == 7 { // table column separator
164 | buf.WriteByte(' ')
165 | continue
166 | } else if char < 32 && char != 9 && char != 10 && char != 13 { // skip non-printable ASCII characters
167 | //buf.Write([]byte(fmt.Sprintf("|%#x|", char)))
168 | continue
169 | }
170 |
171 | if fCompressed { // compressed, so replace compressed characters
172 | buf.Write(replaceCompressed(byte(char)))
173 | } else {
174 | // encode the rune to UTF-8
175 | n := utf8.EncodeRune(b8buf, char)
176 | buf.Write(b8buf[:n])
177 | }
178 | }
179 | }
180 |
181 | func replaceCompressed(char byte) []byte {
182 | var v uint16
183 | switch char {
184 | case 0x82:
185 | v = 0x201A
186 | case 0x83:
187 | v = 0x0192
188 | case 0x84:
189 | v = 0x201E
190 | case 0x85:
191 | v = 0x2026
192 | case 0x86:
193 | v = 0x2020
194 | case 0x87:
195 | v = 0x2021
196 | case 0x88:
197 | v = 0x02C6
198 | case 0x89:
199 | v = 0x2030
200 | case 0x8A:
201 | v = 0x0160
202 | case 0x8B:
203 | v = 0x2039
204 | case 0x8C:
205 | v = 0x0152
206 | case 0x91:
207 | v = 0x2018
208 | case 0x92:
209 | v = 0x2019
210 | case 0x93:
211 | v = 0x201C
212 | case 0x94:
213 | v = 0x201D
214 | case 0x95:
215 | v = 0x2022
216 | case 0x96:
217 | v = 0x2013
218 | case 0x97:
219 | v = 0x2014
220 | case 0x98:
221 | v = 0x02DC
222 | case 0x99:
223 | v = 0x2122
224 | case 0x9A:
225 | v = 0x0161
226 | case 0x9B:
227 | v = 0x203A
228 | case 0x9C:
229 | v = 0x0153
230 | case 0x9F:
231 | v = 0x0178
232 | default:
233 | return []byte{char}
234 | }
235 | out := make([]byte, 2)
236 | binary.LittleEndian.PutUint16(out, v)
237 | return out
238 | }
239 |
240 | func getWordDocAndTables(r *mscfb.Reader) (*mscfb.File, *mscfb.File, *mscfb.File) {
241 | var wordDoc, table0, table1 *mscfb.File
242 | for i := 0; i < len(r.File); i++ {
243 | stream := r.File[i]
244 |
245 | switch stream.Name {
246 | case "WordDocument":
247 | wordDoc = stream
248 | case "0Table":
249 | table0 = stream
250 | case "1Table":
251 | table1 = stream
252 | }
253 | }
254 | return wordDoc, table0, table1
255 | }
256 |
257 | func getActiveTable(table0 *mscfb.File, table1 *mscfb.File, f *fib) *mscfb.File {
258 | if f.base.fWhichTblStm == 0 {
259 | return table0
260 | }
261 | return table1
262 | }
263 |
264 | // ---- file fib.go ----
265 |
266 | var (
267 | errFibInvalid = errors.New("file information block validation failed")
268 | )
269 |
270 | type fib struct {
271 | base fibBase
272 | csw int
273 | fibRgW fibRgW
274 | cslw int
275 | fibRgLw fibRgLw
276 | cbRgFcLcb int
277 | fibRgFcLcb fibRgFcLcb
278 | }
279 |
280 | type fibBase struct {
281 | fWhichTblStm int
282 | }
283 |
284 | type fibRgW struct {
285 | }
286 |
287 | type fibRgLw struct {
288 | ccpText int
289 | ccpFtn int
290 | ccpHdd int
291 | ccpMcr int
292 | ccpAtn int
293 | ccpEdn int
294 | ccpTxbx int
295 | ccpHdrTxbx int
296 | cpLength int
297 | }
298 |
299 | type fibRgFcLcb struct {
300 | fcPlcfFldMom int
301 | lcbPlcfFldMom int
302 | fcPlcfFldHdr int
303 | lcbPlcfFldHdr int
304 | fcPlcfFldFtn int
305 | lcbPlcfFldFtn int
306 | fcPlcfFldAtn int
307 | lcbPlcfFldAtn int
308 | fcClx int
309 | lcbClx int
310 | }
311 |
312 | // parse File Information Block (section 2.5.1)
313 | func getFib(wordDoc *mscfb.File) (*fib, error) {
314 | if wordDoc == nil {
315 | return nil, errDocEmpty
316 | }
317 |
318 | b := make([]byte, 898) // get FIB block up to FibRgFcLcb97
319 | _, err := wordDoc.ReadAt(b, 0)
320 | if err != nil {
321 | return nil, err
322 | }
323 |
324 | fibBase := getFibBase(b[0:32])
325 |
326 | fibRgW, csw, err := getFibRgW(b, 32)
327 | if err != nil {
328 | return nil, err
329 | }
330 |
331 | fibRgLw, cslw, err := getFibRgLw(b, 34+csw)
332 | if err != nil {
333 | return nil, err
334 | }
335 |
336 | fibRgFcLcb, cbRgFcLcb, err := getFibRgFcLcb(b, 34+csw+2+cslw)
337 |
338 | return &fib{base: *fibBase, csw: csw, cslw: cslw, fibRgW: *fibRgW, fibRgLw: *fibRgLw, fibRgFcLcb: *fibRgFcLcb, cbRgFcLcb: cbRgFcLcb}, err
339 | }
340 |
341 | // parse FibBase (section 2.5.2)
342 | func getFibBase(fib []byte) *fibBase {
343 | byt := fib[11] // fWhichTblStm is 2nd highest bit in this byte
344 | fWhichTblStm := int(byt >> 1 & 1) // set which table (0Table or 1Table) is the table stream
345 | return &fibBase{fWhichTblStm: fWhichTblStm}
346 | }
347 |
348 | func getFibRgW(fib []byte, start int) (*fibRgW, int, error) {
349 | if start+2 >= len(fib) { // must be big enough for csw
350 | return &fibRgW{}, 0, errFibInvalid
351 | }
352 |
353 | csw := int(binary.LittleEndian.Uint16(fib[start:start+2])) * 2 // in bytes
354 | return &fibRgW{}, csw, nil
355 | }
356 |
357 | // parse FibRgLw (section 2.5.4)
358 | func getFibRgLw(fib []byte, start int) (*fibRgLw, int, error) {
359 | fibRgLwStart := start + 2 // skip cslw
360 | if fibRgLwStart+88 >= len(fib) { // expect 88 bytes in fibRgLw
361 | return &fibRgLw{}, 0, errFibInvalid
362 | }
363 |
364 | cslw := getInt16(fib, start) * 4 // in bytes
365 | ccpText := getInt(fib, fibRgLwStart+3*4)
366 | ccpFtn := getInt(fib, fibRgLwStart+4*4)
367 | ccpHdd := getInt(fib, fibRgLwStart+5*4)
368 | ccpMcr := getInt(fib, fibRgLwStart+6*4)
369 | ccpAtn := getInt(fib, fibRgLwStart+7*4)
370 | ccpEdn := getInt(fib, fibRgLwStart+8*4)
371 | ccpTxbx := getInt(fib, fibRgLwStart+9*4)
372 | ccpHdrTxbx := getInt(fib, fibRgLwStart+10*4)
373 |
374 | // calculate cpLength. Used in PlcPcd verification (see section 2.8.35)
375 | var cpLength int
376 | if ccpFtn != 0 || ccpHdd != 0 || ccpMcr != 0 || ccpAtn != 0 || ccpEdn != 0 || ccpTxbx != 0 || ccpHdrTxbx != 0 {
377 | cpLength = ccpFtn + ccpHdd + ccpMcr + ccpAtn + ccpEdn + ccpTxbx + ccpHdrTxbx + ccpText + 1
378 | } else {
379 | cpLength = ccpText
380 | }
381 | return &fibRgLw{ccpText: ccpText, ccpFtn: ccpFtn, ccpHdd: ccpHdd, ccpMcr: ccpMcr, ccpAtn: ccpAtn,
382 | ccpEdn: ccpEdn, ccpTxbx: ccpTxbx, ccpHdrTxbx: ccpHdrTxbx, cpLength: cpLength}, cslw, nil
383 | }
384 |
385 | // parse FibRgFcLcb (section 2.5.5)
386 | func getFibRgFcLcb(fib []byte, start int) (*fibRgFcLcb, int, error) {
387 | fibRgFcLcbStart := start + 2 // skip cbRgFcLcb
388 | if fibRgFcLcbStart+186*4 < len(fib) { // expect 186+ values in FibRgFcLcb
389 | return &fibRgFcLcb{}, 0, errFibInvalid
390 | }
391 |
392 | cbRgFcLcb := getInt16(fib, start)
393 | fcPlcfFldMom := getInt(fib, fibRgFcLcbStart+32*4)
394 | lcbPlcfFldMom := getInt(fib, fibRgFcLcbStart+33*4)
395 | fcPlcfFldHdr := getInt(fib, fibRgFcLcbStart+34*4)
396 | lcbPlcfFldHdr := getInt(fib, fibRgFcLcbStart+35*4)
397 | fcPlcfFldFtn := getInt(fib, fibRgFcLcbStart+36*4)
398 | lcbPlcfFldFtn := getInt(fib, fibRgFcLcbStart+37*4)
399 | fcPlcfFldAtn := getInt(fib, fibRgFcLcbStart+38*4)
400 | lcbPlcfFldAtn := getInt(fib, fibRgFcLcbStart+39*4)
401 | fcClx := getInt(fib, fibRgFcLcbStart+66*4)
402 | lcbClx := getInt(fib, fibRgFcLcbStart+67*4)
403 | return &fibRgFcLcb{fcPlcfFldMom: fcPlcfFldMom, lcbPlcfFldMom: lcbPlcfFldMom, fcPlcfFldHdr: fcPlcfFldHdr, lcbPlcfFldHdr: lcbPlcfFldHdr,
404 | fcPlcfFldFtn: fcPlcfFldFtn, lcbPlcfFldFtn: lcbPlcfFldFtn, fcPlcfFldAtn: fcPlcfFldAtn, lcbPlcfFldAtn: lcbPlcfFldAtn,
405 | fcClx: fcClx, lcbClx: lcbClx}, cbRgFcLcb, nil
406 | }
407 |
408 | func getInt16(buf []byte, start int) int {
409 | return int(binary.LittleEndian.Uint16(buf[start : start+2]))
410 | }
411 | func getInt(buf []byte, start int) int {
412 | return int(binary.LittleEndian.Uint32(buf[start : start+4]))
413 | }
414 |
415 | // ---- file clx.go ----
416 |
417 | var (
418 | errInvalidPrc = errors.New("Invalid Prc structure")
419 | errInvalidClx = errors.New("expected last aCP value to equal fib.cpLength (2.8.35)")
420 | errInvalidPcdt = errors.New("expected clxt to be equal 0x02")
421 | )
422 |
423 | type clx struct {
424 | pcdt pcdt
425 | }
426 |
427 | type pcdt struct {
428 | lcb int
429 | PlcPcd plcPcd
430 | }
431 |
432 | type plcPcd struct {
433 | aCP []int
434 | aPcd []pcd
435 | }
436 |
437 | type pcd struct {
438 | fc fcCompressed
439 | }
440 |
441 | type fcCompressed struct {
442 | fc int
443 | fCompressed bool
444 | }
445 |
446 | // read Clx (section 2.9.38)
447 | func getClx(table *mscfb.File, fib *fib) (*clx, error) {
448 | if table == nil || fib == nil {
449 | return nil, errInvalidArgument
450 | }
451 | b, err := readClx(table, fib)
452 | if err != nil {
453 | return nil, err
454 | }
455 |
456 | pcdtOffset, err := getPrcArrayEnd(b)
457 | if err != nil {
458 | return nil, err
459 | }
460 |
461 | pcdt, err := getPcdt(b, pcdtOffset)
462 | if err != nil {
463 | return nil, err
464 | }
465 |
466 | if pcdt.PlcPcd.aCP[len(pcdt.PlcPcd.aCP)-1] != fib.fibRgLw.cpLength {
467 | return nil, errInvalidClx
468 | }
469 |
470 | return &clx{pcdt: *pcdt}, nil
471 | }
472 |
473 | func readClx(table *mscfb.File, fib *fib) ([]byte, error) {
474 | b := make([]byte, fib.fibRgFcLcb.lcbClx)
475 | _, err := table.ReadAt(b, int64(fib.fibRgFcLcb.fcClx))
476 | if err != nil {
477 | return nil, err
478 | }
479 | return b, nil
480 | }
481 |
482 | // read Pcdt from Clx (section 2.9.178)
483 | func getPcdt(clx []byte, pcdtOffset int) (*pcdt, error) {
484 | const pcdSize = 8
485 | if pcdtOffset < 0 || pcdtOffset+5 >= len(clx) {
486 | return nil, errInvalidPcdt
487 | }
488 | if clx[pcdtOffset] != 0x02 { // clxt must be 0x02 or invalid
489 | return nil, errInvalidPcdt
490 | }
491 | lcb := int(binary.LittleEndian.Uint32(clx[pcdtOffset+1 : pcdtOffset+5])) // skip clxt, get lcb
492 | plcPcdOffset := pcdtOffset + 5 // skip clxt and lcb
493 | numPcds := (lcb - 4) / (4 + pcdSize) // see 2.2.2 in the spec for equation
494 | numCps := numPcds + 1 // always 1 more cp than pcds
495 |
496 | cps := make([]int, numCps)
497 | for i := 0; i < numCps; i++ {
498 | cpOffset := plcPcdOffset + i*4
499 | if cpOffset < 0 || cpOffset+4 >= len(clx) {
500 | return nil, errInvalidPcdt
501 | }
502 | cps[i] = int(binary.LittleEndian.Uint32(clx[cpOffset : cpOffset+4]))
503 | }
504 |
505 | pcdStart := plcPcdOffset + 4*numCps
506 | pcds := make([]pcd, numPcds)
507 | for i := 0; i < numPcds; i++ {
508 | pcdOffset := pcdStart + i*pcdSize
509 | if pcdOffset < 0 || pcdOffset+pcdSize >= len(clx) {
510 | return nil, errInvalidPcdt
511 | }
512 | pcds[i] = *parsePcd(clx[pcdOffset : pcdOffset+pcdSize])
513 | }
514 | return &pcdt{lcb: lcb, PlcPcd: plcPcd{aCP: cps, aPcd: pcds}}, nil
515 | }
516 |
517 | // find end of RgPrc array (section 2.9.38)
518 | func getPrcArrayEnd(clx []byte) (int, error) {
519 | prcOffset := 0
520 | count := 0
521 | for {
522 | clxt := clx[prcOffset]
523 | if clxt != 0x01 { // this is not a Prc, so exit
524 | return prcOffset, nil
525 | }
526 | prcDataCbGrpprl := binary.LittleEndian.Uint16(clx[prcOffset+1 : prcOffset+3]) // skip the clxt and read 2 bytes
527 | prcOffset += 1 + 2 + int(prcDataCbGrpprl) // skip clxt, cbGrpprl, and GrpPrl
528 |
529 | if count > 10000 || prcDataCbGrpprl <= 0 || prcOffset+3 > len(clx) { // ensure no infinite loop
530 | return 0, errInvalidPrc
531 | }
532 | count++
533 | }
534 | }
535 |
536 | // parse Pcd (section 2.9.177)
537 | func parsePcd(pcdData []byte) *pcd {
538 | return &pcd{fc: *parseFcCompressed(pcdData[2:6])}
539 | }
540 |
541 | // parse FcCompressed (section 2.9.73)
542 | func parseFcCompressed(fcData []byte) *fcCompressed {
543 | fCompressed := fcData[3]&64 == 64 // check fcompressed value (second bit from lestmost of the last byte in fcdata)
544 | fcData[3] = fcData[3] & 63 // clear the fcompressed value from data
545 | fc := binary.LittleEndian.Uint32(fcData) // word doc generally uses little endian order (1.3.7)
546 | return &fcCompressed{fc: int(fc), fCompressed: fCompressed}
547 | }
548 |
549 | // IsFileDOC checks if the data indicates a DOC file
550 | // DOC has multiple signature according to https://filesignatures.net/index.php?search=doc&mode=EXT, D0 CF 11 E0 A1 B1 1A E1
551 | func IsFileDOC(data []byte) bool {
552 | return bytes.HasPrefix(data, []byte{0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1})
553 | }
554 |
--------------------------------------------------------------------------------
/DOCX 2 Text.go:
--------------------------------------------------------------------------------
1 | /*
2 | File Name: DOCX 2 Text.go
3 | Copyright: 2018 Kleissner Investments s.r.o.
4 | Author: Peter Kleissner
5 |
6 | This code is forked from https://github.com/guylaor/goword and extracts text from DOCX files.
7 | */
8 |
9 | package fileconversion
10 |
11 | import (
12 | "archive/zip"
13 | "bytes"
14 | "encoding/xml"
15 | "fmt"
16 | "io"
17 | "io/ioutil"
18 | "strings"
19 | )
20 |
21 | // models.go
22 |
23 | // WordDocument is a full word doc
24 | type WordDocument struct {
25 | Paragraphs []WordParagraph
26 | }
27 |
28 | // WordParagraph is a single paragraph
29 | type WordParagraph struct {
30 | Style WordStyle `xml:"pPr>pStyle"`
31 | Rows []WordRow `xml:"r"`
32 | }
33 |
34 | // WordStyle ...
35 | type WordStyle struct {
36 | Val string `xml:"val,attr"`
37 | }
38 |
39 | // WordRow ...
40 | type WordRow struct {
41 | Text string `xml:"t"`
42 | }
43 |
44 | // AsText returns all text in the document
45 | func (w WordDocument) AsText() string {
46 | text := ""
47 | for _, v := range w.Paragraphs {
48 | for _, rv := range v.Rows {
49 | text += rv.Text
50 | }
51 | text += "\n"
52 | }
53 | return text
54 | }
55 |
56 | // goword.go
57 |
58 | // DOCX2Text extracts text of a Word document
59 | // Size is the full size of the input file.
60 | func DOCX2Text(file io.ReaderAt, size int64) (string, error) {
61 |
62 | doc, err := openWordFile(file, size)
63 | if err != nil {
64 | return "", err
65 | }
66 |
67 | docx, err := WordParse(doc)
68 | if err != nil {
69 | return "", err
70 | }
71 |
72 | return docx.AsText(), nil
73 | }
74 |
75 | // WordParse parses a word file
76 | func WordParse(doc string) (WordDocument, error) {
77 |
78 | docx := WordDocument{}
79 | r := strings.NewReader(string(doc))
80 | decoder := xml.NewDecoder(r)
81 |
82 | for {
83 | t, _ := decoder.Token()
84 | if t == nil {
85 | break
86 | }
87 | switch se := t.(type) {
88 | case xml.StartElement:
89 | if se.Name.Local == "p" {
90 | var p WordParagraph
91 | decoder.DecodeElement(&p, &se)
92 | docx.Paragraphs = append(docx.Paragraphs, p)
93 | }
94 | }
95 | }
96 | return docx, nil
97 | }
98 |
99 | func openWordFile(file io.ReaderAt, size int64) (string, error) {
100 |
101 | // Open a zip archive for reading. word files are zip archives
102 | r, err := zip.NewReader(file, size)
103 | if err != nil {
104 | return "", err
105 | }
106 |
107 | // Iterate through the files in the archive,
108 | // find document.xml
109 | for _, f := range r.File {
110 |
111 | //fmt.Printf("Contents of %s:\n", f.Name)
112 | rc, err := f.Open()
113 | if err != nil {
114 | return "", err
115 | }
116 | defer rc.Close()
117 | if f.Name == "word/document.xml" {
118 | doc, err := ioutil.ReadAll(rc)
119 | if err != nil {
120 | return "", err
121 | }
122 | return fmt.Sprintf("%s", doc), nil
123 | }
124 | }
125 |
126 | return "", nil
127 | }
128 |
129 | // IsFileDOCX checks if the data indicates a DOCX file
130 | // DOCX has a signature of 50 4B 03 04
131 | func IsFileDOCX(data []byte) bool {
132 | return bytes.HasPrefix(data, []byte{0x50, 0x4B, 0x03, 0x04})
133 | }
134 |
--------------------------------------------------------------------------------
/Decompress.go:
--------------------------------------------------------------------------------
1 | /*
2 | File Name: Decompress.go
3 | Copyright: 2019 Kleissner Investments s.r.o.
4 | Author: Peter Kleissner
5 | */
6 |
7 | package fileconversion
8 |
9 | import (
10 | "archive/tar"
11 | "archive/zip"
12 | "bytes"
13 | "compress/bzip2"
14 | "compress/gzip"
15 | "io"
16 | "io/ioutil"
17 | "time"
18 |
19 | "github.com/nwaples/rardecode"
20 | "github.com/saracen/go7z"
21 | "github.com/ulikunitz/xz"
22 | )
23 |
24 | // DecompressFile decompresses data. It supports: GZ, BZ, BZ2, XZ
25 | func DecompressFile(data []byte) (decompressed []byte, valid bool) {
26 | // Try GZ
27 | if gr, err := gzip.NewReader(bytes.NewBuffer(data)); err == nil {
28 | defer gr.Close()
29 | decompressed, err = ioutil.ReadAll(gr)
30 | if err == nil {
31 | return decompressed, true
32 | }
33 | }
34 |
35 | // BZ, BZ2
36 | br := bzip2.NewReader(bytes.NewBuffer(data))
37 | decompressed, err := ioutil.ReadAll(br)
38 | if err == nil {
39 | return decompressed, true
40 | }
41 |
42 | // XZ
43 | if xr, err := xz.NewReader(bytes.NewBuffer(data)); err == nil {
44 | decompressed, err = ioutil.ReadAll(xr)
45 | if err == nil {
46 | return decompressed, true
47 | }
48 | }
49 |
50 | return nil, false
51 | }
52 |
// ContainerExtractFiles extracts files from supported containers: ZIP, RAR, 7Z, TAR
// For every extracted file the callback receives its name, size, a date
// (modification or creation time, depending on the format) and the full
// content. Unreadable or encrypted entries are silently skipped.
func ContainerExtractFiles(data []byte, callback func(name string, size int64, date time.Time, data []byte)) {

	// ZIP — if the data parses as a ZIP archive, extract it and return;
	// the remaining formats are not tried.
	if r, err := zip.NewReader(bytes.NewReader(data), int64(len(data))); err == nil {
		for _, f := range r.File {
			fileReader, err := f.Open()
			if err != nil {
				continue
			}

			data2, err := ioutil.ReadAll(fileReader)
			fileReader.Close()
			if err != nil {
				// If the file is encrypted with a password, this fails with error "4" here.
				continue
			}

			callback(f.Name, int64(f.UncompressedSize64), f.Modified, data2)
		}

		return
	}

	// RAR — directories are skipped; read errors end the iteration.
	if rc, err := rardecode.NewReader(bytes.NewReader(data), ""); err == nil {
		for {
			hdr, err := rc.Next()
			if err == io.EOF || err != nil { // break if end of archive or other error returned
				break
			} else if err == nil && !hdr.IsDir {
				if data2, err := ioutil.ReadAll(rc); err == nil {
					callback(hdr.Name, hdr.UnPackedSize, hdr.CreationTime, data2)
				}
			}
		}
	}

	// 7Z — empty-file entries are skipped. The reported size is the number
	// of bytes actually decompressed.
	if sz, err := go7z.NewReader(bytes.NewReader(data), int64(len(data))); err == nil {
		for {
			hdr, err := sz.Next()
			if err == io.EOF || err != nil { // break if end of archive or other error returned
				break // End of archive
			} else if err == nil && !hdr.IsEmptyFile {
				if data2, err := ioutil.ReadAll(sz); err == nil {
					callback(hdr.Name, int64(len(data2)), hdr.CreatedAt, data2)
				}
			}
		}
	} else if err == go7z.ErrDecompressorNotFound {
		// May happen if it's 7Z, but decompressor not available (like 7zAES).
		// In that case do not fall through to TAR: the data IS a 7Z archive.
		return
	}

	// TAR — tried last. tar.NewReader performs no magic-number check, so
	// non-TAR data only fails once tr.Next() is called.
	tr := tar.NewReader(bytes.NewReader(data))
	// Iterate through the files in the archive.
	for {
		hdr, err := tr.Next()
		if err == io.EOF {
			// end of tar archive
			break
		}
		if err != nil {
			// other error
			break
		}
		switch hdr.Typeflag {
		case tar.TypeDir:
			// directories are ignored
		case tar.TypeReg, tar.TypeRegA:
			// file
			data2, err := ioutil.ReadAll(tr)
			if err != nil {
				continue
			}

			callback(hdr.Name, hdr.Size, hdr.ModTime, data2)
		}
	}

}
136 |
--------------------------------------------------------------------------------
/EPUB 2 Text.go:
--------------------------------------------------------------------------------
1 | /*
2 | File Name: EPUB 2 Text.go
3 | Copyright: 2019 Kleissner Investments s.r.o.
4 | Author: Peter Kleissner
5 |
6 | EPUB files are ZIP based and contain the content as HTML files.
7 |
8 | Tested but did not work:
9 | * https://github.com/n3integration/epub could not read 2 sample files. Also no NewReader function available.
10 |
11 | This one was tested and works:
12 | * https://github.com/taylorskalyo/goreader/tree/master/epub
13 |
14 | Sample files via https://github.com/IDPF/epub3-samples/releases.
15 | */
16 |
17 | package fileconversion
18 |
19 | import (
20 | "io"
21 |
22 | "github.com/taylorskalyo/goreader/epub"
23 | )
24 |
25 | // EPUB2Text converts an EPUB ebook to text
26 | func EPUB2Text(file io.ReaderAt, size int64, limit int64) (string, error) {
27 | text := ""
28 |
29 | rc, err := epub.NewReader(file, size)
30 | if err != nil {
31 | return "", nil
32 | }
33 |
34 | // The rootfile (content.opf) lists all of the contents of an epub file.
35 | // There may be multiple rootfiles, although typically there is only one.
36 | book := rc.Rootfiles[0]
37 |
38 | // Print book title.
39 | title := "Title: " + book.Title + "\n\n"
40 | limit -= int64(len(title))
41 | if limit <= 0 {
42 | return title, nil
43 | }
44 |
45 | // List the IDs of files in the book's spine.
46 | for _, item := range book.Spine.Itemrefs {
47 | // item.ID was observed to be in one book: cover,titlepage,brief-toc,xpreface_001,xintroduction_001,xepigraph_001,xchapter_001
48 | reader2, err := item.Open()
49 | if err != nil {
50 | continue
51 | }
52 |
53 | itemText, _ := HTML2Text(reader2)
54 |
55 | // check max length
56 | if limit <= int64(len(itemText)) {
57 | itemText = itemText[:limit]
58 | return title + text, nil
59 | }
60 |
61 | text += itemText
62 | limit -= int64(len(itemText))
63 | }
64 |
65 | if text == "" {
66 | return "", nil
67 | }
68 |
69 | return title + text, nil
70 | }
71 |
--------------------------------------------------------------------------------
/HTML 2 Text.go:
--------------------------------------------------------------------------------
1 | /*
2 | File Name: HTML 2 Text.go
3 | Copyright: 2018 Kleissner Investments s.r.o.
4 | Author: Peter Kleissner
5 | */
6 |
7 | package fileconversion
8 |
9 | import (
10 | "io"
11 | "net/url"
12 | "path"
13 | "strings"
14 |
15 | "github.com/IntelligenceX/fileconversion/html2text"
16 | "github.com/PuerkitoBio/goquery"
17 | "github.com/ssor/bom"
18 | "golang.org/x/net/html"
19 | "golang.org/x/net/html/charset"
20 | )
21 |
22 | // HTML2Text extracts the text from the HTML
23 | func HTML2Text(reader io.Reader) (pageText string, err error) {
24 | // The charset.NewReader ensures that foreign encodings are properly decoded to UTF-8.
25 | // It will make both heuristic checks as well as look for the HTML meta charset tag.
26 | reader, err = charset.NewReader(reader, "")
27 | if err != nil {
28 | return "", err
29 | }
30 |
31 | // The html2text is a forked improved version that converts HTML to human-friendly text.
32 | return html2text.FromReader(reader)
33 | }
34 |
35 | // HTML2TextAndLinks extracts the text from the HTML and all links from and
tags of a HTML
36 | // If the base URL is provided, relative links will be converted to absolute ones.
37 | func HTML2TextAndLinks(reader io.Reader, baseURL string) (pageText string, links []string, err error) {
38 | // The charset.NewReader ensures that foreign encodings are properly decoded to UTF-8.
39 | // It will make both heuristic checks as well as look for the HTML meta charset tag.
40 | reader, err = charset.NewReader(reader, "")
41 | if err != nil {
42 | return "", nil, err
43 | }
44 |
45 | // code from html2text.FromReader to parse the doc
46 | newReader, err := bom.NewReaderWithoutBom(reader)
47 | if err != nil {
48 | return "", nil, err
49 | }
50 | doc, err := html.Parse(newReader)
51 | if err != nil {
52 | return "", nil, err
53 | }
54 |
55 | // get the text
56 | pageText, err = html2text.FromHTMLNode(doc)
57 | if err != nil {
58 | return pageText, nil, err
59 | }
60 |
61 | // get the links
62 | docQ := goquery.NewDocumentFromNode(doc)
63 | docQ.Url, _ = url.Parse(baseURL)
64 | links = processLinks(docQ)
65 |
66 | return pageText, links, err
67 | }
68 |
69 | // ---- below 2 functions are forks from gocrawl/worker.go ----
70 |
71 | func handleBaseTag(root *url.URL, baseHref string, aHref string) string {
72 | resolvedBase, err := root.Parse(baseHref)
73 | if err != nil {
74 | return ""
75 | }
76 |
77 | parsedURL, err := url.Parse(aHref)
78 | if err != nil {
79 | return ""
80 | }
81 | // If a[href] starts with a /, it overrides the base[href]
82 | if parsedURL.Host == "" && !strings.HasPrefix(aHref, "/") {
83 | aHref = path.Join(resolvedBase.Path, aHref)
84 | }
85 |
86 | resolvedURL, err := resolvedBase.Parse(aHref)
87 | if err != nil {
88 | return ""
89 | }
90 | return resolvedURL.String()
91 | }
92 |
93 | // Scrape the document's content to gather all links
94 | func processLinks(doc *goquery.Document) (result []string) {
95 | // process links via tags
96 | baseURL, _ := doc.Find("base[href]").Attr("href")
97 | urls := doc.Find("a[href]").Map(func(_ int, s *goquery.Selection) string {
98 | val, _ := s.Attr("href")
99 | if baseURL != "" {
100 | val = handleBaseTag(doc.Url, baseURL, val)
101 | }
102 | return val
103 | })
104 |
105 | // all image references via
tag
106 | imgURLs := doc.Find("img[src]").Map(func(_ int, s *goquery.Selection) string {
107 | val, _ := s.Attr("src")
108 | if baseURL != "" {
109 | val = handleBaseTag(doc.Url, baseURL, val)
110 | }
111 | return val
112 | })
113 | urls = append(urls, imgURLs...)
114 |
115 | // form submission links
63 |
64 |
65 |
66 | Header 1 | Header 2 |
67 |
68 |
69 | Footer 1 | Footer 2 |
70 |
71 |
72 | Row 1 Col 1 | Row 1 Col 2 |
73 | Row 2 Col 1 | Row 2 Col 2 |
74 |
75 |
76 |
77 | `
78 |
79 | text, err := html2text.FromString(inputHTML, html2text.Options{PrettyTables: true})
80 | if err != nil {
81 | panic(err)
82 | }
83 | fmt.Println(text)
84 | }
85 | ```
86 |
87 | Output:
88 | ```
89 | Mega Service ( http://jaytaylor.com/ )
90 |
91 | ******************************************
92 | Welcome to your new account on my service!
93 | ******************************************
94 |
95 | Here is some more information:
96 |
97 | * Link 1: Example.com ( https://example.com )
98 | * Link 2: Example2.com ( https://example2.com )
99 | * Something else
100 |
101 | +-------------+-------------+
102 | | HEADER 1 | HEADER 2 |
103 | +-------------+-------------+
104 | | Row 1 Col 1 | Row 1 Col 2 |
105 | | Row 2 Col 1 | Row 2 Col 2 |
106 | +-------------+-------------+
107 | | FOOTER 1 | FOOTER 2 |
108 | +-------------+-------------+
109 | ```
110 |
111 |
112 | ## Unit-tests
113 |
114 | Running the unit-tests is straightforward and standard:
115 |
116 | ```bash
117 | go test
118 | ```
119 |
120 |
--------------------------------------------------------------------------------
/html2text/html2text.go:
--------------------------------------------------------------------------------
1 | package html2text
2 |
3 | import (
4 | "bytes"
5 | "io"
6 | "regexp"
7 | "strings"
8 | "unicode"
9 |
10 | "github.com/olekukonko/tablewriter"
11 | "github.com/ssor/bom"
12 | "golang.org/x/net/html"
13 | "golang.org/x/net/html/atom"
14 | )
15 |
// Options provide toggles and overrides to control specific rendering behaviors.
type Options struct {
	PrettyTables        bool                 // Turns on pretty ASCII rendering for table elements.
	PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements; nil means defaults.
	OmitLinks           bool                 // Turns on omitting link URLs next to link text in the output.
}
22 |
// PrettyTablesOptions overrides tablewriter behaviors.
// Field names mirror the corresponding tablewriter configuration setters
// (e.g. AutoFormatHeader -> SetAutoFormatHeaders) — see NewPrettyTablesOptions
// for the defaults applied when the caller supplies none.
type PrettyTablesOptions struct {
	AutoFormatHeader     bool
	AutoWrapText         bool
	ReflowDuringAutoWrap bool
	ColWidth             int
	ColumnSeparator      string
	RowSeparator         string
	CenterSeparator      string
	HeaderAlignment      int
	FooterAlignment      int
	Alignment            int
	ColumnAlignment      []int
	NewLine              string
	HeaderLine           bool
	RowLine              bool
	AutoMergeCells       bool
	Borders              tablewriter.Border
}
42 |
43 | // NewPrettyTablesOptions creates PrettyTablesOptions with default settings
44 | func NewPrettyTablesOptions() *PrettyTablesOptions {
45 | return &PrettyTablesOptions{
46 | AutoFormatHeader: true,
47 | AutoWrapText: true,
48 | ReflowDuringAutoWrap: true,
49 | ColWidth: tablewriter.MAX_ROW_WIDTH,
50 | ColumnSeparator: tablewriter.COLUMN,
51 | RowSeparator: tablewriter.ROW,
52 | CenterSeparator: tablewriter.CENTER,
53 | HeaderAlignment: tablewriter.ALIGN_DEFAULT,
54 | FooterAlignment: tablewriter.ALIGN_DEFAULT,
55 | Alignment: tablewriter.ALIGN_DEFAULT,
56 | ColumnAlignment: []int{},
57 | NewLine: tablewriter.NEWLINE,
58 | HeaderLine: true,
59 | RowLine: false,
60 | AutoMergeCells: false,
61 | Borders: tablewriter.Border{Left: true, Right: true, Bottom: true, Top: true},
62 | }
63 | }
64 |
65 | // FromHTMLNode renders text output from a pre-parsed HTML document.
66 | func FromHTMLNode(doc *html.Node, o ...Options) (string, error) {
67 | var options Options
68 | if len(o) > 0 {
69 | options = o[0]
70 | }
71 |
72 | ctx := textifyTraverseContext{
73 | buf: bytes.Buffer{},
74 | options: options,
75 | }
76 | if err := ctx.traverse(doc); err != nil {
77 | return "", err
78 | }
79 |
80 | text := strings.TrimSpace(newlineRe.ReplaceAllString(
81 | strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"),
82 | )
83 | return text, nil
84 | }
85 |
86 | // FromReader renders text output after parsing HTML for the specified
87 | // io.Reader.
88 | func FromReader(reader io.Reader, options ...Options) (string, error) {
89 | newReader, err := bom.NewReaderWithoutBom(reader)
90 | if err != nil {
91 | return "", err
92 | }
93 | doc, err := html.Parse(newReader)
94 | if err != nil {
95 | return "", err
96 | }
97 | return FromHTMLNode(doc, options...)
98 | }
99 |
100 | // FromString parses HTML from the input string, then renders the text form.
101 | func FromString(input string, options ...Options) (string, error) {
102 | bs := bom.CleanBom([]byte(input))
103 | text, err := FromReader(bytes.NewReader(bs), options...)
104 | if err != nil {
105 | return "", err
106 | }
107 | return text, nil
108 | }
109 |
var (
	// spacingRe matches runs of whitespace (space, CR, LF, tab); presumably
	// collapsed to a single space by the text emitter — usage outside this excerpt.
	spacingRe = regexp.MustCompile(`[ \r\n\t]+`)
	// newlineRe matches runs of two or more newlines; FromHTMLNode replaces
	// each run with exactly one blank line.
	newlineRe = regexp.MustCompile(`\n\n+`)
)
114 |
// textifyTraverseContext holds the mutable text-related state carried through
// a single HTML-to-text traversal.
type textifyTraverseContext struct {
	buf bytes.Buffer // accumulates the rendered plain-text output

	prefix          string               // line prefix, e.g. "> " repeated per blockquote nesting level
	tableCtx        tableTraverseContext // table data collected when Options.PrettyTables is on
	options         Options              // rendering toggles supplied by the caller
	endsWithSpace   bool                 // whether output currently ends with a space — TODO confirm against emit()
	justClosedDiv   bool                 // set right after a <div> closes, to suppress a duplicate newline
	blockquoteLevel int                  // current <blockquote> nesting depth
	lineLength      int                  // length of the output line currently being built
	isPre           bool                 // inside <pre>/<code>; presumably preserves whitespace in emit()
	isVirtualBQ     bool                 // virtual blockquote: <div class="quote_container"> handled as blockquote
}
129 |
// tableTraverseContext holds table ASCII-form related context, collected while
// traversing a <table> subtree (only used when Options.PrettyTables is on).
type tableTraverseContext struct {
	header     []string   // cell texts of the header row
	body       [][]string // cell texts of the body rows
	footer     []string   // cell texts of the footer row
	tmpRow     int        // index of the body row currently being filled
	isInFooter bool       // true while inside the <tfoot> subtree
}
138 |
139 | func (tableCtx *tableTraverseContext) init() {
140 | tableCtx.body = [][]string{}
141 | tableCtx.header = []string{}
142 | tableCtx.footer = []string{}
143 | tableCtx.isInFooter = false
144 | tableCtx.tmpRow = 0
145 | }
146 |
147 | func (ctx *textifyTraverseContext) handleElement(node *html.Node) error {
148 | ctx.justClosedDiv = false
149 |
150 | switch node.DataAtom {
151 | case atom.Br:
152 | return ctx.emit("\n")
153 |
154 | case atom.H1, atom.H2, atom.H3:
155 | subCtx := textifyTraverseContext{}
156 | if err := subCtx.traverseChildren(node); err != nil {
157 | return err
158 | }
159 |
160 | str := subCtx.buf.String()
161 | dividerLen := 0
162 | for _, line := range strings.Split(str, "\n") {
163 | if lineLen := len([]rune(line)); lineLen-1 > dividerLen {
164 | dividerLen = lineLen - 1
165 | }
166 | }
167 | var divider string
168 | if node.DataAtom == atom.H1 {
169 | divider = strings.Repeat("*", dividerLen)
170 | } else {
171 | divider = strings.Repeat("-", dividerLen)
172 | }
173 |
174 | if node.DataAtom == atom.H3 {
175 | return ctx.emit("\n\n" + str + "\n" + divider + "\n\n")
176 | }
177 | return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n")
178 |
179 | case atom.Blockquote:
180 | if ctx.buf.Len() == 0 && !ctx.isVirtualBQ { // do not apply blockquote if full html is blockquote
181 | return ctx.traverseChildren(node)
182 | }
183 |
184 | ctx.blockquoteLevel++
185 | ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " "
186 | // following lines are disabled, otherwise it outputs 2x empty '>' lines
187 | //if err := ctx.emit("\n"); err != nil {
188 | // return err
189 | //}
190 | //if ctx.blockquoteLevel == 1 {
191 | // if err := ctx.emit("\n"); err != nil {
192 | // return err
193 | // }
194 | //}
195 | if err := ctx.traverseChildren(node); err != nil {
196 | return err
197 | }
198 | ctx.blockquoteLevel--
199 | ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel)
200 | if ctx.blockquoteLevel > 0 {
201 | ctx.prefix += " "
202 | }
203 | // to remove the last "> " (or multiple on levels) added we would have to make some magic with the ctx.buf
204 | return ctx.emit("\n\n")
205 |
206 | case atom.Div:
207 | // hack as blockquote
208 | for _, attr := range node.Attr {
209 | if attr.Key == "class" && attr.Val == "quote_container" {
210 | node.DataAtom = atom.Blockquote
211 | ctx.isVirtualBQ = true
212 | err := ctx.handleElement(node)
213 | ctx.isVirtualBQ = false
214 | return err
215 | }
216 | }
217 |
218 | if ctx.lineLength > 0 {
219 | if err := ctx.emit("\n"); err != nil {
220 | return err
221 | }
222 | }
223 | if err := ctx.traverseChildren(node); err != nil {
224 | return err
225 | }
226 | var err error
227 | if !ctx.justClosedDiv {
228 | err = ctx.emit("\n")
229 | }
230 | ctx.justClosedDiv = true
231 | return err
232 |
233 | case atom.Li:
234 | if err := ctx.emit("* "); err != nil {
235 | return err
236 | }
237 |
238 | if err := ctx.traverseChildren(node); err != nil {
239 | return err
240 | }
241 |
242 | return ctx.emit("\n")
243 |
244 | case atom.B, atom.Strong:
245 | subCtx := textifyTraverseContext{}
246 | subCtx.endsWithSpace = true
247 | if err := subCtx.traverseChildren(node); err != nil {
248 | return err
249 | }
250 | str := subCtx.buf.String()
251 | return ctx.emit("*" + str + "*")
252 |
253 | case atom.A:
254 | linkText := ""
255 | // For simple link element content with single text node only, peek at the link text.
256 | if node.FirstChild != nil && node.FirstChild.NextSibling == nil && node.FirstChild.Type == html.TextNode {
257 | linkText = node.FirstChild.Data
258 | }
259 |
260 | // If image is the only child, take its alt text as the link text.
261 | if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img {
262 | if altText := getAttrVal(img, "alt"); altText != "" {
263 | if err := ctx.emit(altText); err != nil {
264 | return err
265 | }
266 | }
267 | } else if err := ctx.traverseChildren(node); err != nil {
268 | return err
269 | }
270 |
271 | hrefLink := ""
272 | if attrVal := getAttrVal(node, "href"); attrVal != "" {
273 | attrVal = ctx.normalizeHrefLink(attrVal)
274 | // Don't print link href if it matches link element content or if the link is empty.
275 | if !ctx.options.OmitLinks && attrVal != "" && linkText != attrVal {
276 | hrefLink = "( " + attrVal + " )"
277 | }
278 | }
279 |
280 | return ctx.emit(hrefLink)
281 |
282 | case atom.P, atom.Ul:
283 | return ctx.paragraphHandler(node)
284 |
285 | case atom.Table, atom.Tfoot, atom.Th, atom.Tr, atom.Td:
286 | if ctx.options.PrettyTables {
287 | return ctx.handleTableElement(node)
288 | } else if node.DataAtom == atom.Table {
289 | return ctx.paragraphHandler(node)
290 | }
291 | return ctx.traverseChildren(node)
292 |
293 | case atom.Pre, atom.Code:
294 | ctx.isPre = true
295 | err := ctx.traverseChildren(node)
296 | ctx.isPre = false
297 | return err
298 |
299 | case atom.Style, atom.Script, atom.Head:
300 | // Ignore the subtree.
301 | return nil
302 |
303 | case atom.Noscript:
304 | // Because of bug https://github.com/golang/go/issues/16318 we have to remove the whole content in