├── Conversion_test.go ├── DOC 2 Text.go ├── DOCX 2 Text.go ├── Decompress.go ├── EPUB 2 Text.go ├── HTML 2 Text.go ├── LICENSE ├── MBOX.go ├── MOBI 2 Text.go ├── ODS 2 Text.go ├── ODT 2 Text.go ├── PDF 2 Image.go ├── PDF 2 Text.go ├── PPT 2 Text.go ├── PPTX 2 Text.go ├── Picture.go ├── README.md ├── RTF 2 Text.go ├── XLS 2 Text.go ├── XLSX 2 Text.go ├── ZIP.go ├── html2text ├── README.md ├── html2text.go ├── html2text_test.go └── testdata │ ├── utf8.html │ └── utf8_with_bom.xhtml ├── odf ├── Readme.md ├── meta.go ├── meta_test.go ├── ods │ ├── ods_test.go │ ├── read.go │ └── test.ods └── read.go ├── ole2 ├── README.md ├── dir.go ├── header.go ├── ole.go ├── pss.go ├── sector.go ├── stream_reader.go └── stream_reader_test.go └── xls ├── README.md ├── bigtable_test.go ├── bof.go ├── cell_range.go ├── col.go ├── comparexlsxlsx.go ├── date.go ├── doc.go ├── example_test.go ├── font.go ├── format.go ├── issue47_test.go ├── row.go ├── sst.go ├── workbook.go ├── worksheet.go ├── xf.go ├── xls.go └── xls_test.go /Conversion_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: Conversion_test.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 
4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import ( 10 | "bytes" 11 | "fmt" 12 | "io/ioutil" 13 | "os" 14 | "testing" 15 | ) 16 | 17 | func TestXLS(t *testing.T) { 18 | // open local file to extract text and output to command line 19 | file, err := os.Open("test.xls") 20 | if err != nil { 21 | return 22 | } 23 | 24 | defer file.Close() 25 | 26 | XLS2Text(file, os.Stdout, 1*1024*1024) 27 | } 28 | 29 | func TestPPTX(t *testing.T) { 30 | // open local file to extract text and output to command line 31 | file, err := os.Open("test.pptx") 32 | if err != nil { 33 | return 34 | } 35 | 36 | defer file.Close() 37 | 38 | stat, _ := file.Stat() 39 | 40 | text, _ := PPTX2Text(file, stat.Size()) 41 | fmt.Print(text) 42 | } 43 | 44 | func TestODS(t *testing.T) { 45 | // open local file to extract text and output to command line 46 | file, err := os.Open("test.ods") 47 | if err != nil { 48 | return 49 | } 50 | 51 | defer file.Close() 52 | stat, _ := file.Stat() 53 | 54 | ODS2Text(file, stat.Size(), os.Stdout, 1*1024*1024) 55 | } 56 | 57 | func TestExcelCell(t *testing.T) { 58 | file1, err := os.Open("test.xls") 59 | if err == nil { 60 | cells, _ := XLS2Cells(file1) 61 | for n, cell := range cells { 62 | fmt.Printf("%s\n", cell) 63 | if n > 20 { 64 | break 65 | } 66 | } 67 | 68 | file1.Close() 69 | } 70 | 71 | file1, err = os.Open("test.xlsx") 72 | if err == nil { 73 | stat, _ := file1.Stat() 74 | cells, _ := XLSX2Cells(file1, stat.Size(), 1000) 75 | for n, cell := range cells { 76 | fmt.Printf("%s\n", cell) 77 | if n > 20 { 78 | break 79 | } 80 | } 81 | 82 | file1.Close() 83 | } 84 | 85 | file1, err = os.Open("test.ods") 86 | if err == nil { 87 | stat, _ := file1.Stat() 88 | cells, _ := ODS2Cells(file1, stat.Size()) 89 | for n, cell := range cells { 90 | fmt.Printf("%s\n", cell) 91 | if n > 20 { 92 | break 93 | } 94 | } 95 | 96 | file1.Close() 97 | } 98 | 99 | } 100 | 101 | func TestCSV(t *testing.T) { 102 | file, err := os.Open("test.txt") 103 | if 
err != nil { 104 | return 105 | } 106 | defer file.Close() 107 | 108 | content, _ := ioutil.ReadAll(file) 109 | 110 | IsCSV(content) 111 | } 112 | 113 | func TestEPUB(t *testing.T) { 114 | // open local file to extract text and output to command line 115 | file, err := os.Open("moby-dick.epub") 116 | if err != nil { 117 | return 118 | } 119 | 120 | defer file.Close() 121 | 122 | stat, _ := file.Stat() 123 | 124 | text, _ := EPUB2Text(file, stat.Size(), 1000) 125 | fmt.Print(text) 126 | } 127 | 128 | func TestMOBI(t *testing.T) { 129 | // open local file to extract text and output to command line 130 | file, err := os.Open("windows-1252.mobi") 131 | if err != nil { 132 | return 133 | } 134 | 135 | defer file.Close() 136 | 137 | text, _ := Mobi2Text(file) 138 | fmt.Print(text) 139 | } 140 | 141 | func TestPDFImage(t *testing.T) { 142 | // open local file to extract images 143 | file, err := os.Open("test.pdf") 144 | if err != nil { 145 | return 146 | } 147 | 148 | defer file.Close() 149 | 150 | images, _ := PDFExtractImages(file) 151 | fmt.Print(len(images)) 152 | } 153 | 154 | func TestPD2Text(t *testing.T) { 155 | file, err := os.Open("1.pdf") 156 | if err != nil { 157 | return 158 | } 159 | 160 | defer file.Close() 161 | 162 | buffer := bytes.NewBuffer(make([]byte, 0, 2*1024)) 163 | PDFListContentStreams(file, buffer, 2*1024) 164 | 165 | fmt.Println(buffer.String()) 166 | } 167 | 168 | func TestODTText(t *testing.T) { 169 | file, err := os.Open("Test\\file-sample_500kB.odt") 170 | if err != nil { 171 | return 172 | } 173 | 174 | defer file.Close() 175 | stat, _ := file.Stat() 176 | 177 | buffer := bytes.NewBuffer(make([]byte, 0, 2*1024)) 178 | 179 | ODT2Text(file, stat.Size(), buffer, 2*1024) 180 | 181 | fmt.Println(buffer.String()) 182 | } 183 | 184 | // TestXLSX extracts text from an XLSX file. 
185 | // Memory usage: 100 rows = 52 MB, 500 rows = 200 MB, 1000 rows = 400 MB, 2000/5000/10000/-1 rows = 700 MB 186 | func TestXLSX(t *testing.T) { 187 | file, err := os.Open("Test\\971bd55b-5cbd-43d2-899e-d4a2a7d0a883.xlsx") 188 | if err != nil { 189 | return 190 | } 191 | 192 | defer file.Close() 193 | stat, _ := file.Stat() 194 | 195 | buffer := bytes.NewBuffer(make([]byte, 0, 2*1024)) 196 | 197 | XLSX2Text(file, stat.Size(), buffer, 2*1024, -1) 198 | 199 | fmt.Println(buffer.String()) 200 | } 201 | -------------------------------------------------------------------------------- /DOC 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: DOC 2 Text.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | This code is forked from https://github.com/EndFirstCorp/doc2txt and extracts text from DOC files, the legacy binary Word files. 7 | */ 8 | 9 | package fileconversion 10 | 11 | import ( 12 | "bytes" 13 | "encoding/binary" 14 | "errors" 15 | "io" 16 | "unicode/utf16" 17 | "unicode/utf8" 18 | 19 | "github.com/mattetti/filebuffer" 20 | "github.com/richardlehane/mscfb" 21 | ) 22 | 23 | // ---- file doc.go ---- 24 | // There were a few changes in this file to actually support Unicode which the old code was not. 
25 | 26 | var ( 27 | errTable = errors.New("cannot find table stream") 28 | errDocEmpty = errors.New("WordDocument not found") 29 | errDocShort = errors.New("wordDoc block too short") 30 | errInvalidArgument = errors.New("invalid table and/or fib") 31 | ) 32 | 33 | type allReader interface { 34 | io.Closer 35 | io.ReaderAt 36 | io.ReadSeeker 37 | } 38 | 39 | func wrapError(e error) error { 40 | return errors.New("Error processing file: " + e.Error()) 41 | } 42 | 43 | // DOC2Text converts a standard io.Reader from a Microsoft Word .doc binary file and returns a reader (actually a bytes.Buffer) which will output the plain text found in the .doc file 44 | func DOC2Text(r io.Reader) (io.Reader, error) { 45 | ra, ok := r.(io.ReaderAt) 46 | if !ok { 47 | ra, _, err := toMemoryBuffer(r) 48 | if err != nil { 49 | return nil, wrapError(err) 50 | } 51 | defer ra.Close() 52 | } 53 | 54 | d, err := mscfb.New(ra) 55 | if err != nil { 56 | return nil, wrapError(err) 57 | } 58 | 59 | wordDoc, table0, table1 := getWordDocAndTables(d) 60 | fib, err := getFib(wordDoc) 61 | if err != nil { 62 | return nil, wrapError(err) 63 | } 64 | 65 | table := getActiveTable(table0, table1, fib) 66 | if table == nil { 67 | return nil, wrapError(errTable) 68 | } 69 | 70 | clx, err := getClx(table, fib) 71 | if err != nil { 72 | return nil, wrapError(err) 73 | } 74 | 75 | return getText(wordDoc, clx) 76 | } 77 | 78 | func toMemoryBuffer(r io.Reader) (allReader, int64, error) { 79 | var b bytes.Buffer 80 | size, err := b.ReadFrom(r) 81 | if err != nil { 82 | return nil, 0, err 83 | } 84 | fb := filebuffer.New(b.Bytes()) 85 | return fb, size, nil 86 | } 87 | 88 | func getText(wordDoc *mscfb.File, clx *clx) (io.Reader, error) { 89 | var buf bytes.Buffer 90 | for i := 0; i < len(clx.pcdt.PlcPcd.aPcd); i++ { 91 | pcd := clx.pcdt.PlcPcd.aPcd[i] 92 | cp := clx.pcdt.PlcPcd.aCP[i] 93 | cpNext := clx.pcdt.PlcPcd.aCP[i+1] 94 | 95 | var start, end int 96 | // 
https://msdn.microsoft.com/ko-kr/library/office/gg615596(v=office.14).aspx 97 | // Read the value of the Pcd.Fc.fCompressed field at bit 46 of the current Pcd structure. If 0, the Pcd structure refers to a 16-bit Unicode character. If 1, it refers to an 8-bit ANSI character. 98 | if pcd.fc.fCompressed { 99 | start = pcd.fc.fc / 2 100 | end = start + cpNext - cp 101 | } else { 102 | // -> 16-bit Unicode characters 103 | start = pcd.fc.fc 104 | end = start + 2*(cpNext-cp) 105 | } 106 | 107 | b := make([]byte, end-start) 108 | _, err := wordDoc.ReadAt(b, int64(start)) // read all the characters 109 | if err != nil { 110 | return nil, err 111 | } 112 | translateText(b, &buf, pcd.fc.fCompressed) 113 | } 114 | return &buf, nil 115 | } 116 | 117 | // translateText translates the buffer into text. fCompressed = 0 for 16-bit Unicode, 1 = 8-bit ANSI characters. 118 | func translateText(b []byte, buf *bytes.Buffer, fCompressed bool) { 119 | u16s := make([]uint16, 1) 120 | b8buf := make([]byte, 4) 121 | 122 | fieldLevel := 0 123 | var isFieldChar bool 124 | for cIndex := range b { 125 | // Convert to rune 126 | var char rune 127 | if fCompressed { 128 | // ANSI, 1 byte 129 | char = rune(b[cIndex]) 130 | } else { 131 | // 16-bit Unicode: skip every second byte 132 | if cIndex%2 != 0 { 133 | continue 134 | } else if (cIndex + 1) >= len(b) { // make sure there are at least 2 bytes for Unicode decoding 135 | continue 136 | } 137 | 138 | // convert from UTF16 to UTF8 139 | u16s[0] = uint16(b[cIndex]) + (uint16(b[cIndex+1]) << 8) 140 | r := utf16.Decode(u16s) 141 | if len(r) != 1 { 142 | //fmt.Printf("Invalid rune %v\n", r) 143 | continue 144 | } 145 | char = r[0] 146 | } 147 | 148 | // Handle special field characters (section 2.8.25) 149 | if char == 0x13 { 150 | isFieldChar = true 151 | fieldLevel++ 152 | continue 153 | } else if char == 0x14 { 154 | isFieldChar = false 155 | continue 156 | } else if char == 0x15 { 157 | isFieldChar = false 158 | continue 159 | } else if 
isFieldChar { 160 | continue 161 | } 162 | 163 | if char == 7 { // table column separator 164 | buf.WriteByte(' ') 165 | continue 166 | } else if char < 32 && char != 9 && char != 10 && char != 13 { // skip non-printable ASCII characters 167 | //buf.Write([]byte(fmt.Sprintf("|%#x|", char))) 168 | continue 169 | } 170 | 171 | if fCompressed { // compressed, so replace compressed characters 172 | buf.Write(replaceCompressed(byte(char))) 173 | } else { 174 | // encode the rune to UTF-8 175 | n := utf8.EncodeRune(b8buf, char) 176 | buf.Write(b8buf[:n]) 177 | } 178 | } 179 | } 180 | 181 | func replaceCompressed(char byte) []byte { 182 | var v uint16 183 | switch char { 184 | case 0x82: 185 | v = 0x201A 186 | case 0x83: 187 | v = 0x0192 188 | case 0x84: 189 | v = 0x201E 190 | case 0x85: 191 | v = 0x2026 192 | case 0x86: 193 | v = 0x2020 194 | case 0x87: 195 | v = 0x2021 196 | case 0x88: 197 | v = 0x02C6 198 | case 0x89: 199 | v = 0x2030 200 | case 0x8A: 201 | v = 0x0160 202 | case 0x8B: 203 | v = 0x2039 204 | case 0x8C: 205 | v = 0x0152 206 | case 0x91: 207 | v = 0x2018 208 | case 0x92: 209 | v = 0x2019 210 | case 0x93: 211 | v = 0x201C 212 | case 0x94: 213 | v = 0x201D 214 | case 0x95: 215 | v = 0x2022 216 | case 0x96: 217 | v = 0x2013 218 | case 0x97: 219 | v = 0x2014 220 | case 0x98: 221 | v = 0x02DC 222 | case 0x99: 223 | v = 0x2122 224 | case 0x9A: 225 | v = 0x0161 226 | case 0x9B: 227 | v = 0x203A 228 | case 0x9C: 229 | v = 0x0153 230 | case 0x9F: 231 | v = 0x0178 232 | default: 233 | return []byte{char} 234 | } 235 | out := make([]byte, 2) 236 | binary.LittleEndian.PutUint16(out, v) 237 | return out 238 | } 239 | 240 | func getWordDocAndTables(r *mscfb.Reader) (*mscfb.File, *mscfb.File, *mscfb.File) { 241 | var wordDoc, table0, table1 *mscfb.File 242 | for i := 0; i < len(r.File); i++ { 243 | stream := r.File[i] 244 | 245 | switch stream.Name { 246 | case "WordDocument": 247 | wordDoc = stream 248 | case "0Table": 249 | table0 = stream 250 | case "1Table": 251 | 
table1 = stream 252 | } 253 | } 254 | return wordDoc, table0, table1 255 | } 256 | 257 | func getActiveTable(table0 *mscfb.File, table1 *mscfb.File, f *fib) *mscfb.File { 258 | if f.base.fWhichTblStm == 0 { 259 | return table0 260 | } 261 | return table1 262 | } 263 | 264 | // ---- file fib.go ---- 265 | 266 | var ( 267 | errFibInvalid = errors.New("file information block validation failed") 268 | ) 269 | 270 | type fib struct { 271 | base fibBase 272 | csw int 273 | fibRgW fibRgW 274 | cslw int 275 | fibRgLw fibRgLw 276 | cbRgFcLcb int 277 | fibRgFcLcb fibRgFcLcb 278 | } 279 | 280 | type fibBase struct { 281 | fWhichTblStm int 282 | } 283 | 284 | type fibRgW struct { 285 | } 286 | 287 | type fibRgLw struct { 288 | ccpText int 289 | ccpFtn int 290 | ccpHdd int 291 | ccpMcr int 292 | ccpAtn int 293 | ccpEdn int 294 | ccpTxbx int 295 | ccpHdrTxbx int 296 | cpLength int 297 | } 298 | 299 | type fibRgFcLcb struct { 300 | fcPlcfFldMom int 301 | lcbPlcfFldMom int 302 | fcPlcfFldHdr int 303 | lcbPlcfFldHdr int 304 | fcPlcfFldFtn int 305 | lcbPlcfFldFtn int 306 | fcPlcfFldAtn int 307 | lcbPlcfFldAtn int 308 | fcClx int 309 | lcbClx int 310 | } 311 | 312 | // parse File Information Block (section 2.5.1) 313 | func getFib(wordDoc *mscfb.File) (*fib, error) { 314 | if wordDoc == nil { 315 | return nil, errDocEmpty 316 | } 317 | 318 | b := make([]byte, 898) // get FIB block up to FibRgFcLcb97 319 | _, err := wordDoc.ReadAt(b, 0) 320 | if err != nil { 321 | return nil, err 322 | } 323 | 324 | fibBase := getFibBase(b[0:32]) 325 | 326 | fibRgW, csw, err := getFibRgW(b, 32) 327 | if err != nil { 328 | return nil, err 329 | } 330 | 331 | fibRgLw, cslw, err := getFibRgLw(b, 34+csw) 332 | if err != nil { 333 | return nil, err 334 | } 335 | 336 | fibRgFcLcb, cbRgFcLcb, err := getFibRgFcLcb(b, 34+csw+2+cslw) 337 | 338 | return &fib{base: *fibBase, csw: csw, cslw: cslw, fibRgW: *fibRgW, fibRgLw: *fibRgLw, fibRgFcLcb: *fibRgFcLcb, cbRgFcLcb: cbRgFcLcb}, err 339 | } 340 | 341 | // parse 
FibBase (section 2.5.2) 342 | func getFibBase(fib []byte) *fibBase { 343 | byt := fib[11] // fWhichTblStm is 2nd highest bit in this byte 344 | fWhichTblStm := int(byt >> 1 & 1) // set which table (0Table or 1Table) is the table stream 345 | return &fibBase{fWhichTblStm: fWhichTblStm} 346 | } 347 | 348 | func getFibRgW(fib []byte, start int) (*fibRgW, int, error) { 349 | if start+2 >= len(fib) { // must be big enough for csw 350 | return &fibRgW{}, 0, errFibInvalid 351 | } 352 | 353 | csw := int(binary.LittleEndian.Uint16(fib[start:start+2])) * 2 // in bytes 354 | return &fibRgW{}, csw, nil 355 | } 356 | 357 | // parse FibRgLw (section 2.5.4) 358 | func getFibRgLw(fib []byte, start int) (*fibRgLw, int, error) { 359 | fibRgLwStart := start + 2 // skip cslw 360 | if fibRgLwStart+88 >= len(fib) { // expect 88 bytes in fibRgLw 361 | return &fibRgLw{}, 0, errFibInvalid 362 | } 363 | 364 | cslw := getInt16(fib, start) * 4 // in bytes 365 | ccpText := getInt(fib, fibRgLwStart+3*4) 366 | ccpFtn := getInt(fib, fibRgLwStart+4*4) 367 | ccpHdd := getInt(fib, fibRgLwStart+5*4) 368 | ccpMcr := getInt(fib, fibRgLwStart+6*4) 369 | ccpAtn := getInt(fib, fibRgLwStart+7*4) 370 | ccpEdn := getInt(fib, fibRgLwStart+8*4) 371 | ccpTxbx := getInt(fib, fibRgLwStart+9*4) 372 | ccpHdrTxbx := getInt(fib, fibRgLwStart+10*4) 373 | 374 | // calculate cpLength. 
Used in PlcPcd verification (see section 2.8.35) 375 | var cpLength int 376 | if ccpFtn != 0 || ccpHdd != 0 || ccpMcr != 0 || ccpAtn != 0 || ccpEdn != 0 || ccpTxbx != 0 || ccpHdrTxbx != 0 { 377 | cpLength = ccpFtn + ccpHdd + ccpMcr + ccpAtn + ccpEdn + ccpTxbx + ccpHdrTxbx + ccpText + 1 378 | } else { 379 | cpLength = ccpText 380 | } 381 | return &fibRgLw{ccpText: ccpText, ccpFtn: ccpFtn, ccpHdd: ccpHdd, ccpMcr: ccpMcr, ccpAtn: ccpAtn, 382 | ccpEdn: ccpEdn, ccpTxbx: ccpTxbx, ccpHdrTxbx: ccpHdrTxbx, cpLength: cpLength}, cslw, nil 383 | } 384 | 385 | // parse FibRgFcLcb (section 2.5.5) 386 | func getFibRgFcLcb(fib []byte, start int) (*fibRgFcLcb, int, error) { 387 | fibRgFcLcbStart := start + 2 // skip cbRgFcLcb 388 | if fibRgFcLcbStart+186*4 < len(fib) { // expect 186+ values in FibRgFcLcb 389 | return &fibRgFcLcb{}, 0, errFibInvalid 390 | } 391 | 392 | cbRgFcLcb := getInt16(fib, start) 393 | fcPlcfFldMom := getInt(fib, fibRgFcLcbStart+32*4) 394 | lcbPlcfFldMom := getInt(fib, fibRgFcLcbStart+33*4) 395 | fcPlcfFldHdr := getInt(fib, fibRgFcLcbStart+34*4) 396 | lcbPlcfFldHdr := getInt(fib, fibRgFcLcbStart+35*4) 397 | fcPlcfFldFtn := getInt(fib, fibRgFcLcbStart+36*4) 398 | lcbPlcfFldFtn := getInt(fib, fibRgFcLcbStart+37*4) 399 | fcPlcfFldAtn := getInt(fib, fibRgFcLcbStart+38*4) 400 | lcbPlcfFldAtn := getInt(fib, fibRgFcLcbStart+39*4) 401 | fcClx := getInt(fib, fibRgFcLcbStart+66*4) 402 | lcbClx := getInt(fib, fibRgFcLcbStart+67*4) 403 | return &fibRgFcLcb{fcPlcfFldMom: fcPlcfFldMom, lcbPlcfFldMom: lcbPlcfFldMom, fcPlcfFldHdr: fcPlcfFldHdr, lcbPlcfFldHdr: lcbPlcfFldHdr, 404 | fcPlcfFldFtn: fcPlcfFldFtn, lcbPlcfFldFtn: lcbPlcfFldFtn, fcPlcfFldAtn: fcPlcfFldAtn, lcbPlcfFldAtn: lcbPlcfFldAtn, 405 | fcClx: fcClx, lcbClx: lcbClx}, cbRgFcLcb, nil 406 | } 407 | 408 | func getInt16(buf []byte, start int) int { 409 | return int(binary.LittleEndian.Uint16(buf[start : start+2])) 410 | } 411 | func getInt(buf []byte, start int) int { 412 | return 
int(binary.LittleEndian.Uint32(buf[start : start+4])) 413 | } 414 | 415 | // ---- file clx.go ---- 416 | 417 | var ( 418 | errInvalidPrc = errors.New("Invalid Prc structure") 419 | errInvalidClx = errors.New("expected last aCP value to equal fib.cpLength (2.8.35)") 420 | errInvalidPcdt = errors.New("expected clxt to be equal 0x02") 421 | ) 422 | 423 | type clx struct { 424 | pcdt pcdt 425 | } 426 | 427 | type pcdt struct { 428 | lcb int 429 | PlcPcd plcPcd 430 | } 431 | 432 | type plcPcd struct { 433 | aCP []int 434 | aPcd []pcd 435 | } 436 | 437 | type pcd struct { 438 | fc fcCompressed 439 | } 440 | 441 | type fcCompressed struct { 442 | fc int 443 | fCompressed bool 444 | } 445 | 446 | // read Clx (section 2.9.38) 447 | func getClx(table *mscfb.File, fib *fib) (*clx, error) { 448 | if table == nil || fib == nil { 449 | return nil, errInvalidArgument 450 | } 451 | b, err := readClx(table, fib) 452 | if err != nil { 453 | return nil, err 454 | } 455 | 456 | pcdtOffset, err := getPrcArrayEnd(b) 457 | if err != nil { 458 | return nil, err 459 | } 460 | 461 | pcdt, err := getPcdt(b, pcdtOffset) 462 | if err != nil { 463 | return nil, err 464 | } 465 | 466 | if pcdt.PlcPcd.aCP[len(pcdt.PlcPcd.aCP)-1] != fib.fibRgLw.cpLength { 467 | return nil, errInvalidClx 468 | } 469 | 470 | return &clx{pcdt: *pcdt}, nil 471 | } 472 | 473 | func readClx(table *mscfb.File, fib *fib) ([]byte, error) { 474 | b := make([]byte, fib.fibRgFcLcb.lcbClx) 475 | _, err := table.ReadAt(b, int64(fib.fibRgFcLcb.fcClx)) 476 | if err != nil { 477 | return nil, err 478 | } 479 | return b, nil 480 | } 481 | 482 | // read Pcdt from Clx (section 2.9.178) 483 | func getPcdt(clx []byte, pcdtOffset int) (*pcdt, error) { 484 | const pcdSize = 8 485 | if pcdtOffset < 0 || pcdtOffset+5 >= len(clx) { 486 | return nil, errInvalidPcdt 487 | } 488 | if clx[pcdtOffset] != 0x02 { // clxt must be 0x02 or invalid 489 | return nil, errInvalidPcdt 490 | } 491 | lcb := int(binary.LittleEndian.Uint32(clx[pcdtOffset+1 : 
pcdtOffset+5])) // skip clxt, get lcb 492 | plcPcdOffset := pcdtOffset + 5 // skip clxt and lcb 493 | numPcds := (lcb - 4) / (4 + pcdSize) // see 2.2.2 in the spec for equation 494 | numCps := numPcds + 1 // always 1 more cp than pcds 495 | 496 | cps := make([]int, numCps) 497 | for i := 0; i < numCps; i++ { 498 | cpOffset := plcPcdOffset + i*4 499 | if cpOffset < 0 || cpOffset+4 >= len(clx) { 500 | return nil, errInvalidPcdt 501 | } 502 | cps[i] = int(binary.LittleEndian.Uint32(clx[cpOffset : cpOffset+4])) 503 | } 504 | 505 | pcdStart := plcPcdOffset + 4*numCps 506 | pcds := make([]pcd, numPcds) 507 | for i := 0; i < numPcds; i++ { 508 | pcdOffset := pcdStart + i*pcdSize 509 | if pcdOffset < 0 || pcdOffset+pcdSize >= len(clx) { 510 | return nil, errInvalidPcdt 511 | } 512 | pcds[i] = *parsePcd(clx[pcdOffset : pcdOffset+pcdSize]) 513 | } 514 | return &pcdt{lcb: lcb, PlcPcd: plcPcd{aCP: cps, aPcd: pcds}}, nil 515 | } 516 | 517 | // find end of RgPrc array (section 2.9.38) 518 | func getPrcArrayEnd(clx []byte) (int, error) { 519 | prcOffset := 0 520 | count := 0 521 | for { 522 | clxt := clx[prcOffset] 523 | if clxt != 0x01 { // this is not a Prc, so exit 524 | return prcOffset, nil 525 | } 526 | prcDataCbGrpprl := binary.LittleEndian.Uint16(clx[prcOffset+1 : prcOffset+3]) // skip the clxt and read 2 bytes 527 | prcOffset += 1 + 2 + int(prcDataCbGrpprl) // skip clxt, cbGrpprl, and GrpPrl 528 | 529 | if count > 10000 || prcDataCbGrpprl <= 0 || prcOffset+3 > len(clx) { // ensure no infinite loop 530 | return 0, errInvalidPrc 531 | } 532 | count++ 533 | } 534 | } 535 | 536 | // parse Pcd (section 2.9.177) 537 | func parsePcd(pcdData []byte) *pcd { 538 | return &pcd{fc: *parseFcCompressed(pcdData[2:6])} 539 | } 540 | 541 | // parse FcCompressed (section 2.9.73) 542 | func parseFcCompressed(fcData []byte) *fcCompressed { 543 | fCompressed := fcData[3]&64 == 64 // check fcompressed value (second bit from lestmost of the last byte in fcdata) 544 | fcData[3] = fcData[3] & 63 
// clear the fcompressed value from data 545 | fc := binary.LittleEndian.Uint32(fcData) // word doc generally uses little endian order (1.3.7) 546 | return &fcCompressed{fc: int(fc), fCompressed: fCompressed} 547 | } 548 | 549 | // IsFileDOC checks if the data indicates a DOC file 550 | // DOC has multiple signature according to https://filesignatures.net/index.php?search=doc&mode=EXT, D0 CF 11 E0 A1 B1 1A E1 551 | func IsFileDOC(data []byte) bool { 552 | return bytes.HasPrefix(data, []byte{0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1}) 553 | } 554 | -------------------------------------------------------------------------------- /DOCX 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: DOCX 2 Text.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | This code is forked from https://github.com/guylaor/goword and extracts text from DOCX files. 7 | */ 8 | 9 | package fileconversion 10 | 11 | import ( 12 | "archive/zip" 13 | "bytes" 14 | "encoding/xml" 15 | "fmt" 16 | "io" 17 | "io/ioutil" 18 | "strings" 19 | ) 20 | 21 | // models.go 22 | 23 | // WordDocument is a full word doc 24 | type WordDocument struct { 25 | Paragraphs []WordParagraph 26 | } 27 | 28 | // WordParagraph is a single paragraph 29 | type WordParagraph struct { 30 | Style WordStyle `xml:"pPr>pStyle"` 31 | Rows []WordRow `xml:"r"` 32 | } 33 | 34 | // WordStyle ... 35 | type WordStyle struct { 36 | Val string `xml:"val,attr"` 37 | } 38 | 39 | // WordRow ... 40 | type WordRow struct { 41 | Text string `xml:"t"` 42 | } 43 | 44 | // AsText returns all text in the document 45 | func (w WordDocument) AsText() string { 46 | text := "" 47 | for _, v := range w.Paragraphs { 48 | for _, rv := range v.Rows { 49 | text += rv.Text 50 | } 51 | text += "\n" 52 | } 53 | return text 54 | } 55 | 56 | // goword.go 57 | 58 | // DOCX2Text extracts text of a Word document 59 | // Size is the full size of the input file. 
60 | func DOCX2Text(file io.ReaderAt, size int64) (string, error) { 61 | 62 | doc, err := openWordFile(file, size) 63 | if err != nil { 64 | return "", err 65 | } 66 | 67 | docx, err := WordParse(doc) 68 | if err != nil { 69 | return "", err 70 | } 71 | 72 | return docx.AsText(), nil 73 | } 74 | 75 | // WordParse parses a word file 76 | func WordParse(doc string) (WordDocument, error) { 77 | 78 | docx := WordDocument{} 79 | r := strings.NewReader(string(doc)) 80 | decoder := xml.NewDecoder(r) 81 | 82 | for { 83 | t, _ := decoder.Token() 84 | if t == nil { 85 | break 86 | } 87 | switch se := t.(type) { 88 | case xml.StartElement: 89 | if se.Name.Local == "p" { 90 | var p WordParagraph 91 | decoder.DecodeElement(&p, &se) 92 | docx.Paragraphs = append(docx.Paragraphs, p) 93 | } 94 | } 95 | } 96 | return docx, nil 97 | } 98 | 99 | func openWordFile(file io.ReaderAt, size int64) (string, error) { 100 | 101 | // Open a zip archive for reading. word files are zip archives 102 | r, err := zip.NewReader(file, size) 103 | if err != nil { 104 | return "", err 105 | } 106 | 107 | // Iterate through the files in the archive, 108 | // find document.xml 109 | for _, f := range r.File { 110 | 111 | //fmt.Printf("Contents of %s:\n", f.Name) 112 | rc, err := f.Open() 113 | if err != nil { 114 | return "", err 115 | } 116 | defer rc.Close() 117 | if f.Name == "word/document.xml" { 118 | doc, err := ioutil.ReadAll(rc) 119 | if err != nil { 120 | return "", err 121 | } 122 | return fmt.Sprintf("%s", doc), nil 123 | } 124 | } 125 | 126 | return "", nil 127 | } 128 | 129 | // IsFileDOCX checks if the data indicates a DOCX file 130 | // DOCX has a signature of 50 4B 03 04 131 | func IsFileDOCX(data []byte) bool { 132 | return bytes.HasPrefix(data, []byte{0x50, 0x4B, 0x03, 0x04}) 133 | } 134 | -------------------------------------------------------------------------------- /Decompress.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: 
Decompress.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import ( 10 | "archive/tar" 11 | "archive/zip" 12 | "bytes" 13 | "compress/bzip2" 14 | "compress/gzip" 15 | "io" 16 | "io/ioutil" 17 | "time" 18 | 19 | "github.com/nwaples/rardecode" 20 | "github.com/saracen/go7z" 21 | "github.com/ulikunitz/xz" 22 | ) 23 | 24 | // DecompressFile decompresses data. It supports: GZ, BZ, BZ2, XZ 25 | func DecompressFile(data []byte) (decompressed []byte, valid bool) { 26 | // Try GZ 27 | if gr, err := gzip.NewReader(bytes.NewBuffer(data)); err == nil { 28 | defer gr.Close() 29 | decompressed, err = ioutil.ReadAll(gr) 30 | if err == nil { 31 | return decompressed, true 32 | } 33 | } 34 | 35 | // BZ, BZ2 36 | br := bzip2.NewReader(bytes.NewBuffer(data)) 37 | decompressed, err := ioutil.ReadAll(br) 38 | if err == nil { 39 | return decompressed, true 40 | } 41 | 42 | // XZ 43 | if xr, err := xz.NewReader(bytes.NewBuffer(data)); err == nil { 44 | decompressed, err = ioutil.ReadAll(xr) 45 | if err == nil { 46 | return decompressed, true 47 | } 48 | } 49 | 50 | return nil, false 51 | } 52 | 53 | // ContainerExtractFiles extracts files from supported containers: ZIP, RAR, 7Z, TAR 54 | func ContainerExtractFiles(data []byte, callback func(name string, size int64, date time.Time, data []byte)) { 55 | 56 | // ZIP 57 | if r, err := zip.NewReader(bytes.NewReader(data), int64(len(data))); err == nil { 58 | for _, f := range r.File { 59 | fileReader, err := f.Open() 60 | if err != nil { 61 | continue 62 | } 63 | 64 | data2, err := ioutil.ReadAll(fileReader) 65 | fileReader.Close() 66 | if err != nil { 67 | // If the file is encrypted with a password, this fails with error "4" here. 
68 | continue 69 | } 70 | 71 | callback(f.Name, int64(f.UncompressedSize64), f.Modified, data2) 72 | } 73 | 74 | return 75 | } 76 | 77 | // RAR 78 | if rc, err := rardecode.NewReader(bytes.NewReader(data), ""); err == nil { 79 | for { 80 | hdr, err := rc.Next() 81 | if err == io.EOF || err != nil { // break if end of archive or other error returned 82 | break 83 | } else if err == nil && !hdr.IsDir { 84 | if data2, err := ioutil.ReadAll(rc); err == nil { 85 | callback(hdr.Name, hdr.UnPackedSize, hdr.CreationTime, data2) 86 | } 87 | } 88 | } 89 | } 90 | 91 | // 7Z 92 | if sz, err := go7z.NewReader(bytes.NewReader(data), int64(len(data))); err == nil { 93 | for { 94 | hdr, err := sz.Next() 95 | if err == io.EOF || err != nil { // break if end of archive or other error returned 96 | break // End of archive 97 | } else if err == nil && !hdr.IsEmptyFile { 98 | if data2, err := ioutil.ReadAll(sz); err == nil { 99 | callback(hdr.Name, int64(len(data2)), hdr.CreatedAt, data2) 100 | } 101 | } 102 | } 103 | } else if err == go7z.ErrDecompressorNotFound { 104 | // May happen if it's 7Z, but decompressor not available (like 7zAES). 105 | return 106 | } 107 | 108 | // TAR 109 | tr := tar.NewReader(bytes.NewReader(data)) 110 | // Iterate through the files in the archive. 
111 | for { 112 | hdr, err := tr.Next() 113 | if err == io.EOF { 114 | // end of tar archive 115 | break 116 | } 117 | if err != nil { 118 | // other error 119 | break 120 | } 121 | switch hdr.Typeflag { 122 | case tar.TypeDir: 123 | // directories are ignored 124 | case tar.TypeReg, tar.TypeRegA: 125 | // file 126 | data2, err := ioutil.ReadAll(tr) 127 | if err != nil { 128 | continue 129 | } 130 | 131 | callback(hdr.Name, hdr.Size, hdr.ModTime, data2) 132 | } 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /EPUB 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: EPUB 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | EPUB files are ZIP based and contain the content as HTML files. 7 | 8 | Tested but did not work: 9 | * https://github.com/n3integration/epub could not read 2 sample files. Also no NewReader function available. 10 | 11 | This one was tested and works: 12 | * https://github.com/taylorskalyo/goreader/tree/master/epub 13 | 14 | Sample files via https://github.com/IDPF/epub3-samples/releases. 15 | */ 16 | 17 | package fileconversion 18 | 19 | import ( 20 | "io" 21 | 22 | "github.com/taylorskalyo/goreader/epub" 23 | ) 24 | 25 | // EPUB2Text converts an EPUB ebook to text 26 | func EPUB2Text(file io.ReaderAt, size int64, limit int64) (string, error) { 27 | text := "" 28 | 29 | rc, err := epub.NewReader(file, size) 30 | if err != nil { 31 | return "", nil 32 | } 33 | 34 | // The rootfile (content.opf) lists all of the contents of an epub file. 35 | // There may be multiple rootfiles, although typically there is only one. 36 | book := rc.Rootfiles[0] 37 | 38 | // Print book title. 39 | title := "Title: " + book.Title + "\n\n" 40 | limit -= int64(len(title)) 41 | if limit <= 0 { 42 | return title, nil 43 | } 44 | 45 | // List the IDs of files in the book's spine. 
46 | for _, item := range book.Spine.Itemrefs { 47 | // item.ID was observed to be in one book: cover,titlepage,brief-toc,xpreface_001,xintroduction_001,xepigraph_001,xchapter_001 48 | reader2, err := item.Open() 49 | if err != nil { 50 | continue 51 | } 52 | 53 | itemText, _ := HTML2Text(reader2) 54 | 55 | // check max length 56 | if limit <= int64(len(itemText)) { 57 | itemText = itemText[:limit] 58 | return title + text, nil 59 | } 60 | 61 | text += itemText 62 | limit -= int64(len(itemText)) 63 | } 64 | 65 | if text == "" { 66 | return "", nil 67 | } 68 | 69 | return title + text, nil 70 | } 71 | -------------------------------------------------------------------------------- /HTML 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: HTML 2 Text.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import ( 10 | "io" 11 | "net/url" 12 | "path" 13 | "strings" 14 | 15 | "github.com/IntelligenceX/fileconversion/html2text" 16 | "github.com/PuerkitoBio/goquery" 17 | "github.com/ssor/bom" 18 | "golang.org/x/net/html" 19 | "golang.org/x/net/html/charset" 20 | ) 21 | 22 | // HTML2Text extracts the text from the HTML 23 | func HTML2Text(reader io.Reader) (pageText string, err error) { 24 | // The charset.NewReader ensures that foreign encodings are properly decoded to UTF-8. 25 | // It will make both heuristic checks as well as look for the HTML meta charset tag. 26 | reader, err = charset.NewReader(reader, "") 27 | if err != nil { 28 | return "", err 29 | } 30 | 31 | // The html2text is a forked improved version that converts HTML to human-friendly text. 32 | return html2text.FromReader(reader) 33 | } 34 | 35 | // HTML2TextAndLinks extracts the text from the HTML and all links from and tags of a HTML 36 | // If the base URL is provided, relative links will be converted to absolute ones. 
37 | func HTML2TextAndLinks(reader io.Reader, baseURL string) (pageText string, links []string, err error) { 38 | // The charset.NewReader ensures that foreign encodings are properly decoded to UTF-8. 39 | // It will make both heuristic checks as well as look for the HTML meta charset tag. 40 | reader, err = charset.NewReader(reader, "") 41 | if err != nil { 42 | return "", nil, err 43 | } 44 | 45 | // code from html2text.FromReader to parse the doc 46 | newReader, err := bom.NewReaderWithoutBom(reader) 47 | if err != nil { 48 | return "", nil, err 49 | } 50 | doc, err := html.Parse(newReader) 51 | if err != nil { 52 | return "", nil, err 53 | } 54 | 55 | // get the text 56 | pageText, err = html2text.FromHTMLNode(doc) 57 | if err != nil { 58 | return pageText, nil, err 59 | } 60 | 61 | // get the links 62 | docQ := goquery.NewDocumentFromNode(doc) 63 | docQ.Url, _ = url.Parse(baseURL) 64 | links = processLinks(docQ) 65 | 66 | return pageText, links, err 67 | } 68 | 69 | // ---- below 2 functions are forks from gocrawl/worker.go ---- 70 | 71 | func handleBaseTag(root *url.URL, baseHref string, aHref string) string { 72 | resolvedBase, err := root.Parse(baseHref) 73 | if err != nil { 74 | return "" 75 | } 76 | 77 | parsedURL, err := url.Parse(aHref) 78 | if err != nil { 79 | return "" 80 | } 81 | // If a[href] starts with a /, it overrides the base[href] 82 | if parsedURL.Host == "" && !strings.HasPrefix(aHref, "/") { 83 | aHref = path.Join(resolvedBase.Path, aHref) 84 | } 85 | 86 | resolvedURL, err := resolvedBase.Parse(aHref) 87 | if err != nil { 88 | return "" 89 | } 90 | return resolvedURL.String() 91 | } 92 | 93 | // Scrape the document's content to gather all links 94 | func processLinks(doc *goquery.Document) (result []string) { 95 | // process links via tags 96 | baseURL, _ := doc.Find("base[href]").Attr("href") 97 | urls := doc.Find("a[href]").Map(func(_ int, s *goquery.Selection) string { 98 | val, _ := s.Attr("href") 99 | if baseURL != "" { 100 | val = 
handleBaseTag(doc.Url, baseURL, val) 101 | } 102 | return val 103 | }) 104 | 105 | // all image references via tag 106 | imgURLs := doc.Find("img[src]").Map(func(_ int, s *goquery.Selection) string { 107 | val, _ := s.Attr("src") 108 | if baseURL != "" { 109 | val = handleBaseTag(doc.Url, baseURL, val) 110 | } 111 | return val 112 | }) 113 | urls = append(urls, imgURLs...) 114 | 115 | // form submission links
116 | formURLs := doc.Find("form[action]").Map(func(_ int, s *goquery.Selection) string { 117 | val, _ := s.Attr("action") 118 | if baseURL != "" { 119 | val = handleBaseTag(doc.Url, baseURL, val) 120 | } 121 | return val 122 | }) 123 | urls = append(urls, formURLs...) 124 | 125 | // parse all found URLs 126 | for _, s := range urls { 127 | // If href starts with "#", then it points to this same exact URL, ignore (will fail to parse anyway) 128 | if len(s) > 0 && !strings.HasPrefix(s, "#") { 129 | if parsed, e := url.Parse(s); e == nil { 130 | parsed = doc.Url.ResolveReference(parsed) 131 | 132 | result = append(result, parsed.String()) 133 | //fmt.Printf("%s\n", parsed.String()) 134 | } else { 135 | //w.logFunc(LogIgnored, "ignore on unparsable policy %s: %s", s, e.Error()) 136 | } 137 | } 138 | } 139 | return 140 | } 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <https://unlicense.org>
--------------------------------------------------------------------------------
/MBOX.go:
--------------------------------------------------------------------------------
/*
File Name: MBOX.go
Copyright: 2019 Kleissner Investments s.r.o.
Author: Peter Kleissner

Support for email files in the MBOX format.
*/

package fileconversion

// Check out https://github.com/blabber/mbox ????
--------------------------------------------------------------------------------
/MOBI 2 Text.go:
--------------------------------------------------------------------------------
/*
File Name: MOBI 2 Text.go
Copyright: 2019 Kleissner Investments s.r.o.
Author: Peter Kleissner

Mobi files use HTML tags.

Did not work:
* https://github.com/766b/mobi is only a writer and does not have a useful reader
* https://github.com/peterbn/mobi a fork of above one.

Works:
* https://github.com/neofight/mobi code basically works, just an in-memory open function had to be forked.
14 | 15 | */ 16 | 17 | package fileconversion 18 | 19 | import ( 20 | "bytes" 21 | "encoding/binary" 22 | "errors" 23 | "fmt" 24 | "io" 25 | "strconv" 26 | "strings" 27 | "unicode/utf8" 28 | 29 | html "github.com/levigross/exp-html" 30 | "github.com/neofight/mobi/convert" 31 | "github.com/neofight/mobi/headers" 32 | ) 33 | 34 | // Mobi2Text converts a MOBI ebook to text 35 | func Mobi2Text(file io.ReadSeeker) (string, error) { 36 | 37 | book, _ := mobiOpen(file) 38 | markupText, _ := book.Markup() 39 | 40 | text, _ := HTML2Text(strings.NewReader(markupText)) 41 | 42 | return text, nil 43 | } 44 | 45 | // below code is forked from https://github.com/neofight/mobi MOBIFile.go 46 | 47 | type mobiBook struct { 48 | file io.ReadSeeker 49 | pdbHeader *headers.PDB 50 | palmDOCHeader *headers.PalmDOC 51 | mobiHeader *headers.MOBI 52 | exthHeader *headers.EXTH 53 | } 54 | 55 | func mobiOpen(file io.ReadSeeker) (*mobiBook, error) { 56 | 57 | var book mobiBook 58 | 59 | var err error 60 | 61 | book.file = file 62 | book.pdbHeader, err = headers.ReadPDB(book.file) 63 | 64 | if err != nil { 65 | return nil, fmt.Errorf("unable to read PDB header: %v", err) 66 | } 67 | 68 | book.palmDOCHeader, err = headers.ReadPalmDOC(book.file) 69 | 70 | if err != nil { 71 | return nil, fmt.Errorf("unable to read PalmDOC header: %v", err) 72 | } 73 | 74 | book.mobiHeader, err = headers.ReadMOBI(book.file) 75 | 76 | if err != nil { 77 | return nil, fmt.Errorf("unable to read MOBI header: %v", err) 78 | } 79 | 80 | if book.mobiHeader.EXTHHeaderPresent { 81 | 82 | book.exthHeader, err = headers.ReadEXTH(book.file) 83 | 84 | if err != nil { 85 | return nil, fmt.Errorf("unable to read EXTH header: %v", err) 86 | } 87 | } 88 | 89 | return &book, nil 90 | } 91 | 92 | func (mobiFile mobiBook) Cover() ([]byte, error) { 93 | 94 | for _, r := range mobiFile.exthHeader.Records { 95 | 96 | if r.RecordType == 201 { 97 | coverIndex := mobiFile.mobiHeader.FirstImageIndex + convert.FromUint32(r.RecordData) 98 | 
99 | record := mobiFile.pdbHeader.Records[coverIndex] 100 | nextRecord := mobiFile.pdbHeader.Records[coverIndex+1] 101 | 102 | coverOffset := record.RecordDataOffset 103 | coverSize := nextRecord.RecordDataOffset - coverOffset 104 | 105 | _, err := mobiFile.file.Seek(int64(coverOffset), 0) 106 | 107 | if err != nil { 108 | return nil, fmt.Errorf("unable to find cover: %v", err) 109 | } 110 | 111 | cover := make([]byte, coverSize) 112 | 113 | err = binary.Read(mobiFile.file, binary.BigEndian, &cover) 114 | 115 | if err != nil { 116 | return nil, fmt.Errorf("unable to read cover: %v", err) 117 | } 118 | 119 | return cover, nil 120 | } 121 | } 122 | 123 | return nil, nil 124 | } 125 | 126 | func (mobiFile mobiBook) Markup() (string, error) { 127 | 128 | startIndex := mobiFile.mobiHeader.FirstContentIndex 129 | endIndex := mobiFile.mobiHeader.FirstNonBookIndex - 1 130 | 131 | if endIndex > len(mobiFile.pdbHeader.Records)-2 { 132 | endIndex = len(mobiFile.pdbHeader.Records) - 2 133 | } 134 | 135 | if endIndex < 0 || startIndex < 0 || startIndex >= len(mobiFile.pdbHeader.Records) { 136 | return "", fmt.Errorf("Invalid header") 137 | } 138 | 139 | text := make([]byte, 0) 140 | 141 | for index := startIndex; index <= endIndex; index++ { 142 | 143 | record := mobiFile.pdbHeader.Records[index] 144 | nextRecord := mobiFile.pdbHeader.Records[index+1] 145 | 146 | recordOffset := record.RecordDataOffset 147 | recordSize := nextRecord.RecordDataOffset - recordOffset 148 | 149 | _, err := mobiFile.file.Seek(int64(recordOffset), 0) 150 | 151 | if err != nil { 152 | return "", fmt.Errorf("unable to find text: %v", err) 153 | } 154 | 155 | recordData := make([]byte, recordSize) 156 | 157 | err = binary.Read(mobiFile.file, binary.BigEndian, &recordData) 158 | 159 | if err != nil { 160 | return "", fmt.Errorf("unable to read text: %v", err) 161 | } 162 | 163 | recordText := fromLZ77(recordData) 164 | 165 | text = append(text, recordText...) 
166 | } 167 | 168 | text = text[:mobiFile.palmDOCHeader.TextLength] 169 | 170 | if !utf8.Valid(text) { 171 | return "", errors.New("unable to decompress text") 172 | } 173 | 174 | return string(text), nil 175 | } 176 | 177 | func (mobiFile mobiBook) Text() (string, error) { 178 | 179 | markup, err := mobiFile.Markup() 180 | 181 | if err != nil { 182 | return "", fmt.Errorf("unable to read markup: %v", err) 183 | } 184 | 185 | pos, err := getTOCPosition(markup) 186 | 187 | if err != nil { 188 | return "", fmt.Errorf("unable to locate TOC: %v", err) 189 | } 190 | 191 | bookmarks, err := parseTOC(markup[pos:]) 192 | 193 | text := make([]string, 0) 194 | 195 | for i := range bookmarks { 196 | 197 | start := bookmarks[i] 198 | var end int 199 | 200 | if i < len(bookmarks)-1 { 201 | end = bookmarks[i+1] 202 | } else { 203 | end = pos 204 | } 205 | 206 | paragraphs, err := parseChapter(markup[start:end]) 207 | 208 | if err != nil { 209 | return "", fmt.Errorf("unable to parse chapter: %v", err) 210 | } 211 | 212 | text = append(text, paragraphs...) 
213 | } 214 | 215 | return strings.Join(text, "\n\n"), nil 216 | } 217 | 218 | func getTOCPosition(markup string) (int, error) { 219 | 220 | htmlReader := strings.NewReader(markup) 221 | 222 | tokenizer := html.NewTokenizer(htmlReader) 223 | 224 | for { 225 | tokenType := tokenizer.Next() 226 | 227 | switch { 228 | case tokenType == html.ErrorToken: 229 | return 0, fmt.Errorf("unable to find reference element") 230 | case tokenType == html.SelfClosingTagToken: 231 | token := tokenizer.Token() 232 | 233 | if token.Data == "reference" { 234 | filepos, err := attr(token, "filepos") 235 | 236 | if err != nil { 237 | return 0, errors.New("filepos attribute missing") 238 | } 239 | 240 | pos, err := strconv.Atoi(filepos) 241 | 242 | if err != nil { 243 | return 0, errors.New("filepos attribute invalid") 244 | } 245 | 246 | return pos, nil 247 | } 248 | } 249 | } 250 | } 251 | 252 | func parseTOC(markup string) ([]int, error) { 253 | 254 | toc := make([]int, 0) 255 | 256 | htmlReader := strings.NewReader(markup) 257 | 258 | tokenizer := html.NewTokenizer(htmlReader) 259 | 260 | for { 261 | tokenType := tokenizer.Next() 262 | 263 | switch { 264 | case tokenType == html.ErrorToken: 265 | return toc[1:], nil 266 | case tokenType == html.StartTagToken: 267 | token := tokenizer.Token() 268 | 269 | if token.Data == "a" { 270 | filepos, err := attr(token, "filepos") 271 | 272 | if err != nil { 273 | continue 274 | } 275 | 276 | pos, err := strconv.Atoi(filepos) 277 | 278 | if err != nil { 279 | return nil, errors.New("filepos attribute invalid") 280 | } 281 | 282 | toc = append(toc, pos) 283 | } 284 | } 285 | } 286 | } 287 | 288 | func parseChapter(markup string) ([]string, error) { 289 | 290 | paragraphs := make([]string, 0) 291 | 292 | htmlReader := strings.NewReader(markup) 293 | 294 | tokenizer := html.NewTokenizer(htmlReader) 295 | 296 | for { 297 | tokenType := tokenizer.Next() 298 | 299 | switch { 300 | case tokenType == html.ErrorToken: 301 | return paragraphs, nil 302 | 
case tokenType == html.TextToken: 303 | token := tokenizer.Token() 304 | 305 | if len(strings.TrimSpace(token.Data)) > 0 { 306 | paragraphs = append(paragraphs, strings.TrimSpace(token.Data)) 307 | } 308 | } 309 | } 310 | } 311 | 312 | func attr(t html.Token, name string) (string, error) { 313 | for _, a := range t.Attr { 314 | if a.Key == name { 315 | return a.Val, nil 316 | } 317 | } 318 | 319 | return "", fmt.Errorf("attribute %v not found", name) 320 | } 321 | 322 | // fromLZ77 is forked from conversion.go because of index out of range panic 323 | func fromLZ77(text []byte) []byte { 324 | 325 | var reader = bytes.NewReader(text) 326 | 327 | var buffer [4096]byte 328 | var pos int 329 | 330 | for { 331 | if pos == 4096 { 332 | break 333 | } 334 | 335 | c, err := reader.ReadByte() 336 | 337 | if err == io.EOF { 338 | break 339 | } 340 | 341 | switch { 342 | 343 | // 0x00: "1 literal" copy that byte unmodified to the decompressed stream. 344 | case c == 0x00: 345 | buffer[pos] = c 346 | pos++ 347 | 348 | // 0x09 to 0x7f: "1 literal" copy that byte unmodified to the decompressed stream. 349 | case c >= 0x09 && c <= 0x7f: 350 | buffer[pos] = c 351 | pos++ 352 | 353 | // 0x01 to 0x08: "literals": the byte is interpreted as a count from 1 to 8, and that many literals are copied 354 | // unmodified from the compressed stream to the decompressed stream. 355 | case c >= 0x01 && c <= 0x08: 356 | length := int(c) 357 | for i := 0; i < length; i++ { 358 | c, err = reader.ReadByte() 359 | buffer[pos] = c 360 | pos++ 361 | } 362 | 363 | // 0x80 to 0xbf: "length, distance" pair: the 2 leftmost bits of this byte ('10') are discarded, and the 364 | // following 6 bits are combined with the 8 bits of the next byte to make a 14 bit "distance, length" item. 365 | // Those 14 bits are broken into 11 bits of distance backwards from the current location in the uncompressed 366 | // text, and 3 bits of length to copy from that point (copying n+3 bytes, 3 to 10 bytes). 
367 | case c >= 0x80 && c <= 0xbf: 368 | c2, _ := reader.ReadByte() 369 | 370 | distance := (int(c&0x3F)<<8 | int(c2)) >> 3 371 | length := int(c2&0x07) + 3 372 | 373 | start := pos - distance 374 | 375 | for i := 0; i < length; i++ { 376 | // check if index is in range 377 | if start+i >= len(buffer) || start+i < 0 { 378 | return buffer[:pos] 379 | } 380 | 381 | c = buffer[start+i] 382 | buffer[pos] = c 383 | pos++ 384 | } 385 | 386 | // 0xc0 to 0xff: "byte pair": this byte is decoded into 2 characters: a space character, and a letter formed 387 | // from this byte XORed with 0x80. 388 | case c >= 0xc0: 389 | buffer[pos] = ' ' 390 | pos++ 391 | buffer[pos] = c ^ 0x80 392 | pos++ 393 | } 394 | } 395 | 396 | return buffer[:pos] 397 | } 398 | 399 | // IsFileMOBI checks if the data indicates a MOBI file 400 | func IsFileMOBI(data []byte) bool { 401 | // Mobi files have a header and there is the signature "BOOKMOBI" or "TEXtREAd". 402 | // There are many more more potential signatures https://sno.phy.queensu.ca/~phil/exiftool/TagNames/Palm.html 403 | 404 | // Fork from code here http://will.tip.dhappy.org/lib/calibre/dedrm/mobidedrm.py 405 | // if self.header[0x3C:0x3C+8] != 'BOOKMOBI' and self.header[0x3C:0x3C+8] != 'TEXtREAd': 406 | // raise DrmException(u"Invalid file format") 407 | 408 | if len(data) < 0x3C+8 { 409 | return false 410 | } 411 | 412 | signature := data[0x3C : 0x3C+8] 413 | 414 | return bytes.Equal(signature, []byte("BOOKMOBI")) || bytes.Equal(signature, []byte("TEXtREAd")) 415 | } 416 | -------------------------------------------------------------------------------- /ODS 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: ODS 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | Code for parsing Open Document Spreadsheet files. ZIP-compressed XML-based file format. 
7 | */ 8 | 9 | package fileconversion 10 | 11 | import ( 12 | "io" 13 | 14 | "github.com/IntelligenceX/fileconversion/odf/ods" 15 | ) 16 | 17 | // ODS2Text extracts text of an OpenDocument Spreadsheet 18 | // Size is the full size of the input file. 19 | func ODS2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error) { 20 | 21 | var doc ods.Doc 22 | 23 | f, err := ods.NewReader(file, size) 24 | if err != nil { 25 | return 0, err 26 | } 27 | defer f.Close() 28 | if err := f.ParseContent(&doc); err != nil { 29 | return 0, err 30 | } 31 | 32 | for n, sheet := range doc.Table { 33 | rows := sheet.Strings() 34 | if err = writeOutput(writer, []byte(xlGenerateSheetTitle(sheet.Name, n, int(len(rows)))), &written, &limit); err != nil || limit == 0 { 35 | return written, err 36 | } 37 | 38 | for _, row := range rows { 39 | 40 | rowText := "" 41 | 42 | // go through all columns 43 | for m, text := range row { 44 | if text != "" { 45 | text = cleanCell(text) 46 | 47 | if m > 0 { 48 | rowText += ", " 49 | } 50 | rowText += text 51 | } 52 | } 53 | 54 | rowText += "\n" 55 | 56 | if err = writeOutput(writer, []byte(rowText), &written, &limit); err != nil || limit == 0 { 57 | return written, err 58 | } 59 | } 60 | } 61 | 62 | return written, nil 63 | } 64 | 65 | // ODS2Cells converts an ODS file to individual cells 66 | // Size is the full size of the input file. 
67 | func ODS2Cells(file io.ReaderAt, size int64) (cells []string, err error) { 68 | 69 | var doc ods.Doc 70 | 71 | f, err := ods.NewReader(file, size) 72 | if err != nil { 73 | return nil, err 74 | } 75 | defer f.Close() 76 | if err := f.ParseContent(&doc); err != nil { 77 | return nil, err 78 | } 79 | 80 | for _, sheet := range doc.Table { 81 | for _, row := range sheet.Strings() { 82 | for _, text := range row { 83 | if text != "" { 84 | text = cleanCell(text) 85 | cells = append(cells, text) 86 | } 87 | } 88 | } 89 | } 90 | 91 | return 92 | } 93 | -------------------------------------------------------------------------------- /ODT 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: ODT 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | Fork from https://github.com/lu4p/cat/blob/master/odtxt/odtreader.go. 7 | The extract discards any formatting. The output is one large string without new-lines at the current time. 8 | */ 9 | 10 | package fileconversion 11 | 12 | import ( 13 | "archive/zip" 14 | "errors" 15 | "io" 16 | "io/ioutil" 17 | 18 | "github.com/IntelligenceX/fileconversion/html2text" 19 | ) 20 | 21 | // ODT2Text extracts text of an OpenDocument Text file 22 | // Size is the full size of the input file. 
23 | func ODT2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error) { 24 | f, err := odtNewReader(file, size) 25 | if err != nil { 26 | return 0, err 27 | } 28 | 29 | text, err := f.GetTxt() 30 | if err != nil { 31 | return 0, err 32 | } 33 | 34 | err = writeOutput(writer, []byte(text), &written, &limit) 35 | 36 | return 37 | } 38 | 39 | //odt zip struct 40 | type odt struct { 41 | zipFileReader *zip.Reader 42 | Files []*zip.File 43 | FilesContent map[string][]byte 44 | Content string 45 | } 46 | 47 | func odtNewReader(file io.ReaderAt, size int64) (*odt, error) { 48 | reader, err := zip.NewReader(file, size) 49 | if err != nil { 50 | return nil, err 51 | } 52 | 53 | odtDoc := odt{ 54 | zipFileReader: reader, 55 | Files: reader.File, 56 | FilesContent: map[string][]byte{}, 57 | } 58 | 59 | for _, f := range odtDoc.Files { 60 | contents, _ := odtDoc.retrieveFileContents(f.Name) 61 | odtDoc.FilesContent[f.Name] = contents 62 | } 63 | 64 | return &odtDoc, nil 65 | } 66 | 67 | //Read all files contents 68 | func (d *odt) retrieveFileContents(filename string) ([]byte, error) { 69 | var file *zip.File 70 | for _, f := range d.Files { 71 | if f.Name == filename { 72 | file = f 73 | break 74 | } 75 | } 76 | 77 | if file == nil { 78 | return nil, errors.New(filename + " file not found") 79 | } 80 | 81 | reader, err := file.Open() 82 | if err != nil { 83 | return nil, err 84 | } 85 | return ioutil.ReadAll(reader) 86 | } 87 | 88 | func (d *odt) GetTxt() (content string, err error) { 89 | xmlData := d.FilesContent["content.xml"] 90 | return xml2Text(xmlData) 91 | //content, err = d.listP(xmlData) 92 | } 93 | 94 | /* 95 | // listP for w:p tag value 96 | func (d *odt) listP(data []byte) (string, error) { 97 | v := new(odtQuery) 98 | err := xml.Unmarshal(data, &v) 99 | if err != nil { 100 | return "", err 101 | } 102 | var result string 103 | for _, text := range v.Body.Text { 104 | for _, line := range text.P { 105 | if line == "" { 106 | 
continue 107 | } 108 | result += line + "\n" 109 | } 110 | } 111 | return result, nil 112 | } 113 | 114 | type odtQuery struct { 115 | XMLName xml.Name `xml:"document-content"` 116 | Body odtBody `xml:"body"` 117 | } 118 | type odtBody struct { 119 | Text []odtText `xml:"text"` 120 | } 121 | type odtText struct { 122 | P []string `xml:"p"` 123 | } 124 | */ 125 | 126 | // xml2Text extracts any text from XML data. 127 | // Note that any formatting will be lost. The output is one large string without new-lines. 128 | func xml2Text(data []byte) (string, error) { 129 | return html2text.FromString(string(data)) 130 | } 131 | -------------------------------------------------------------------------------- /PDF 2 Image.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: PDF 2 Image.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import ( 10 | "image" 11 | "io" 12 | "strconv" 13 | 14 | pdfcontent "github.com/unidoc/unipdf/contentstream" 15 | pdfcore "github.com/unidoc/unipdf/core" 16 | pdf "github.com/unidoc/unipdf/model" 17 | ) 18 | 19 | var xObjectImages = 0 20 | var inlineImages = 0 21 | 22 | // ImageResult contains an extracted image 23 | type ImageResult struct { 24 | Image image.Image 25 | Name string 26 | } 27 | 28 | // PDFExtractImages extracts all images from a PDF file 29 | func PDFExtractImages(input io.ReadSeeker) (images []ImageResult, err error) { 30 | 31 | pdfReader, err := pdf.NewPdfReader(input) 32 | if err != nil { 33 | return nil, err 34 | } 35 | 36 | isEncrypted, err := pdfReader.IsEncrypted() 37 | if err != nil { 38 | return nil, err 39 | } 40 | 41 | // Try decrypting with an empty one. 42 | if isEncrypted { 43 | auth, err := pdfReader.Decrypt([]byte("")) 44 | if err != nil { 45 | // Encrypted and we cannot do anything about it. 
46 | return nil, err 47 | } 48 | if !auth { 49 | //fmt.Println("Need to decrypt with password") 50 | return nil, nil 51 | } 52 | } 53 | 54 | numPages, err := pdfReader.GetNumPages() 55 | if err != nil { 56 | return nil, err 57 | } 58 | //fmt.Printf("PDF Num Pages: %d\n", numPages) 59 | 60 | for i := 0; i < numPages; i++ { 61 | //fmt.Printf("-----\nPage %d:\n", i+1) 62 | 63 | page, err := pdfReader.GetPage(i + 1) 64 | if err != nil { 65 | return nil, err 66 | } 67 | 68 | // List images on the page. 69 | rgbImages, err := extractImagesOnPage(page) 70 | if err != nil { 71 | return nil, err 72 | } 73 | _ = rgbImages 74 | 75 | for idx, img := range rgbImages { 76 | fname := "p" + strconv.Itoa(i+1) + "_" + strconv.Itoa(idx) + ".jpg" 77 | 78 | gimg, err := img.ToGoImage() 79 | if err != nil { 80 | return nil, err 81 | } 82 | 83 | images = append(images, ImageResult{Image: gimg, Name: fname}) 84 | } 85 | } 86 | 87 | return images, nil 88 | } 89 | 90 | func extractImagesOnPage(page *pdf.PdfPage) ([]*pdf.Image, error) { 91 | contents, err := page.GetAllContentStreams() 92 | if err != nil { 93 | return nil, err 94 | } 95 | 96 | return extractImagesInContentStream(contents, page.Resources) 97 | } 98 | 99 | func extractImagesInContentStream(contents string, resources *pdf.PdfPageResources) ([]*pdf.Image, error) { 100 | rgbImages := []*pdf.Image{} 101 | cstreamParser := pdfcontent.NewContentStreamParser(contents) 102 | operations, err := cstreamParser.Parse() 103 | if err != nil { 104 | return nil, err 105 | } 106 | 107 | processedXObjects := map[string]bool{} 108 | 109 | // Range through all the content stream operations. 110 | for _, op := range *operations { 111 | if op.Operand == "BI" && len(op.Params) == 1 { 112 | // BI: Inline image. 
113 | 114 | iimg, ok := op.Params[0].(*pdfcontent.ContentStreamInlineImage) 115 | if !ok { 116 | continue 117 | } 118 | 119 | img, err := iimg.ToImage(resources) 120 | if err != nil { 121 | return nil, err 122 | } 123 | 124 | cs, err := iimg.GetColorSpace(resources) 125 | if err != nil { 126 | return nil, err 127 | } 128 | if cs == nil { 129 | // Default if not specified? 130 | cs = pdf.NewPdfColorspaceDeviceGray() 131 | } 132 | //fmt.Printf("Cs: %T\n", cs) 133 | 134 | rgbImg, err := cs.ImageToRGB(*img) 135 | if err != nil { 136 | return nil, err 137 | } 138 | 139 | rgbImages = append(rgbImages, &rgbImg) 140 | inlineImages++ 141 | } else if op.Operand == "Do" && len(op.Params) == 1 { 142 | // Do: XObject. 143 | name := op.Params[0].(*pdfcore.PdfObjectName) 144 | 145 | // Only process each one once. 146 | _, has := processedXObjects[string(*name)] 147 | if has { 148 | continue 149 | } 150 | processedXObjects[string(*name)] = true 151 | 152 | _, xtype := resources.GetXObjectByName(*name) 153 | if xtype == pdf.XObjectTypeImage { 154 | //fmt.Printf(" XObject Image: %s\n", *name) 155 | 156 | ximg, err := resources.GetXObjectImageByName(*name) 157 | if err != nil { 158 | return nil, err 159 | } 160 | 161 | img, err := ximg.ToImage() 162 | if err != nil { 163 | return nil, err 164 | } 165 | 166 | rgbImg, err := ximg.ColorSpace.ImageToRGB(*img) 167 | if err != nil { 168 | return nil, err 169 | } 170 | rgbImages = append(rgbImages, &rgbImg) 171 | xObjectImages++ 172 | } else if xtype == pdf.XObjectTypeForm { 173 | // Go through the XObject Form content stream. 
174 | xform, err := resources.GetXObjectFormByName(*name) 175 | if err != nil { 176 | return nil, err 177 | } 178 | 179 | formContent, err := xform.GetContentStream() 180 | if err != nil { 181 | return nil, err 182 | } 183 | 184 | // Process the content stream in the Form object too: 185 | formResources := xform.Resources 186 | if formResources == nil { 187 | formResources = resources 188 | } 189 | 190 | // Process the content stream in the Form object too: 191 | formRgbImages, err := extractImagesInContentStream(string(formContent), formResources) 192 | if err != nil { 193 | return nil, err 194 | } 195 | rgbImages = append(rgbImages, formRgbImages...) 196 | } 197 | } 198 | } 199 | 200 | return rgbImages, nil 201 | } 202 | -------------------------------------------------------------------------------- /PDF 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: PDF 2 Text.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | This code uses the commercial library UniDoc https://unidoc.io/ to extract text from PDFs. 7 | */ 8 | 9 | package fileconversion 10 | 11 | import ( 12 | "io" 13 | "strconv" 14 | "strings" 15 | "time" 16 | 17 | "github.com/unidoc/unipdf/core" 18 | "github.com/unidoc/unipdf/extractor" 19 | pdf "github.com/unidoc/unipdf/model" 20 | 21 | "github.com/unidoc/unipdf/common/license" 22 | ) 23 | 24 | // InitPDFLicense initializes the PDF license 25 | func InitPDFLicense(key, name string) { 26 | // load the unidoc license (v3) 27 | license.SetLicenseKey(key, name) 28 | } 29 | 30 | // PDFListContentStreams writes all text streams in a PDF to the writer 31 | // It returns the number of characters attempted written (excluding "Page N" and new-lines) and an error, if any. It can be used to determine whether any text was extracted. 32 | // The parameter size is the max amount of bytes (not characters) to write out. 
33 | func PDFListContentStreams(f io.ReadSeeker, w io.Writer, size int64) (written int64, err error) { 34 | 35 | pdfReader, err := pdf.NewPdfReader(f) 36 | if err != nil { 37 | return 0, err 38 | } 39 | 40 | isEncrypted, err := pdfReader.IsEncrypted() 41 | if err != nil { 42 | return 0, err 43 | } 44 | 45 | if isEncrypted { 46 | _, err = pdfReader.Decrypt([]byte("")) 47 | if err != nil { 48 | return 0, err 49 | } 50 | } 51 | 52 | numPages, err := pdfReader.GetNumPages() 53 | if err != nil { 54 | return 0, err 55 | } 56 | 57 | for i := 0; i < numPages && size > 0; i++ { 58 | pageNum := i + 1 59 | 60 | page, err := pdfReader.GetPage(pageNum) 61 | if err != nil { 62 | return written, err 63 | } 64 | 65 | ex, err := extractor.New(page) 66 | if err != nil { 67 | return written, err 68 | } 69 | 70 | txt, err := ex.ExtractText() 71 | if err != nil { 72 | return written, err 73 | } 74 | 75 | // use the extracted text 76 | txtNL := "" 77 | if written > 0 { 78 | txtNL += "\n\n" 79 | } 80 | 81 | textB := []byte(txtNL + "---- Page " + strconv.Itoa(pageNum) + " ----\n") 82 | 83 | // empty page? skip if so. 84 | txt = strings.TrimSpace(txt) 85 | if len(txt) == 0 { 86 | continue 87 | } 88 | 89 | textB = append(textB, []byte(txt)...) 
90 | if int64(len(textB)) > size { 91 | textB = textB[:size] 92 | } 93 | 94 | if _, err = w.Write(textB); err != nil { 95 | return written, err 96 | } 97 | 98 | size -= int64(len(textB)) 99 | written += int64(len(txt)) 100 | } 101 | 102 | return written, nil 103 | } 104 | 105 | // PDFGetCreationDate tries to get the creation date 106 | func PDFGetCreationDate(f io.ReadSeeker) (date time.Time, valid bool) { 107 | // Below code is forked from https://github.com/unidoc/unidoc-examples/blob/master/pdf/metadata/pdf_metadata_get_docinfo.go 108 | pdfReader, err := pdf.NewPdfReader(f) 109 | if err != nil { 110 | return date, false 111 | } 112 | 113 | trailerDict, err := pdfReader.GetTrailer() 114 | if err != nil || trailerDict == nil { 115 | return date, false 116 | } 117 | 118 | var infoDict *core.PdfObjectDictionary 119 | 120 | infoObj := trailerDict.Get("Info") 121 | switch t := infoObj.(type) { 122 | case *core.PdfObjectReference: 123 | infoRef := t 124 | infoObj, err = pdfReader.GetIndirectObjectByNumber(int(infoRef.ObjectNumber)) 125 | infoObj = core.TraceToDirectObject(infoObj) 126 | if err != nil { 127 | return date, false 128 | } 129 | infoDict, _ = infoObj.(*core.PdfObjectDictionary) 130 | case *core.PdfObjectDictionary: 131 | infoDict = t 132 | } 133 | 134 | if infoDict == nil { 135 | return date, false 136 | } 137 | 138 | if str, has := infoDict.Get("CreationDate").(*core.PdfObjectString); has { 139 | creationDateA := strings.TrimPrefix(str.String(), "D:") 140 | 141 | time1, err := time.Parse("20060102150405-07'00'", creationDateA) 142 | if err == nil { 143 | return time1.UTC(), true 144 | } 145 | } 146 | 147 | return date, false 148 | } 149 | -------------------------------------------------------------------------------- /PPT 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: PPT 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 
4 | Author: Peter Kleissner 5 | 6 | Placeholder file until PPT conversion code 2 text is available. 7 | */ 8 | 9 | package fileconversion 10 | 11 | import "bytes" 12 | 13 | // IsFilePPT checks if the data indicates a PPT file 14 | // PPT has multiple signature according to https://www.filesignatures.net/index.php?page=search&search=PPT&mode=EXT, D0 CF 11 E0 A1 B1 1A E1. This overlaps with others (including DOC ans XLS). 15 | func IsFilePPT(data []byte) bool { 16 | return bytes.HasPrefix(data, []byte{0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1}) 17 | } 18 | -------------------------------------------------------------------------------- /PPTX 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: PPTX 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | This code is a fork from https://github.com/mr-tim/rol-o-decks/blob/master/indexer/indexer.go. 7 | */ 8 | 9 | package fileconversion 10 | 11 | import ( 12 | "archive/zip" 13 | "bytes" 14 | "io" 15 | "sort" 16 | "strconv" 17 | "strings" 18 | 19 | "gopkg.in/xmlpath.v2" 20 | ) 21 | 22 | // PPTXDocument is a PPTX document loaded into memory 23 | type PPTXDocument struct { 24 | Slides []PPTXSlide 25 | } 26 | 27 | // PPTXSlide is a single PPTX slide 28 | type PPTXSlide struct { 29 | SlideNumber int 30 | //ThumbnailBase64 string 31 | TextContent string 32 | } 33 | 34 | // SlideNumberSorter is used for sorting 35 | type SlideNumberSorter []PPTXSlide 36 | 37 | func (a SlideNumberSorter) Len() int { return len(a) } 38 | func (a SlideNumberSorter) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 39 | func (a SlideNumberSorter) Less(i, j int) bool { return a[i].SlideNumber < a[j].SlideNumber } 40 | 41 | // PPTX2Text extracts text of a PowerPoint document 42 | // Size is the full size of the input file. 
43 | func PPTX2Text(file io.ReaderAt, size int64) (string, error) { 44 | 45 | r, err := zip.NewReader(file, size) 46 | if err != nil { 47 | return "", err 48 | } 49 | 50 | doc := parsePPTXDocument(r) 51 | 52 | return doc.AsText(), nil 53 | } 54 | 55 | // IsFilePPTX checks if the data indicates a PPTX file 56 | // PPTX has a signature of 50 4B 03 04 57 | // Warning: This collides with ZIP, DOCX and other zip-based files. 58 | func IsFilePPTX(data []byte) bool { 59 | return bytes.HasPrefix(data, []byte{0x50, 0x4B, 0x03, 0x04}) 60 | } 61 | 62 | func extractSlideContent(f *zip.File) string { 63 | p := xmlpath.MustCompile("//t") 64 | zr, _ := f.Open() 65 | defer zr.Close() 66 | root, _ := xmlpath.Parse(zr) 67 | i := p.Iter(root) 68 | content := make([]string, 0) 69 | for i.Next() { 70 | n := i.Node() 71 | content = append(content, n.String()) 72 | } 73 | textContent := strings.Join(content, "\n") 74 | return textContent 75 | } 76 | 77 | func parsePPTXDocument(r *zip.Reader) (doc PPTXDocument) { 78 | 79 | for _, f := range r.File { 80 | if strings.HasPrefix(f.Name, "ppt/slides/") && !strings.HasPrefix(f.Name, "ppt/slides/_rels") { 81 | slideNumberStr := strings.TrimSuffix(strings.TrimPrefix(strings.ToLower(f.Name), "ppt/slides/slide"), ".xml") 82 | slideNumber, _ := strconv.Atoi(slideNumberStr) 83 | 84 | // grab the text content 85 | doc.Slides = append(doc.Slides, PPTXSlide{ 86 | SlideNumber: slideNumber, 87 | TextContent: extractSlideContent(f), 88 | //ThumbnailBase64: generateThumbnail(fileToIndex, slideNumber), 89 | }) 90 | } 91 | } 92 | 93 | sort.Sort(SlideNumberSorter(doc.Slides)) 94 | 95 | return doc 96 | } 97 | 98 | // AsText returns the text on all slides 99 | func (doc PPTXDocument) AsText() (text string) { 100 | 101 | for n, slide := range doc.Slides { 102 | if slide.TextContent == "" { // skip empty slides 103 | continue 104 | } 105 | 106 | if n > 0 && text != "" { 107 | text += "\n\n" 108 | } 109 | 110 | text += "Slide " + strconv.Itoa(n+1) + ":\n" 111 | 
text += slide.TextContent 112 | } 113 | 114 | return text 115 | } 116 | -------------------------------------------------------------------------------- /Picture.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: Picture.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import ( 10 | "bytes" 11 | "image" 12 | _ "image/gif" // automatic registration 13 | "image/jpeg" 14 | _ "image/png" // REQUIRED! automatic registration of PNG decoding for image.Decode 15 | 16 | _ "golang.org/x/image/bmp" // Required for BMP decoding 17 | _ "golang.org/x/image/tiff" // Required for TIFF decoding 18 | 19 | "github.com/nfnt/resize" 20 | ) 21 | 22 | // IsExcessiveLargePicture checks if the picture has reasonable width and height, preventing potential DoS when decoding it 23 | // This protects against this problem: If the image claims to be large (in terms of width & height), jpeg.Decode may use a lot of memory, see https://github.com/golang/go/issues/10532. 24 | func IsExcessiveLargePicture(Picture []byte) (excessive bool, err error) { 25 | config, _, err := image.DecodeConfig(bytes.NewBuffer(Picture)) 26 | if err != nil { 27 | return false, err 28 | } 29 | 30 | return config.Width > 7680 || config.Height > 4320, nil 31 | } 32 | 33 | // CompressJPEG compresses a JPEG picture according to the input 34 | // Warning: If the image claims to be large (in terms of width & height), this may use a lot of memory. Use IsExcessiveLargePicture first. 
35 | func CompressJPEG(Picture []byte, quality int) (compressed []byte) { 36 | if quality == 100 { // nothing todo on perfect quality 37 | return Picture 38 | } 39 | 40 | image, err := jpeg.Decode(bytes.NewBuffer(Picture)) 41 | if err != nil { 42 | return Picture 43 | } 44 | 45 | target := bytes.NewBuffer(make([]byte, 0, len(Picture))) 46 | 47 | err = jpeg.Encode(target, image, &jpeg.Options{Quality: quality}) 48 | if err != nil { 49 | return Picture 50 | } 51 | 52 | return target.Bytes() 53 | } 54 | 55 | // ResizeCompressPicture scales a picture down and compresses it. It accepts GIF, JPEG, PNG as input but output will always be JPEG. 56 | // Quality specifies the output JPEG quality 0-100. Anything below 75 will noticably reduce the picture quality. 57 | // Warning: If the image claims to be large (in terms of width & height), this may use a lot of memory. Use IsExcessiveLargePicture first. 58 | // Scaling a picture down is optional and only done if MaxWidth and MaxHeight are not 0. Even without rescaling, this function is useful to convert a picture into JPEG. 
59 | func ResizeCompressPicture(Picture []byte, Quality int, MaxWidth, MaxHeight uint) (compressed []byte, err error) { 60 | 61 | // decode the image 62 | img, _, err := image.Decode(bytes.NewBuffer(Picture)) 63 | if err != nil { // discard images that can't be decoded 64 | return nil, err 65 | } 66 | 67 | // resize if required 68 | if MaxWidth != 0 && MaxHeight != 0 { 69 | img = resize.Thumbnail(MaxWidth, MaxHeight, img, resize.Lanczos3) 70 | } 71 | 72 | // encode as JPEG with the specified quality 73 | target := bytes.NewBuffer(make([]byte, 0, len(Picture))) 74 | 75 | err = jpeg.Encode(target, img, &jpeg.Options{Quality: Quality}) 76 | if err != nil { 77 | return nil, err 78 | } 79 | 80 | return target.Bytes(), nil 81 | } 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fileconversion 2 | 3 | This is a Go library to convert various file formats into plaintext and provide related useful functions. 4 | 5 | This library is used for https://intelx.io and was successfully tested over 184 million individual files. It is partly written from scratch, partly forked from open source and partly a rewrite of existing code. Many existing libraries lack stability and functionality and this libraries solves that. 6 | 7 | We welcome any contributions - please open issues for any feature requests, bugs, and other related issues. 

It supports the following file formats for plaintext conversion:

* Word: DOC, DOCX, RTF, ODT
* Excel: XLS, XLSX, ODS
* PowerPoint: PPTX
* PDF
* Ebook: EPUB, MOBI
* Website: HTML

Functions for compressed and container files:

* Decompress files: GZ, BZ, BZ2, XZ
* Extract files from containers: ZIP, RAR, 7Z, TAR

Picture related functions:

* Check if pictures are excessively large
* Compress (and convert) pictures to JPEG: GIF, JPEG, PNG, BMP, TIFF
* Resize and compress pictures
* Extract pictures from PDF files

To download this library:

```
go get -u github.com/IntelligenceX/fileconversion
```

And then use it like:

```go
package main

import (
	"bytes"
	"fmt"
	"os"

	"github.com/IntelligenceX/fileconversion"
)

const sizeLimit = 2 * 1024 * 1024 // 2 MB

func main() {
	// extract text from an XLSX file
	file, err := os.Open("Test.xlsx")
	if err != nil {
		fmt.Printf("Error opening file: %s\n", err)
		return
	}

	defer file.Close()
	stat, _ := file.Stat()

	buffer := bytes.NewBuffer(make([]byte, 0, sizeLimit))

	fileconversion.XLSX2Text(file, stat.Size(), buffer, sizeLimit, -1)

	fmt.Println(buffer.String())
}
```


## Functions

The package exports the following functions:

```go
DOCX2Text(file io.ReaderAt, size int64) (string, error)
EPUB2Text(file io.ReaderAt, size int64, limit int64) (string, error)
HTML2Text(reader io.Reader) (pageText string, err error)
HTML2TextAndLinks(reader io.Reader, baseURL string) (pageText string, links []string, err error)
Mobi2Text(file io.ReadSeeker) (string, error)
ODS2Text(file io.ReaderAt,
size int64, writer io.Writer, limit int64) (written int64, err error) 83 | ODT2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error) 84 | PDFListContentStreams(f io.ReadSeeker, w io.Writer, size int64) (written int64, err error) 85 | PPTX2Text(file io.ReaderAt, size int64) (string, error) 86 | RTF2Text(inputRtf string) string 87 | XLS2Text(reader io.ReadSeeker, writer io.Writer, size int64) (written int64, err error) 88 | XLSX2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64, rowLimit int) (written int64, err error) 89 | ``` 90 | 91 | Picture functions: 92 | 93 | ```go 94 | IsExcessiveLargePicture(Picture []byte) (excessive bool, err error) 95 | CompressJPEG(Picture []byte, quality int) (compressed []byte) 96 | ResizeCompressPicture(Picture []byte, Quality int, MaxWidth, MaxHeight uint) 97 | PDFExtractImages(input io.ReadSeeker) (images []ImageResult, err error) 98 | ``` 99 | 100 | Compression and container file functions: 101 | 102 | ```go 103 | DecompressFile(data []byte) (decompressed []byte, valid bool) 104 | ContainerExtractFiles(data []byte, callback func(name string, size int64, date time.Time, data []byte)) 105 | ``` 106 | 107 | ## Dependencies 108 | 109 | This library uses other go packages. 
Run the following command to download them: 110 | 111 | ``` 112 | go get -u github.com/nwaples/rardecode 113 | go get -u github.com/saracen/go7z 114 | go get -u github.com/ulikunitz/xz 115 | go get -u github.com/mattetti/filebuffer 116 | go get -u github.com/richardlehane/mscfb 117 | go get -u github.com/taylorskalyo/goreader/epub 118 | go get -u github.com/PuerkitoBio/goquery 119 | go get -u github.com/ssor/bom 120 | go get -u github.com/levigross/exp-html 121 | go get -u github.com/neofight/mobi/convert 122 | go get -u github.com/neofight/mobi/headers 123 | go get -u github.com/unidoc/unipdf 124 | go get -u github.com/nfnt/resize 125 | go get -u github.com/tealeg/xlsx 126 | go get -u gopkg.in/xmlpath.v2 127 | ``` 128 | 129 | ## Tests 130 | 131 | There are no functional tests. The only test functions are used manually for debugging. 132 | 133 | ## Forks 134 | 135 | Other packages were tested and either found insufficient, or unstable. Many of the below listed packages were found to be unstable, cause crashes, as well as exhaust memory due to bad programming, bad input sanitizing and bad memory management. 136 | 137 | * `html2text` is forked from https://github.com/jaytaylor/html2text 138 | * `odf` is forked from https://github.com/knieriem/odf 139 | * `ole2` is forked and partly rewritten from https://github.com/extrame/ole2 140 | * `xls` is forked from https://github.com/sergeilem/xls which is a fork from https://github.com/extrame/xls 141 | * `doc` is forked from https://github.com/EndFirstCorp/doc2txt 142 | * `docx` is forked from https://github.com/guylaor/goword 143 | * `mobi` is forked from https://github.com/neofight/mobi 144 | * `odt` is forked from https://github.com/lu4p/cat 145 | * `pptx` is forked from https://github.com/mr-tim/rol-o-decks 146 | * `rtf` is forked from https://github.com/J45k4/rtf-go 147 | 148 | ## License 149 | 150 | This is free and unencumbered software released into the public domain. 
151 | 152 | Note that this package includes, or consists partly of forks or rewrite of existing open source code. Use at your own risk. Intelligence X does not provide any warranty for this library or any parts of it. 153 | -------------------------------------------------------------------------------- /RTF 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: RTF 2 Text.go 3 | Copyright: 2018 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | This code is forked from https://github.com/J45k4/rtf-go and extracts text from RTF files. 7 | It contains an important fix for a bug that was triggered with 06ffe2e7-06b6-41d6-9905-3a225fd55537 with an "index out of range" crash. 8 | It contains another fix to properly decode foreign encodings. 9 | 10 | Warning: rtfRegex.FindAllStringSubmatch may use excessive memory! Example System ID that causes problems: 02cf9199-2cda-4fa1-b830-060c67417d2d. 11 | 12 | An alternative solution is https://github.com/EndFirstCorp/rtf2txt, but it was found to output everything as one long line without LFs. 
13 | */ 14 | 15 | package fileconversion 16 | 17 | import ( 18 | "bytes" 19 | "regexp" 20 | "strconv" 21 | "strings" 22 | 23 | "golang.org/x/text/encoding" 24 | "golang.org/x/text/encoding/charmap" 25 | ) 26 | 27 | var destinations = map[string]bool{ 28 | "aftncn": true, 29 | "aftnsep": true, 30 | "aftnsepc": true, 31 | "annotation": true, 32 | "atnauthor": true, 33 | "atndate": true, 34 | "atnicn": true, 35 | "atnid": true, 36 | "atnparent": true, 37 | "atnref": true, 38 | "atntime": true, 39 | "atrfend": true, 40 | "atrfstart": true, 41 | "author": true, 42 | "background": true, 43 | "bkmkend": true, 44 | "bkmkstart": true, 45 | "blipuid": true, 46 | "buptim": true, 47 | "category": true, 48 | "colorschememapping": true, 49 | "colortbl": true, 50 | "comment": true, 51 | "company": true, 52 | "creatim": true, 53 | "datafield": true, 54 | "datastore": true, 55 | "defchp": true, 56 | "defpap": true, 57 | "do": true, 58 | "doccomm": true, 59 | "docvar": true, 60 | "dptxbxtext": true, 61 | "ebcend": true, 62 | "ebcstart": true, 63 | "factoidname": true, 64 | "falt": true, 65 | "fchars": true, 66 | "ffdeftext": true, 67 | "ffentrymcr": true, 68 | "ffexitmcr": true, 69 | "ffformat": true, 70 | "ffhelptext": true, 71 | "ffl": true, 72 | "ffname": true, 73 | "ffstattext": true, 74 | "field": true, 75 | "file": true, 76 | "filetbl": true, 77 | "fldinst": true, 78 | "fldrslt": true, 79 | "fldtype": true, 80 | "fname": true, 81 | "fontemb": true, 82 | "fontfile": true, 83 | "fonttbl": true, 84 | "footer": true, 85 | "footerf": true, 86 | "footerl": true, 87 | "footerr": true, 88 | "footnote": true, 89 | "formfield": true, 90 | "ftncn": true, 91 | "ftnsep": true, 92 | "ftnsepc": true, 93 | "g": true, 94 | "generator": true, 95 | "gridtbl": true, 96 | "header": true, 97 | "headerf": true, 98 | "headerl": true, 99 | "headerr": true, 100 | "hl": true, 101 | "hlfr": true, 102 | "hlinkbase": true, 103 | "hlloc": true, 104 | "hlsrc": true, 105 | "hsv": true, 106 | "htmltag": true, 
107 | "info": true, 108 | "keycode": true, 109 | "keywords": true, 110 | "latentstyles": true, 111 | "lchars": true, 112 | "levelnumbers": true, 113 | "leveltext": true, 114 | "lfolevel": true, 115 | "linkval": true, 116 | "list": true, 117 | "listlevel": true, 118 | "listname": true, 119 | "listoverride": true, 120 | "listoverridetable": true, 121 | "listpicture": true, 122 | "liststylename": true, 123 | "listtable": true, 124 | "listtext": true, 125 | "lsdlockedexcept": true, 126 | "macc": true, 127 | "maccPr": true, 128 | "mailmerge": true, 129 | "maln": true, 130 | "malnScr": true, 131 | "manager": true, 132 | "margPr": true, 133 | "mbar": true, 134 | "mbarPr": true, 135 | "mbaseJc": true, 136 | "mbegChr": true, 137 | "mborderBox": true, 138 | "mborderBoxPr": true, 139 | "mbox": true, 140 | "mboxPr": true, 141 | "mchr": true, 142 | "mcount": true, 143 | "mctrlPr": true, 144 | "md": true, 145 | "mdeg": true, 146 | "mdegHide": true, 147 | "mden": true, 148 | "mdiff": true, 149 | "mdPr": true, 150 | "me": true, 151 | "mendChr": true, 152 | "meqArr": true, 153 | "meqArrPr": true, 154 | "mf": true, 155 | "mfName": true, 156 | "mfPr": true, 157 | "mfunc": true, 158 | "mfuncPr": true, 159 | "mgroupChr": true, 160 | "mgroupChrPr": true, 161 | "mgrow": true, 162 | "mhideBot": true, 163 | "mhideLeft": true, 164 | "mhideRight": true, 165 | "mhideTop": true, 166 | "mhtmltag": true, 167 | "mlim": true, 168 | "mlimloc": true, 169 | "mlimlow": true, 170 | "mlimlowPr": true, 171 | "mlimupp": true, 172 | "mlimuppPr": true, 173 | "mm": true, 174 | "mmaddfieldname": true, 175 | "mmath": true, 176 | "mmathPict": true, 177 | "mmathPr": true, 178 | "mmaxdist": true, 179 | "mmc": true, 180 | "mmcJc": true, 181 | "mmconnectstr": true, 182 | "mmconnectstrdata": true, 183 | "mmcPr": true, 184 | "mmcs": true, 185 | "mmdatasource": true, 186 | "mmheadersource": true, 187 | "mmmailsubject": true, 188 | "mmodso": true, 189 | "mmodsofilter": true, 190 | "mmodsofldmpdata": true, 191 | 
"mmodsomappedname": true, 192 | "mmodsoname": true, 193 | "mmodsorecipdata": true, 194 | "mmodsosort": true, 195 | "mmodsosrc": true, 196 | "mmodsotable": true, 197 | "mmodsoudl": true, 198 | "mmodsoudldata": true, 199 | "mmodsouniquetag": true, 200 | "mmPr": true, 201 | "mmquery": true, 202 | "mmr": true, 203 | "mnary": true, 204 | "mnaryPr": true, 205 | "mnoBreak": true, 206 | "mnum": true, 207 | "mobjDist": true, 208 | "moMath": true, 209 | "moMathPara": true, 210 | "moMathParaPr": true, 211 | "mopEmu": true, 212 | "mphant": true, 213 | "mphantPr": true, 214 | "mplcHide": true, 215 | "mpos": true, 216 | "mr": true, 217 | "mrad": true, 218 | "mradPr": true, 219 | "mrPr": true, 220 | "msepChr": true, 221 | "mshow": true, 222 | "mshp": true, 223 | "msPre": true, 224 | "msPrePr": true, 225 | "msSub": true, 226 | "msSubPr": true, 227 | "msSubSup": true, 228 | "msSubSupPr": true, 229 | "msSup": true, 230 | "msSupPr": true, 231 | "mstrikeBLTR": true, 232 | "mstrikeH": true, 233 | "mstrikeTLBR": true, 234 | "mstrikeV": true, 235 | "msub": true, 236 | "msubHide": true, 237 | "msup": true, 238 | "msupHide": true, 239 | "mtransp": true, 240 | "mtype": true, 241 | "mvertJc": true, 242 | "mvfmf": true, 243 | "mvfml": true, 244 | "mvtof": true, 245 | "mvtol": true, 246 | "mzeroAsc": true, 247 | "mzeroDesc": true, 248 | "mzeroWid": true, 249 | "nesttableprops": true, 250 | "nextfile": true, 251 | "nonesttables": true, 252 | "objalias": true, 253 | "objclass": true, 254 | "objdata": true, 255 | "object": true, 256 | "objname": true, 257 | "objsect": true, 258 | "objtime": true, 259 | "oldcprops": true, 260 | "oldpprops": true, 261 | "oldsprops": true, 262 | "oldtprops": true, 263 | "oleclsid": true, 264 | "operator": true, 265 | "panose": true, 266 | "password": true, 267 | "passwordhash": true, 268 | "pgp": true, 269 | "pgptbl": true, 270 | "picprop": true, 271 | "pict": true, 272 | "pn": true, 273 | "pnseclvl": true, 274 | "pntext": true, 275 | "pntxta": true, 276 | "pntxtb": 
true, 277 | "printim": true, 278 | "private": true, 279 | "propname": true, 280 | "protend": true, 281 | "protstart": true, 282 | "protusertbl": true, 283 | "pxe": true, 284 | "result": true, 285 | "revtbl": true, 286 | "revtim": true, 287 | "rsidtbl": true, 288 | "rxe": true, 289 | "shp": true, 290 | "shpgrp": true, 291 | "shpinst": true, 292 | "shppict": true, 293 | "shprslt": true, 294 | "shptxt": true, 295 | "sn": true, 296 | "sp": true, 297 | "staticval": true, 298 | "stylesheet": true, 299 | "subject": true, 300 | "sv": true, 301 | "svb": true, 302 | "tc": true, 303 | "template": true, 304 | "themedata": true, 305 | "title": true, 306 | "txe": true, 307 | "ud": true, 308 | "upr": true, 309 | "userprops": true, 310 | "wgrffmtfilter": true, 311 | "windowcaption": true, 312 | "writereservation": true, 313 | "writereservhash": true, 314 | "xe": true, 315 | "xform": true, 316 | "xmlattrname": true, 317 | "xmlattrvalue": true, 318 | "xmlclose": true, 319 | "xmlname": true, 320 | "xmlnstbl": true, 321 | "xmlopen": true, 322 | } 323 | 324 | var specialCharacters = map[string]string{ 325 | "par": "\n", 326 | "sect": "\n\n", 327 | "page": "\n\n", 328 | "line": "\n", 329 | "tab": "\t", 330 | "emdash": "\u2014", 331 | "endash": "\u2013", 332 | "emspace": "\u2003", 333 | "enspace": "\u2002", 334 | "qmspace": "\u2005", 335 | "bullet": "\u2022", 336 | "lquote": "\u2018", 337 | "rquote": "\u2019", 338 | "ldblquote": "\u201C", 339 | "rdblquote": "\u201D", 340 | } 341 | 342 | var charmaps = map[string]*charmap.Charmap{ 343 | "437": charmap.CodePage437, 344 | // "708": nil, 345 | // "709": nil, 346 | // "710": nil, 347 | // "711": nil, 348 | // "720": nil, 349 | // "819": nil, 350 | "850": charmap.CodePage850, 351 | "852": charmap.CodePage852, 352 | "860": charmap.CodePage860, 353 | "862": charmap.CodePage862, 354 | "863": charmap.CodePage863, 355 | // "864": nil, 356 | "865": charmap.CodePage865, 357 | "866": charmap.CodePage866, 358 | // "874": nil, 359 | // "932": nil, 360 | 
// "936": nil, 361 | // "949": nil, 362 | // "950": nil, 363 | "1250": charmap.Windows1250, 364 | "1251": charmap.Windows1251, 365 | "1252": charmap.Windows1252, 366 | "1253": charmap.Windows1253, 367 | "1254": charmap.Windows1254, 368 | "1255": charmap.Windows1255, 369 | "1256": charmap.Windows1256, 370 | "1257": charmap.Windows1257, 371 | "1258": charmap.Windows1258, 372 | // "1361": nil, 373 | } 374 | 375 | var rtfRegex = regexp.MustCompile( 376 | "(?i)" + 377 | `\\([a-z]{1,32})(-?\d{1,10})?[ ]?` + 378 | `|\\'([0-9a-f]{2})` + 379 | `|\\([^a-z])` + 380 | `|([{}])` + 381 | `|[\r\n]+` + 382 | `|(.)`) 383 | 384 | type stackEntry struct { 385 | NumberOfCharactersToSkip int 386 | Ignorable bool 387 | } 388 | 389 | func newStackEntry(numberOfCharactersToSkip int, ignorable bool) stackEntry { 390 | return stackEntry{ 391 | NumberOfCharactersToSkip: numberOfCharactersToSkip, 392 | Ignorable: ignorable, 393 | } 394 | } 395 | 396 | // RTF2Text removes rtf characters from string and returns the new string. 
func RTF2Text(inputRtf string) string {
	// Parser state:
	//   charMap/decoder — active code page selected by \ansicpgN (nil = pass bytes through)
	//   stack           — saved (ucskip, ignorable) state per open {...} group
	//   ignorable       — inside a destination group whose content is not document text
	//   ucskip          — characters to skip after a \uN unicode escape (set by \ucN)
	//   curskip         — countdown of characters still to be skipped
	var charMap *charmap.Charmap
	var decoder *encoding.Decoder
	var stack []stackEntry
	var ignorable bool
	ucskip := 1
	curskip := 0

	// Warning (see file header): FindAllStringSubmatch may use excessive memory on some inputs.
	matches := rtfRegex.FindAllStringSubmatch(inputRtf, -1)
	var returnBuffer bytes.Buffer

	for _, match := range matches {
		// Submatch groups of rtfRegex; at most one is non-empty per match.
		word := match[1]      // control word, e.g. \par
		arg := match[2]       // optional numeric argument of the control word
		hex := match[3]       // \'hh hex-escaped byte
		character := match[4] // escaped non-alpha character, e.g. \~ or \{
		brace := match[5]     // group delimiter { or }
		tchar := match[6]     // plain text character

		switch {
		case tchar != "":
			if curskip > 0 {
				curskip--
			} else if !ignorable {
				// Decode through the active code page when one was selected.
				if charMap == nil || decoder == nil {
					returnBuffer.WriteString(tchar)
				} else {
					tcharDec, err := decoder.String(tchar)
					if err == nil {
						returnBuffer.WriteString(tcharDec)
					}
				}
			}
		case brace != "":
			curskip = 0
			if brace == "{" {
				// Entering a group: remember the current state to restore on "}".
				stack = append(
					stack, newStackEntry(ucskip, ignorable))
			} else if brace == "}" {
				// There was a crash here with item 06ffe2e7-06b6-41d6-9905-3a225fd55537
				// It's fixed by checking l == 0 and handling it as special case
				if l := len(stack); l > 0 {
					entry := stack[l-1]
					stack = stack[:l-1]
					ucskip = entry.NumberOfCharactersToSkip
					ignorable = entry.Ignorable
				}
			}
		case character != "":
			curskip = 0
			if character == "~" {
				// \~ is a non-breaking space.
				if !ignorable {
					returnBuffer.WriteString("\xA0")
				}
			} else if strings.Contains("{}\\", character) {
				// Escaped literal brace or backslash.
				if !ignorable {
					returnBuffer.WriteString(character)
				}
			} else if character == "*" {
				// \* marks the enclosing destination as ignorable.
				ignorable = true
			}
		case word != "":
			curskip = 0
			if destinations[word] {
				// Known non-text destination: suppress its content.
				ignorable = true
			} else if ignorable {
				// Inside an ignorable destination: drop control words.
			} else if specialCharacters[word] != "" {
				returnBuffer.WriteString(
					specialCharacters[word])
			} else if word == "ansicpg" {
				// Select the code page for subsequent text/hex bytes.
				var ok bool
				if charMap, ok = charmaps[arg]; ok {
					decoder = charMap.NewDecoder()
				} else {
					// encoding not supported, continue anyway
				}
			} else if word == "uc" {
				// \ucN: number of fallback characters following each \uN escape.
				i, _ := strconv.Atoi(arg)
				ucskip = i
			} else if word == "u" {
				// \uN: unicode code point; negative values are signed 16-bit.
				c, _ := strconv.Atoi(arg)
				if c < 0 {
					c += 0x10000
				}
				returnBuffer.WriteRune(rune(c))
				curskip = ucskip
			}
		case hex != "":
			if curskip > 0 {
				curskip--
			} else if !ignorable {
				// \'hh byte: decode through the active code page when one is set.
				c, _ := strconv.ParseInt(hex, 16, 0)
				if charMap == nil {
					returnBuffer.WriteRune(rune(c))
				} else {
					returnBuffer.WriteRune(
						charMap.DecodeByte(byte(c)))
				}
			}
		}
	}
	return returnBuffer.String()
}

// IsFileRTF checks if the data indicates a RTF file
// RTF has a signature of 7B 5C 72 74 66 31, or in string "{\rtf1"
func IsFileRTF(data []byte) bool {
	return bytes.HasPrefix(data, []byte{0x7B, 0x5C, 0x72, 0x74, 0x66, 0x31})
}

-------------------------------------------------------------------------------- /XLS 2 Text.go: --------------------------------------------------------------------------------

/*
File Name:  XLS 2 Text.go
Copyright:  2019 Kleissner Investments s.r.o.
Author:     Peter Kleissner

The code originally used https://github.com/extrame/xls, which revealed multiple bugs that crashed for certain Excel files.
Now it forks the xls package and the underlying ole2 package. This fork also fixes excessive memory usage issues.
*/

package fileconversion

import (
	"bytes"
	"fmt"
	"io"
	"strings"

	"github.com/IntelligenceX/fileconversion/xls"
)

// XLS2Text extracts text from an Excel sheet. It returns bytes written.
// The parameter size is the max amount of bytes (not characters) to write out.
// The whole Excel file is required even for partial text extraction.
This function returns no error with 0 bytes written in case of corrupted or invalid file. 24 | func XLS2Text(reader io.ReadSeeker, writer io.Writer, size int64) (written int64, err error) { 25 | 26 | xlFile, err := xls.OpenReader(reader, "utf-8") 27 | if err != nil || xlFile == nil { 28 | return 0, err 29 | } 30 | 31 | for n := 0; n < xlFile.NumSheets(); n++ { 32 | if sheet1 := xlFile.GetSheet(n); sheet1 != nil { 33 | if err = writeOutput(writer, []byte(xlGenerateSheetTitle(sheet1.Name, n, int(sheet1.MaxRow))), &written, &size); err != nil || size == 0 { 34 | return written, err 35 | } 36 | 37 | for m := 0; m <= int(sheet1.MaxRow); m++ { 38 | row1 := sheet1.Row(m) 39 | if row1 == nil { 40 | continue 41 | } 42 | 43 | rowText := "" 44 | 45 | // go through all columns 46 | for c := row1.FirstCol(); c < row1.LastCol(); c++ { 47 | if text := row1.Col(c); text != "" { 48 | text = cleanCell(text) 49 | 50 | if c > row1.FirstCol() { 51 | rowText += ", " 52 | } 53 | rowText += text 54 | } 55 | } 56 | 57 | rowText += "\n" 58 | 59 | if err = writeOutput(writer, []byte(rowText), &written, &size); err != nil || size == 0 { 60 | return written, err 61 | } 62 | } 63 | } 64 | } 65 | 66 | return written, nil 67 | } 68 | 69 | // cleanCell returns a cleaned cell text without new-lines 70 | func cleanCell(text string) string { 71 | text = strings.ReplaceAll(text, "\n", " ") 72 | text = strings.ReplaceAll(text, "\r", "") 73 | text = strings.TrimSpace(text) 74 | 75 | return text 76 | } 77 | 78 | func xlGenerateSheetTitle(name string, number, rows int) (title string) { 79 | if number > 0 { 80 | title += "\n" 81 | } 82 | 83 | title += fmt.Sprintf("Sheet \"%s\" (%d rows):\n", name, rows) 84 | 85 | return title 86 | } 87 | 88 | func writeOutput(writer io.Writer, output []byte, alreadyWritten *int64, size *int64) (err error) { 89 | 90 | if int64(len(output)) > *size { 91 | output = output[:*size] 92 | } 93 | 94 | *size -= int64(len(output)) 95 | 96 | writtenOut, err := writer.Write(output) 97 
| *alreadyWritten += int64(writtenOut) 98 | 99 | return err 100 | } 101 | 102 | // IsFileXLS checks if the data indicates a XLS file 103 | // XLS has a signature of D0 CF 11 E0 A1 B1 1A E1 104 | func IsFileXLS(data []byte) bool { 105 | return bytes.HasPrefix(data, []byte{0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1}) 106 | } 107 | 108 | // XLS2Cells converts an XLS file to individual cells 109 | func XLS2Cells(reader io.ReadSeeker) (cells []string, err error) { 110 | 111 | xlFile, err := xls.OpenReader(reader, "utf-8") 112 | if err != nil || xlFile == nil { 113 | return nil, err 114 | } 115 | 116 | for n := 0; n < xlFile.NumSheets(); n++ { 117 | if sheet1 := xlFile.GetSheet(n); sheet1 != nil { 118 | for m := 0; m <= int(sheet1.MaxRow); m++ { 119 | row1 := sheet1.Row(m) 120 | if row1 == nil { 121 | continue 122 | } 123 | 124 | for c := row1.FirstCol(); c < row1.LastCol(); c++ { 125 | if text := row1.Col(c); text != "" { 126 | text = cleanCell(text) 127 | cells = append(cells, text) 128 | } 129 | } 130 | } 131 | } 132 | } 133 | 134 | return 135 | } 136 | -------------------------------------------------------------------------------- /XLSX 2 Text.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: XLSX 2 Text.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | 6 | * https://github.com/tealeg/xlsx is used in production. 7 | Some files used more than 1 GB of memory, even though the file itself is only 9 MB. Example 971bd55b-5cbd-43d2-899e-d4a2a7d0a883. 8 | The underlying issue was how it decoded the worksheet XML into large structures. There was no easy fix for that. 9 | 10 | * https://github.com/unidoc/unioffice is available as inactive implementation below, although it was found to also use lots of RAM. 11 | 12 | * https://github.com/360EntSecGroup-Skylar/excelize was not tested in detail, but seems very similar to "tealeg/xlsx". 
 * https://github.com/szyhf/go-excel is faster and uses smaller resources than "tealeg/xlsx", but lacks quality when extracting cells and misses many.

*/

package fileconversion

import (
	"bytes"
	"io"

	"github.com/tealeg/xlsx"
)

// IsFileXLSX checks if the data indicates a XLSX file
// XLSX has a signature of 50 4B 03 04
// Warning: This collides with ZIP, DOCX and other zip-based files.
func IsFileXLSX(data []byte) bool {
	return bytes.HasPrefix(data, []byte{0x50, 0x4B, 0x03, 0x04})
}

// XLSX2Text extracts text of an Excel sheet
// Size is the full size of the input file. Limit is the output limit in bytes.
// rowLimit defines how many rows per sheet to extract. -1 means unlimited. This exists as protection against some XLSX files that may use excessive amount of memory.
func XLSX2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64, rowLimit int) (written int64, err error) {
	var xlFile *xlsx.File

	// Open with or without the per-sheet row cap depending on rowLimit.
	if rowLimit == -1 {
		xlFile, err = xlsx.OpenReaderAt(file, size)
	} else {
		xlFile, err = xlsx.OpenReaderAtWithRowLimit(file, size, rowLimit)
	}
	if err != nil {
		return 0, err
	}

	for n, sheet := range xlFile.Sheets {
		// Sheet title first; writeOutput truncates to the remaining limit, and
		// limit == 0 means the output budget is exhausted.
		if err = writeOutput(writer, []byte(xlGenerateSheetTitle(sheet.Name, n, int(sheet.MaxRow))), &written, &limit); err != nil || limit == 0 {
			return written, err
		}

		for _, row := range sheet.Rows {

			rowText := ""

			// go through all columns
			for m, cell := range row.Cells {
				text := cell.String()
				if text != "" {
					text = cleanCell(text)

					if m > 0 {
						rowText += ", "
					}
					rowText += text
				}
			}

			rowText += "\n"

			if err = writeOutput(writer, []byte(rowText), &written, &limit); err != nil || limit == 0 {
				return written, err
			}
		}
	}

	return written, nil
}

// XLSX2Cells converts an XLSX file to individual cells
// Size is the full size of the input file.
// rowLimit defines how many rows per sheet to extract. -1 means unlimited. This exists as protection against some XLSX files that may use excessive amount of memory.
func XLSX2Cells(file io.ReaderAt, size int64, rowLimit int) (cells []string, err error) {
	var xlFile *xlsx.File

	if rowLimit == -1 {
		xlFile, err = xlsx.OpenReaderAt(file, size)
	} else {
		xlFile, err = xlsx.OpenReaderAtWithRowLimit(file, size, rowLimit)
	}
	if err != nil {
		return nil, err
	}

	// Flatten every non-empty cell of every sheet into one list, in document order.
	for _, sheet := range xlFile.Sheets {
		for _, row := range sheet.Rows {
			for _, cell := range row.Cells {
				if text := cell.String(); text != "" {
					text = cleanCell(text)
					cells = append(cells, text)
				}
			}
		}
	}

	return
}

// alternative implementation using https://github.com/unidoc/unioffice, not required

/*
// XLSX2Cells2 converts an XLS file to individual cells
func XLSX2Cells2(file io.ReaderAt, size int64) (cells []string, err error) {

	xlFile, err := spreadsheet.Read(file, size)
	if err != nil || xlFile == nil {
		return nil, err
	}
	defer xlFile.Close()

	for _, sheet := range xlFile.Sheets() {
		for _, row := range sheet.Rows() {
			for _, cell := range row.Cells() {
				if text := cell.GetString(); text != "" {
					text = cleanCell(text)
					cells = append(cells, text)
				}
			}
		}
	}

	return
}

// XLSX2Text2 extracts text from an Excel sheet. It returns bytes written.
// The parameter limit is the max amount of bytes (not characters) to write out.
// The whole Excel file is required even for partial text extraction. This function returns no error with 0 bytes written in case of corrupted or invalid file.
140 | func XLSX2Text2(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error) { 141 | 142 | xlFile, err := spreadsheet.Read(file, size) 143 | if err != nil || xlFile == nil { 144 | return 0, err 145 | } 146 | defer xlFile.Close() 147 | 148 | for n, sheet := range xlFile.Sheets() { 149 | rows := sheet.Rows() 150 | if err = writeOutput(writer, []byte(xlGenerateSheetTitle(sheet.Name(), n, len(rows))), &written, &limit); err != nil || limit == 0 { 151 | return written, err 152 | } 153 | 154 | for _, row := range sheet.Rows() { 155 | rowText := "" 156 | 157 | for n, cell := range row.Cells() { 158 | text := cell.GetString() 159 | text = cleanCell(text) 160 | 161 | if n > 1 { 162 | rowText += ", " 163 | } 164 | rowText += text 165 | } 166 | 167 | rowText += "\n" 168 | 169 | if err = writeOutput(writer, []byte(rowText), &written, &size); err != nil || size == 0 { 170 | return written, err 171 | } 172 | } 173 | } 174 | 175 | return written, nil 176 | } 177 | */ 178 | 179 | // implementation using https://github.com/szyhf/go-excel 180 | 181 | /* 182 | // XLSX2Text extracts text of an Excel sheet 183 | // Size is the full size of the input file. 
184 | func XLSX2Text(file io.ReaderAt, size int64, writer io.Writer, limit int64) (written int64, err error) { 185 | 186 | conn := excel.NewConnecter() 187 | err = conn.OpenReader(file, size) 188 | if err != nil { 189 | return 0, err 190 | } 191 | defer conn.Close() 192 | 193 | for n, sheetName := range conn.GetSheetNames() { 194 | if err = writeOutput(writer, []byte(xlGenerateSheetTitle(sheetName, n, 100)), &written, &limit); err != nil || limit == 0 { 195 | return written, err 196 | } 197 | 198 | rd, err := conn.NewReader(sheetName) 199 | if err != nil { 200 | continue 201 | } 202 | 203 | for rd.Next() { 204 | var rowCells []string 205 | err1 := rd.Read(&rowCells) 206 | 207 | rowText := "" 208 | 209 | // go through all columns 210 | for m, cell := range rowCells { 211 | if text := cell; text != "" { 212 | text = cleanCell(text) 213 | 214 | if m > 0 { 215 | rowText += ", " 216 | } 217 | rowText += text 218 | } 219 | } 220 | 221 | rowText += "\n" 222 | 223 | if err = writeOutput(writer, []byte(rowText), &written, &limit); err != nil || limit == 0 { 224 | return written, err 225 | } 226 | 227 | if err1 == io.EOF { 228 | break 229 | } 230 | } 231 | } 232 | 233 | return written, nil 234 | } 235 | 236 | // XLSX2Cells converts an XLSX file to individual cells 237 | // Size is the full size of the input file. 
238 | func XLSX2Cells(file io.ReaderAt, size int64) (cells []string, err error) { 239 | 240 | conn := excel.NewConnecter() 241 | err = conn.OpenReader(file, size) 242 | if err != nil { 243 | return nil, err 244 | } 245 | defer conn.Close() 246 | 247 | loopSheet: 248 | for _, sheetName := range conn.GetSheetNames() { 249 | rd, err := conn.NewReader(sheetName) 250 | if err != nil { 251 | continue 252 | } 253 | 254 | for rd.Next() { 255 | var rowCells []string 256 | if err := rd.Read(&rowCells); err != nil { 257 | continue loopSheet 258 | } 259 | 260 | for _, cell := range rowCells { 261 | if text := cell; text != "" { 262 | text = cleanCell(text) 263 | cells = append(cells, text) 264 | } 265 | } 266 | } 267 | } 268 | 269 | return 270 | } 271 | */ 272 | -------------------------------------------------------------------------------- /ZIP.go: -------------------------------------------------------------------------------- 1 | /* 2 | File Name: ZIP.go 3 | Copyright: 2019 Kleissner Investments s.r.o. 4 | Author: Peter Kleissner 5 | */ 6 | 7 | package fileconversion 8 | 9 | import "bytes" 10 | 11 | // IsFileZIP checks if the data indicates a ZIP file. 12 | // Many file formats like DOCX, XLSX, PPTX and APK are actual ZIP files. 13 | // Signature 50 4B 03 04 14 | func IsFileZIP(data []byte) bool { 15 | return bytes.HasPrefix(data, []byte{0x50, 0x4B, 0x03, 0x04}) 16 | } 17 | -------------------------------------------------------------------------------- /html2text/README.md: -------------------------------------------------------------------------------- 1 | # html2text 2 | 3 | Forked from https://github.com/jaytaylor/html2text. 4 | 5 | ### Converts HTML into text of the markdown-flavored variety 6 | 7 | 8 | ## Introduction 9 | 10 | Ensure your emails are readable by all! 11 | 12 | Turns HTML into raw text, useful for sending fancy HTML emails with an equivalently nicely formatted TXT document as a fallback (e.g. 
for people who don't allow HTML emails or have other display issues). 13 | 14 | html2text is a simple golang package for rendering HTML into plaintext. 15 | 16 | There are still lots of improvements to be had, but FWIW this has worked fine for my [basic] HTML-2-text needs. 17 | 18 | It requires go 1.x or newer ;) 19 | 20 | 21 | ## Download the package 22 | 23 | ```bash 24 | go get jaytaylor.com/html2text 25 | ``` 26 | 27 | ## Example usage 28 | 29 | ```go 30 | package main 31 | 32 | import ( 33 | "fmt" 34 | 35 | "jaytaylor.com/html2text" 36 | ) 37 | 38 | func main() { 39 | inputHTML := ` 40 | 41 | 42 | My Mega Service 43 | 44 | 45 | 46 | 47 | 48 | 51 | 52 |

Welcome to your new account on my service!

53 | 54 |

55 | Here is some more information: 56 | 57 |

62 |

63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 |
Header 1Header 2
Footer 1Footer 2
Row 1 Col 1Row 1 Col 2
Row 2 Col 1Row 2 Col 2
76 | 77 | ` 78 | 79 | text, err := html2text.FromString(inputHTML, html2text.Options{PrettyTables: true}) 80 | if err != nil { 81 | panic(err) 82 | } 83 | fmt.Println(text) 84 | } 85 | ``` 86 | 87 | Output: 88 | ``` 89 | Mega Service ( http://jaytaylor.com/ ) 90 | 91 | ****************************************** 92 | Welcome to your new account on my service! 93 | ****************************************** 94 | 95 | Here is some more information: 96 | 97 | * Link 1: Example.com ( https://example.com ) 98 | * Link 2: Example2.com ( https://example2.com ) 99 | * Something else 100 | 101 | +-------------+-------------+ 102 | | HEADER 1 | HEADER 2 | 103 | +-------------+-------------+ 104 | | Row 1 Col 1 | Row 1 Col 2 | 105 | | Row 2 Col 1 | Row 2 Col 2 | 106 | +-------------+-------------+ 107 | | FOOTER 1 | FOOTER 2 | 108 | +-------------+-------------+ 109 | ``` 110 | 111 | 112 | ## Unit-tests 113 | 114 | Running the unit-tests is straightforward and standard: 115 | 116 | ```bash 117 | go test 118 | ``` 119 | 120 | -------------------------------------------------------------------------------- /html2text/html2text.go: -------------------------------------------------------------------------------- 1 | package html2text 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "regexp" 7 | "strings" 8 | "unicode" 9 | 10 | "github.com/olekukonko/tablewriter" 11 | "github.com/ssor/bom" 12 | "golang.org/x/net/html" 13 | "golang.org/x/net/html/atom" 14 | ) 15 | 16 | // Options provide toggles and overrides to control specific rendering behaviors. 17 | type Options struct { 18 | PrettyTables bool // Turns on pretty ASCII rendering for table elements. 19 | PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements. 
20 | OmitLinks bool // Turns on omitting links 21 | } 22 | 23 | // PrettyTablesOptions overrides tablewriter behaviors 24 | type PrettyTablesOptions struct { 25 | AutoFormatHeader bool 26 | AutoWrapText bool 27 | ReflowDuringAutoWrap bool 28 | ColWidth int 29 | ColumnSeparator string 30 | RowSeparator string 31 | CenterSeparator string 32 | HeaderAlignment int 33 | FooterAlignment int 34 | Alignment int 35 | ColumnAlignment []int 36 | NewLine string 37 | HeaderLine bool 38 | RowLine bool 39 | AutoMergeCells bool 40 | Borders tablewriter.Border 41 | } 42 | 43 | // NewPrettyTablesOptions creates PrettyTablesOptions with default settings 44 | func NewPrettyTablesOptions() *PrettyTablesOptions { 45 | return &PrettyTablesOptions{ 46 | AutoFormatHeader: true, 47 | AutoWrapText: true, 48 | ReflowDuringAutoWrap: true, 49 | ColWidth: tablewriter.MAX_ROW_WIDTH, 50 | ColumnSeparator: tablewriter.COLUMN, 51 | RowSeparator: tablewriter.ROW, 52 | CenterSeparator: tablewriter.CENTER, 53 | HeaderAlignment: tablewriter.ALIGN_DEFAULT, 54 | FooterAlignment: tablewriter.ALIGN_DEFAULT, 55 | Alignment: tablewriter.ALIGN_DEFAULT, 56 | ColumnAlignment: []int{}, 57 | NewLine: tablewriter.NEWLINE, 58 | HeaderLine: true, 59 | RowLine: false, 60 | AutoMergeCells: false, 61 | Borders: tablewriter.Border{Left: true, Right: true, Bottom: true, Top: true}, 62 | } 63 | } 64 | 65 | // FromHTMLNode renders text output from a pre-parsed HTML document. 
66 | func FromHTMLNode(doc *html.Node, o ...Options) (string, error) { 67 | var options Options 68 | if len(o) > 0 { 69 | options = o[0] 70 | } 71 | 72 | ctx := textifyTraverseContext{ 73 | buf: bytes.Buffer{}, 74 | options: options, 75 | } 76 | if err := ctx.traverse(doc); err != nil { 77 | return "", err 78 | } 79 | 80 | text := strings.TrimSpace(newlineRe.ReplaceAllString( 81 | strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"), 82 | ) 83 | return text, nil 84 | } 85 | 86 | // FromReader renders text output after parsing HTML for the specified 87 | // io.Reader. 88 | func FromReader(reader io.Reader, options ...Options) (string, error) { 89 | newReader, err := bom.NewReaderWithoutBom(reader) 90 | if err != nil { 91 | return "", err 92 | } 93 | doc, err := html.Parse(newReader) 94 | if err != nil { 95 | return "", err 96 | } 97 | return FromHTMLNode(doc, options...) 98 | } 99 | 100 | // FromString parses HTML from the input string, then renders the text form. 101 | func FromString(input string, options ...Options) (string, error) { 102 | bs := bom.CleanBom([]byte(input)) 103 | text, err := FromReader(bytes.NewReader(bs), options...) 104 | if err != nil { 105 | return "", err 106 | } 107 | return text, nil 108 | } 109 | 110 | var ( 111 | spacingRe = regexp.MustCompile(`[ \r\n\t]+`) 112 | newlineRe = regexp.MustCompile(`\n\n+`) 113 | ) 114 | 115 | // traverseTableCtx holds text-related context. 116 | type textifyTraverseContext struct { 117 | buf bytes.Buffer 118 | 119 | prefix string 120 | tableCtx tableTraverseContext 121 | options Options 122 | endsWithSpace bool 123 | justClosedDiv bool 124 | blockquoteLevel int 125 | lineLength int 126 | isPre bool 127 | isVirtualBQ bool // virtual blockquote 128 | } 129 | 130 | // tableTraverseContext holds table ASCII-form related context. 
131 | type tableTraverseContext struct { 132 | header []string 133 | body [][]string 134 | footer []string 135 | tmpRow int 136 | isInFooter bool 137 | } 138 | 139 | func (tableCtx *tableTraverseContext) init() { 140 | tableCtx.body = [][]string{} 141 | tableCtx.header = []string{} 142 | tableCtx.footer = []string{} 143 | tableCtx.isInFooter = false 144 | tableCtx.tmpRow = 0 145 | } 146 | 147 | func (ctx *textifyTraverseContext) handleElement(node *html.Node) error { 148 | ctx.justClosedDiv = false 149 | 150 | switch node.DataAtom { 151 | case atom.Br: 152 | return ctx.emit("\n") 153 | 154 | case atom.H1, atom.H2, atom.H3: 155 | subCtx := textifyTraverseContext{} 156 | if err := subCtx.traverseChildren(node); err != nil { 157 | return err 158 | } 159 | 160 | str := subCtx.buf.String() 161 | dividerLen := 0 162 | for _, line := range strings.Split(str, "\n") { 163 | if lineLen := len([]rune(line)); lineLen-1 > dividerLen { 164 | dividerLen = lineLen - 1 165 | } 166 | } 167 | var divider string 168 | if node.DataAtom == atom.H1 { 169 | divider = strings.Repeat("*", dividerLen) 170 | } else { 171 | divider = strings.Repeat("-", dividerLen) 172 | } 173 | 174 | if node.DataAtom == atom.H3 { 175 | return ctx.emit("\n\n" + str + "\n" + divider + "\n\n") 176 | } 177 | return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n") 178 | 179 | case atom.Blockquote: 180 | if ctx.buf.Len() == 0 && !ctx.isVirtualBQ { // do not apply blockquote if full html is blockquote 181 | return ctx.traverseChildren(node) 182 | } 183 | 184 | ctx.blockquoteLevel++ 185 | ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " " 186 | // following lines are disabled, otherwise it outputs 2x empty '>' lines 187 | //if err := ctx.emit("\n"); err != nil { 188 | // return err 189 | //} 190 | //if ctx.blockquoteLevel == 1 { 191 | // if err := ctx.emit("\n"); err != nil { 192 | // return err 193 | // } 194 | //} 195 | if err := ctx.traverseChildren(node); err != nil { 196 | return err 
197 | } 198 | ctx.blockquoteLevel-- 199 | ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) 200 | if ctx.blockquoteLevel > 0 { 201 | ctx.prefix += " " 202 | } 203 | // to remove the last "> " (or multiple on levels) added we would have to make some magic with the ctx.buf 204 | return ctx.emit("\n\n") 205 | 206 | case atom.Div: 207 | // hack
as blockquote 208 | for _, attr := range node.Attr { 209 | if attr.Key == "class" && attr.Val == "quote_container" { 210 | node.DataAtom = atom.Blockquote 211 | ctx.isVirtualBQ = true 212 | err := ctx.handleElement(node) 213 | ctx.isVirtualBQ = false 214 | return err 215 | } 216 | } 217 | 218 | if ctx.lineLength > 0 { 219 | if err := ctx.emit("\n"); err != nil { 220 | return err 221 | } 222 | } 223 | if err := ctx.traverseChildren(node); err != nil { 224 | return err 225 | } 226 | var err error 227 | if !ctx.justClosedDiv { 228 | err = ctx.emit("\n") 229 | } 230 | ctx.justClosedDiv = true 231 | return err 232 | 233 | case atom.Li: 234 | if err := ctx.emit("* "); err != nil { 235 | return err 236 | } 237 | 238 | if err := ctx.traverseChildren(node); err != nil { 239 | return err 240 | } 241 | 242 | return ctx.emit("\n") 243 | 244 | case atom.B, atom.Strong: 245 | subCtx := textifyTraverseContext{} 246 | subCtx.endsWithSpace = true 247 | if err := subCtx.traverseChildren(node); err != nil { 248 | return err 249 | } 250 | str := subCtx.buf.String() 251 | return ctx.emit("*" + str + "*") 252 | 253 | case atom.A: 254 | linkText := "" 255 | // For simple link element content with single text node only, peek at the link text. 256 | if node.FirstChild != nil && node.FirstChild.NextSibling == nil && node.FirstChild.Type == html.TextNode { 257 | linkText = node.FirstChild.Data 258 | } 259 | 260 | // If image is the only child, take its alt text as the link text. 
261 | if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img { 262 | if altText := getAttrVal(img, "alt"); altText != "" { 263 | if err := ctx.emit(altText); err != nil { 264 | return err 265 | } 266 | } 267 | } else if err := ctx.traverseChildren(node); err != nil { 268 | return err 269 | } 270 | 271 | hrefLink := "" 272 | if attrVal := getAttrVal(node, "href"); attrVal != "" { 273 | attrVal = ctx.normalizeHrefLink(attrVal) 274 | // Don't print link href if it matches link element content or if the link is empty. 275 | if !ctx.options.OmitLinks && attrVal != "" && linkText != attrVal { 276 | hrefLink = "( " + attrVal + " )" 277 | } 278 | } 279 | 280 | return ctx.emit(hrefLink) 281 | 282 | case atom.P, atom.Ul: 283 | return ctx.paragraphHandler(node) 284 | 285 | case atom.Table, atom.Tfoot, atom.Th, atom.Tr, atom.Td: 286 | if ctx.options.PrettyTables { 287 | return ctx.handleTableElement(node) 288 | } else if node.DataAtom == atom.Table { 289 | return ctx.paragraphHandler(node) 290 | } 291 | return ctx.traverseChildren(node) 292 | 293 | case atom.Pre, atom.Code: 294 | ctx.isPre = true 295 | err := ctx.traverseChildren(node) 296 | ctx.isPre = false 297 | return err 298 | 299 | case atom.Style, atom.Script, atom.Head: 300 | // Ignore the subtree. 301 | return nil 302 | 303 | case atom.Noscript: 304 | // Because of bug https://github.com/golang/go/issues/16318 we have to remove the whole content in