├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── [MS-DOC]-170112.docx ├── clx.go ├── clx_test.go ├── doc conversion.docx ├── doc.go ├── doc_test.go ├── fib.go ├── fib_test.go ├── fibrgfclcb97.txt ├── fibrglw97.txt ├── go.mod ├── go.sum ├── plcFld.go └── testData ├── docFile.doc └── simpleDoc.doc /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | *.out 6 | 7 | # Folders 8 | _obj 9 | _test 10 | testData 11 | 12 | # Architecture specific extensions/prefixes 13 | *.[568vq] 14 | [568vq].out 15 | 16 | *.cgo1.go 17 | *.cgo2.c 18 | _cgo_defun.c 19 | _cgo_gotypes.go 20 | _cgo_export.* 21 | 22 | _testmain.go 23 | 24 | *.exe 25 | *.test 26 | *.prof 27 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - tip 5 | 6 | script: 7 | - go build ./... 
8 | - go test -v -short -covermode=count -coverprofile=coverage.out 9 | 10 | before_install: 11 | - go get golang.org/x/tools/cmd/cover 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 EndFirst LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # doc2txt 2 | [![Build Status](https://travis-ci.org/EndFirstCorp/doc2txt.svg?branch=master)](https://travis-ci.org/EndFirstCorp/doc2txt) [![Coverage Status](https://coveralls.io/repos/github/EndFirstCorp/doc2txt/badge.svg?branch=master)](https://coveralls.io/github/EndFirstCorp/doc2txt?branch=master) 3 | 4 | A native Go reader for the old Microsoft Word .doc binary format files 5 | 6 | Example usage: 7 | 8 | ```go 9 | f, _ := os.Open(`testData\simpleDoc.doc`) 10 | buf, err := ParseDoc(f) 11 | if err != nil { 12 | // handle error 13 | } 14 | // buf now contains an io.Reader which you can save to the file system or further transform 15 | ``` 16 | 17 | ## Special Thanks 18 | A great big thank you to Richard Lehane. His [https://github.com/richardlehane/mscfb](https://github.com/richardlehane/mscfb) got me started, his [https://github.com/richardlehane/doctool](https://github.com/richardlehane/doctool) project got me closer and his answers to questions via email helped get me to the finish line. Thanks Richard! 
-------------------------------------------------------------------------------- /[MS-DOC]-170112.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robarchibald/doc2txt/5d2d4043bb03dcfe653a2c03fc8b863b90282911/[MS-DOC]-170112.docx -------------------------------------------------------------------------------- /clx.go: -------------------------------------------------------------------------------- 1 | package doc2txt 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | 7 | "github.com/richardlehane/mscfb" 8 | ) 9 | 10 | var ( 11 | errInvalidPrc = errors.New("Invalid Prc structure") 12 | errInvalidClx = errors.New("expected last aCP value to equal fib.cpLength (2.8.35)") 13 | errInvalidPcdt = errors.New("expected clxt to be equal 0x02") 14 | ) 15 | 16 | type clx struct { 17 | pcdt pcdt 18 | } 19 | 20 | type pcdt struct { 21 | lcb int 22 | PlcPcd plcPcd 23 | } 24 | 25 | type plcPcd struct { 26 | aCP []int 27 | aPcd []pcd 28 | } 29 | 30 | type pcd struct { 31 | fc fcCompressed 32 | } 33 | 34 | type fcCompressed struct { 35 | fc int 36 | fCompressed bool 37 | } 38 | 39 | // read Clx (section 2.9.38) 40 | func getClx(table *mscfb.File, fib *fib) (*clx, error) { 41 | if table == nil || fib == nil { 42 | return nil, errInvalidArgument 43 | } 44 | b, err := readClx(table, fib) 45 | if err != nil { 46 | return nil, err 47 | } 48 | 49 | pcdtOffset, err := getPrcArrayEnd(b) 50 | if err != nil { 51 | return nil, err 52 | } 53 | 54 | pcdt, err := getPcdt(b, pcdtOffset) 55 | if err != nil { 56 | return nil, err 57 | } 58 | 59 | if pcdt.PlcPcd.aCP[len(pcdt.PlcPcd.aCP)-1] != fib.fibRgLw.cpLength { 60 | return nil, errInvalidClx 61 | } 62 | 63 | return &clx{pcdt: *pcdt}, nil 64 | } 65 | 66 | func readClx(table *mscfb.File, fib *fib) ([]byte, error) { 67 | b := make([]byte, fib.fibRgFcLcb.lcbClx) 68 | _, err := table.ReadAt(b, int64(fib.fibRgFcLcb.fcClx)) 69 | if err != nil { 70 | return nil, err 71 | } 72 | 
return b, nil 73 | } 74 | 75 | // read Pcdt from Clx (section 2.9.178) 76 | func getPcdt(clx []byte, pcdtOffset int) (*pcdt, error) { 77 | const pcdSize = 8 78 | if clx[pcdtOffset] != 0x02 { // clxt must be 0x02 or invalid 79 | return nil, errInvalidPcdt 80 | } 81 | lcb := int(binary.LittleEndian.Uint32(clx[pcdtOffset+1 : pcdtOffset+5])) // skip clxt, get lcb 82 | plcPcdOffset := pcdtOffset + 5 // skip clxt and lcb 83 | numPcds := (lcb - 4) / (4 + pcdSize) // see 2.2.2 in the spec for equation 84 | numCps := numPcds + 1 // always 1 more cp than pcds 85 | 86 | cps := make([]int, numCps) 87 | for i := 0; i < numCps; i++ { 88 | cpOffset := plcPcdOffset + i*4 89 | cps[i] = int(binary.LittleEndian.Uint32(clx[cpOffset : cpOffset+4])) 90 | } 91 | 92 | pcdStart := plcPcdOffset + 4*numCps 93 | pcds := make([]pcd, numPcds) 94 | for i := 0; i < numPcds; i++ { 95 | pcdOffset := pcdStart + i*pcdSize 96 | pcds[i] = *parsePcd(clx[pcdOffset : pcdOffset+pcdSize]) 97 | } 98 | return &pcdt{lcb: lcb, PlcPcd: plcPcd{aCP: cps, aPcd: pcds}}, nil 99 | } 100 | 101 | // find end of RgPrc array (section 2.9.38) 102 | func getPrcArrayEnd(clx []byte) (int, error) { 103 | prcOffset := 0 104 | count := 0 105 | for { 106 | clxt := clx[prcOffset] 107 | if clxt != 0x01 { // this is not a Prc, so exit 108 | return prcOffset, nil 109 | } 110 | prcDataCbGrpprl := binary.LittleEndian.Uint16(clx[prcOffset+1 : prcOffset+3]) // skip the clxt and read 2 bytes 111 | prcOffset += 1 + 2 + int(prcDataCbGrpprl) // skip clxt, cbGrpprl, and GrpPrl 112 | 113 | if count > 10000 || prcDataCbGrpprl <= 0 || prcOffset+3 > len(clx) { // ensure no infinite loop 114 | return 0, errInvalidPrc 115 | } 116 | count++ 117 | } 118 | } 119 | 120 | // parse Pcd (section 2.9.177) 121 | func parsePcd(pcdData []byte) *pcd { 122 | return &pcd{fc: *parseFcCompressed(pcdData[2:6])} 123 | } 124 | 125 | // parse FcCompressed (section 2.9.73) 126 | func parseFcCompressed(fcData []byte) *fcCompressed { 127 | fCompressed := fcData[3]&64 == 
64 // check fcompressed value (second bit from lestmost of the last byte in fcdata) 128 | fcData[3] = fcData[3] & 63 // clear the fcompressed value from data 129 | fc := binary.LittleEndian.Uint32(fcData) // word doc generally uses little endian order (1.3.7) 130 | return &fcCompressed{fc: int(fc), fCompressed: fCompressed} 131 | } 132 | -------------------------------------------------------------------------------- /clx_test.go: -------------------------------------------------------------------------------- 1 | package doc2txt 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | 7 | "github.com/richardlehane/mscfb" 8 | ) 9 | 10 | func init() { 11 | var err error 12 | 13 | f, err := os.Open(`testData/simpleDoc.doc`) 14 | if err != nil { 15 | panic(err) 16 | } 17 | 18 | reader, err = mscfb.New(f) 19 | if err != nil { 20 | panic(err) 21 | } 22 | 23 | simpleDoc, _, table = getWordDocAndTables(reader) 24 | } 25 | 26 | func TestGetClx(t *testing.T) { 27 | // invalid argument(s) 28 | _, err := getClx(nil, nil) 29 | if err != errInvalidArgument { 30 | t.Error("expected invalid argument", err) 31 | } 32 | // can't read empty table 33 | f := &fib{fibRgFcLcb: fibRgFcLcb{fcClx: 12, lcbClx: 21}} 34 | if _, err = getClx(reader.File[0], f); err == nil { 35 | t.Error("expected error reading") 36 | } 37 | // invalid read location, invalid data 38 | if _, err = getClx(table, f); err != errInvalidPcdt { 39 | t.Error("expected error reading", err) 40 | } 41 | // all correct, but cpLength 42 | f = &fib{fibRgFcLcb: fibRgFcLcb{fcClx: 5279, lcbClx: 21}} 43 | if _, err = getClx(table, f); err != errInvalidClx { 44 | t.Error("expected error reading", err) 45 | } 46 | 47 | // values come from a successful parse of simpleDoc.doc 48 | f.fibRgLw = fibRgLw{cpLength: 6} 49 | clx, err := getClx(table, f) 50 | if err != nil || clx.pcdt.lcb != 16 || len(clx.pcdt.PlcPcd.aCP) != 2 || len(clx.pcdt.PlcPcd.aPcd) != 1 || 51 | clx.pcdt.PlcPcd.aCP[0] != 0 || clx.pcdt.PlcPcd.aCP[1] != 6 || 52 | 
clx.pcdt.PlcPcd.aPcd[0].fc.fc != 4096 || clx.pcdt.PlcPcd.aPcd[0].fc.fCompressed != true { 53 | t.Error("expected valid clx", clx, err) 54 | } 55 | } 56 | 57 | func TestGetPrcArrayEnd(t *testing.T) { 58 | //skip since it is not a Prc 59 | clx := []byte{2, 0, 0, 0} 60 | if num, _ := getPrcArrayEnd(clx); num != 0 { 61 | t.Error("expected to be set to beginning") 62 | } 63 | // error due to zero offset with valid Prc clxt 64 | clx = []byte{1, 0, 0, 0} 65 | if _, err := getPrcArrayEnd(clx); err != errInvalidPrc { 66 | t.Error("expected to revert to 0 due to invalid value", err) 67 | } 68 | // error since next offset would be too large 69 | clx = []byte{1, 4, 4, 0} 70 | if _, err := getPrcArrayEnd(clx); err != errInvalidPrc { 71 | t.Error("expected to revert to 0 due to invalid value", err) 72 | } 73 | // two items 74 | clx = []byte{1, 2, 0, 0, 0, 1, 2, 0, 0, 2, 2, 2, 2} 75 | if num, err := getPrcArrayEnd(clx); err != nil || num != 10 { 76 | t.Error("expected to revert to 0 due to invalid value", err, num) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /doc conversion.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robarchibald/doc2txt/5d2d4043bb03dcfe653a2c03fc8b863b90282911/doc conversion.docx -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | package doc2txt 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "errors" 7 | "io" 8 | 9 | "github.com/mattetti/filebuffer" 10 | "github.com/richardlehane/mscfb" 11 | ) 12 | 13 | var ( 14 | errTable = errors.New("cannot find table stream") 15 | errDocEmpty = errors.New("WordDocument not found") 16 | errDocShort = errors.New("wordDoc block too short") 17 | errInvalidArgument = errors.New("invalid table and/or fib") 18 | ) 19 | 20 | type allReader interface { 21 | io.Closer 22 | 
io.ReaderAt 23 | io.ReadSeeker 24 | } 25 | 26 | func wrapError(e error) error { 27 | return errors.New("Error processing file: " + e.Error()) 28 | } 29 | 30 | // ParseDoc converts a standard io.Reader from a Microsoft Word 31 | // .doc binary file and returns a reader (actually a bytes.Buffer) 32 | // which will output the plain text found in the .doc file 33 | func ParseDoc(r io.Reader) (io.Reader, error) { 34 | ra, ok := r.(io.ReaderAt) 35 | if !ok { 36 | ra, _, err := toMemoryBuffer(r) 37 | if err != nil { 38 | return nil, wrapError(err) 39 | } 40 | defer ra.Close() 41 | } 42 | 43 | d, err := mscfb.New(ra) 44 | if err != nil { 45 | return nil, wrapError(err) 46 | } 47 | 48 | wordDoc, table0, table1 := getWordDocAndTables(d) 49 | fib, err := getFib(wordDoc) 50 | if err != nil { 51 | return nil, wrapError(err) 52 | } 53 | 54 | table := getActiveTable(table0, table1, fib) 55 | if table == nil { 56 | return nil, wrapError(errTable) 57 | } 58 | 59 | clx, err := getClx(table, fib) 60 | if err != nil { 61 | return nil, wrapError(err) 62 | } 63 | 64 | return getText(wordDoc, clx) 65 | } 66 | 67 | func toMemoryBuffer(r io.Reader) (allReader, int64, error) { 68 | var b bytes.Buffer 69 | size, err := b.ReadFrom(r) 70 | if err != nil { 71 | return nil, 0, err 72 | } 73 | fb := filebuffer.New(b.Bytes()) 74 | return fb, size, nil 75 | } 76 | 77 | func getText(wordDoc *mscfb.File, clx *clx) (io.Reader, error) { 78 | var buf bytes.Buffer 79 | for i := 0; i < len(clx.pcdt.PlcPcd.aPcd); i++ { 80 | pcd := clx.pcdt.PlcPcd.aPcd[i] 81 | cp := clx.pcdt.PlcPcd.aCP[i] 82 | cpNext := clx.pcdt.PlcPcd.aCP[i+1] 83 | 84 | var start, end, size int 85 | if pcd.fc.fCompressed { 86 | size = 1 87 | start = pcd.fc.fc / 2 88 | end = start + cpNext - cp 89 | } else { 90 | size = 2 91 | start = pcd.fc.fc 92 | end = start + 2*(cpNext-cp) 93 | } 94 | 95 | b := make([]byte, end-start) 96 | _, err := wordDoc.ReadAt(b, int64(start/size)) // read all the characters 97 | if err != nil { 98 | return nil, err 
// translateText appends the readable text contained in b to buf as UTF-8.
//
// b holds one byte per character when fCompressed is true and little-endian
// 16-bit code units otherwise; a trailing odd byte of uncompressed data is
// ignored. Field characters (section 2.8.25) are handled per call: text
// between 0x13 and the next 0x14 or 0x15 is dropped, and the three marker
// characters themselves are never written. 0x07 (table column separator) is
// written as a space and all other control characters except tab, line feed
// and carriage return are skipped.
func translateText(b []byte, buf *bytes.Buffer, fCompressed bool) {
	step := 2
	if fCompressed {
		step = 1
	}

	inFieldCode := false
	var highSurrogate rune // pending first half of a UTF-16 surrogate pair
	for i := 0; i+step <= len(b); i += step {
		var unit uint16
		if fCompressed {
			unit = uint16(b[i])
		} else {
			unit = binary.LittleEndian.Uint16(b[i:])
		}
		pending := highSurrogate
		highSurrogate = 0

		// Handle special field characters (section 2.8.25)
		switch unit {
		case 0x13:
			inFieldCode = true
			continue
		case 0x14, 0x15:
			inFieldCode = false
			continue
		}
		if inFieldCode {
			continue
		}

		switch {
		case unit == 7: // table column separator
			buf.WriteByte(' ')
		case unit < 32 && unit != 9 && unit != 10 && unit != 13:
			// skip non-printable ASCII characters
		case fCompressed:
			buf.Write(replaceCompressed(byte(unit)))
		case unit >= 0xD800 && unit <= 0xDBFF:
			highSurrogate = rune(unit) // wait for the low half
		case unit >= 0xDC00 && unit <= 0xDFFF:
			if pending == 0 {
				buf.WriteRune(0xFFFD) // low half without a high half
				continue
			}
			buf.WriteRune(0x10000 + (pending-0xD800)<<10 + (rune(unit) - 0xDC00))
		default:
			buf.WriteRune(rune(unit))
		}
	}
}

// replaceCompressed returns the UTF-8 encoding of one character of compressed
// text. The bytes listed in the switch stand for the code points given there;
// every other byte is treated as the code point with the same value, so ASCII
// bytes are returned unchanged.
func replaceCompressed(char byte) []byte {
	r := rune(char)
	switch char {
	case 0x82:
		r = 0x201A
	case 0x83:
		r = 0x0192
	case 0x84:
		r = 0x201E
	case 0x85:
		r = 0x2026
	case 0x86:
		r = 0x2020
	case 0x87:
		r = 0x2021
	case 0x88:
		r = 0x02C6
	case 0x89:
		r = 0x2030
	case 0x8A:
		r = 0x0160
	case 0x8B:
		r = 0x2039
	case 0x8C:
		r = 0x0152
	case 0x91:
		r = 0x2018
	case 0x92:
		r = 0x2019
	case 0x93:
		r = 0x201C
	case 0x94:
		r = 0x201D
	case 0x95:
		r = 0x2022
	case 0x96:
		r = 0x2013
	case 0x97:
		r = 0x2014
	case 0x98:
		r = 0x02DC
	case 0x99:
		r = 0x2122
	case 0x9A:
		r = 0x0161
	case 0x9B:
		r = 0x203A
	case 0x9C:
		r = 0x0153
	case 0x9F:
		r = 0x0178
	}
	return []byte(string(r))
}
0x0178 191 | default: 192 | return []byte{char} 193 | } 194 | out := make([]byte, 2) 195 | binary.LittleEndian.PutUint16(out, v) 196 | return out 197 | } 198 | 199 | func getWordDocAndTables(r *mscfb.Reader) (*mscfb.File, *mscfb.File, *mscfb.File) { 200 | var wordDoc, table0, table1 *mscfb.File 201 | for i := 0; i < len(r.File); i++ { 202 | stream := r.File[i] 203 | 204 | switch stream.Name { 205 | case "WordDocument": 206 | wordDoc = stream 207 | case "0Table": 208 | table0 = stream 209 | case "1Table": 210 | table1 = stream 211 | } 212 | } 213 | return wordDoc, table0, table1 214 | } 215 | 216 | func getActiveTable(table0 *mscfb.File, table1 *mscfb.File, f *fib) *mscfb.File { 217 | if f.base.fWhichTblStm == 0 { 218 | return table0 219 | } 220 | return table1 221 | } 222 | -------------------------------------------------------------------------------- /doc_test.go: -------------------------------------------------------------------------------- 1 | package doc2txt 2 | 3 | import ( 4 | "bytes" 5 | "os" 6 | "strings" 7 | "testing" 8 | ) 9 | 10 | func TestParseSimpleDoc(t *testing.T) { 11 | f, _ := os.Open(`testData/simpleDoc.doc`) 12 | buf, err := ParseDoc(f) 13 | if err != nil { 14 | t.Fatal("expected successful parse", err) 15 | } 16 | if s := buf.(*bytes.Buffer).String(); s != "12345\r" { 17 | t.Errorf("expected correct value |%s|", s) 18 | } 19 | } 20 | 21 | func TestParseComplicated(t *testing.T) { 22 | f, _ := os.Open(`testData/docFile.doc`) 23 | buf, err := ParseDoc(f) 24 | if err != nil { 25 | t.Fatal("expected to be able to parse document", err) 26 | } 27 | 28 | var expected bytes.Buffer 29 | expected.WriteString(strings.Replace(complicatedDoc, "\n", "\r", -1)) 30 | 31 | actual := buf.(*bytes.Buffer) 32 | count := 0 33 | for aline, err := actual.ReadString('\r'); err == nil; aline, err = actual.ReadString('\r') { 34 | eline, err := expected.ReadString('\r') 35 | if err != nil || eline != aline { 36 | t.Errorf("mismatch at line %d. 
Expected: %s, Actual: %s\n", count, eline, aline) 37 | } 38 | count++ 39 | } 40 | } 41 | 42 | const complicatedDoc = `Name Here in Big 43 | Link to something 44 | 45 | 46 | 47 | Summary 48 | Testing out new things 49 | 50 | Bullet 1 51 | Bullet 2 52 | Bullet 3 53 | 54 | Underlined 55 | Italics 56 | 57 | Numbered list 58 | Item 1 59 | Item 2 60 | Item 3 61 | 62 | Some Information In a Table Hopefully, we get it 63 | 64 | 65 | Here is some information with a footnote 66 | Here is some information with an endnote 67 | 68 | Here is a table of contents 69 | 70 | Contents 71 | Some 1 72 | Information 1 73 | In a 1 74 | Table 1 75 | Hopefully, we 1 76 | get it 1 77 | 1 78 | 79 | 80 | 81 | Header 1 82 | Header 2 83 | Header 3 84 | 85 | Here is my footnote 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | Information in the header 96 | 97 | 98 | Some Footer information current date:8/7/2017 4:16:33 PM pg. 1 99 | 100 | 101 | My endnote 102 | 103 | Some info from inside a text box 104 | 105 | 106 | 107 | ` 108 | -------------------------------------------------------------------------------- /fib.go: -------------------------------------------------------------------------------- 1 | package doc2txt 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | 7 | "github.com/richardlehane/mscfb" 8 | ) 9 | 10 | var ( 11 | errFibInvalid = errors.New("file information block validation failed") 12 | ) 13 | 14 | type fib struct { 15 | base fibBase 16 | csw int 17 | fibRgW fibRgW 18 | cslw int 19 | fibRgLw fibRgLw 20 | cbRgFcLcb int 21 | fibRgFcLcb fibRgFcLcb 22 | } 23 | 24 | type fibBase struct { 25 | fWhichTblStm int 26 | } 27 | 28 | type fibRgW struct { 29 | } 30 | 31 | type fibRgLw struct { 32 | ccpText int 33 | ccpFtn int 34 | ccpHdd int 35 | ccpMcr int 36 | ccpAtn int 37 | ccpEdn int 38 | ccpTxbx int 39 | ccpHdrTxbx int 40 | cpLength int 41 | } 42 | 43 | type fibRgFcLcb struct { 44 | fcPlcfFldMom int 45 | lcbPlcfFldMom int 46 | fcPlcfFldHdr int 47 | lcbPlcfFldHdr int 48 | 
fcPlcfFldFtn int 49 | lcbPlcfFldFtn int 50 | fcPlcfFldAtn int 51 | lcbPlcfFldAtn int 52 | fcClx int 53 | lcbClx int 54 | } 55 | 56 | // parse File Information Block (section 2.5.1) 57 | func getFib(wordDoc *mscfb.File) (*fib, error) { 58 | if wordDoc == nil { 59 | return nil, errDocEmpty 60 | } 61 | 62 | b := make([]byte, 898) // get FIB block up to FibRgFcLcb97 63 | _, err := wordDoc.ReadAt(b, 0) 64 | if err != nil { 65 | return nil, err 66 | } 67 | 68 | fibBase := getFibBase(b[0:32]) 69 | 70 | fibRgW, csw, err := getFibRgW(b, 32) 71 | if err != nil { 72 | return nil, err 73 | } 74 | 75 | fibRgLw, cslw, err := getFibRgLw(b, 34+csw) 76 | if err != nil { 77 | return nil, err 78 | } 79 | 80 | fibRgFcLcb, cbRgFcLcb, err := getFibRgFcLcb(b, 34+csw+2+cslw) 81 | 82 | return &fib{base: *fibBase, csw: csw, cslw: cslw, fibRgW: *fibRgW, fibRgLw: *fibRgLw, fibRgFcLcb: *fibRgFcLcb, cbRgFcLcb: cbRgFcLcb}, err 83 | } 84 | 85 | // parse FibBase (section 2.5.2) 86 | func getFibBase(fib []byte) *fibBase { 87 | byt := fib[11] // fWhichTblStm is 2nd highest bit in this byte 88 | fWhichTblStm := int(byt >> 1 & 1) // set which table (0Table or 1Table) is the table stream 89 | return &fibBase{fWhichTblStm: fWhichTblStm} 90 | } 91 | 92 | func getFibRgW(fib []byte, start int) (*fibRgW, int, error) { 93 | if start+2 >= len(fib) { // must be big enough for csw 94 | return &fibRgW{}, 0, errFibInvalid 95 | } 96 | 97 | csw := int(binary.LittleEndian.Uint16(fib[start:start+2])) * 2 // in bytes 98 | return &fibRgW{}, csw, nil 99 | } 100 | 101 | // parse FibRgLw (section 2.5.4) 102 | func getFibRgLw(fib []byte, start int) (*fibRgLw, int, error) { 103 | fibRgLwStart := start + 2 // skip cslw 104 | if fibRgLwStart+88 >= len(fib) { // expect 88 bytes in fibRgLw 105 | return &fibRgLw{}, 0, errFibInvalid 106 | } 107 | 108 | cslw := getInt16(fib, start) * 4 // in bytes 109 | ccpText := getInt(fib, fibRgLwStart+3*4) 110 | ccpFtn := getInt(fib, fibRgLwStart+4*4) 111 | ccpHdd := getInt(fib, 
fibRgLwStart+5*4) 112 | ccpMcr := getInt(fib, fibRgLwStart+6*4) 113 | ccpAtn := getInt(fib, fibRgLwStart+7*4) 114 | ccpEdn := getInt(fib, fibRgLwStart+8*4) 115 | ccpTxbx := getInt(fib, fibRgLwStart+9*4) 116 | ccpHdrTxbx := getInt(fib, fibRgLwStart+10*4) 117 | 118 | // calculate cpLength. Used in PlcPcd verification (see section 2.8.35) 119 | var cpLength int 120 | if ccpFtn != 0 || ccpHdd != 0 || ccpMcr != 0 || ccpAtn != 0 || ccpEdn != 0 || ccpTxbx != 0 || ccpHdrTxbx != 0 { 121 | cpLength = ccpFtn + ccpHdd + ccpMcr + ccpAtn + ccpEdn + ccpTxbx + ccpHdrTxbx + ccpText + 1 122 | } else { 123 | cpLength = ccpText 124 | } 125 | return &fibRgLw{ccpText: ccpText, ccpFtn: ccpFtn, ccpHdd: ccpHdd, ccpMcr: ccpMcr, ccpAtn: ccpAtn, 126 | ccpEdn: ccpEdn, ccpTxbx: ccpTxbx, ccpHdrTxbx: ccpHdrTxbx, cpLength: cpLength}, cslw, nil 127 | } 128 | 129 | // parse FibRgFcLcb (section 2.5.5) 130 | func getFibRgFcLcb(fib []byte, start int) (*fibRgFcLcb, int, error) { 131 | fibRgFcLcbStart := start + 2 // skip cbRgFcLcb 132 | if fibRgFcLcbStart+186*4 < len(fib) { // expect 186+ values in FibRgFcLcb 133 | return &fibRgFcLcb{}, 0, errFibInvalid 134 | } 135 | 136 | cbRgFcLcb := getInt16(fib, start) 137 | fcPlcfFldMom := getInt(fib, fibRgFcLcbStart+32*4) 138 | lcbPlcfFldMom := getInt(fib, fibRgFcLcbStart+33*4) 139 | fcPlcfFldHdr := getInt(fib, fibRgFcLcbStart+34*4) 140 | lcbPlcfFldHdr := getInt(fib, fibRgFcLcbStart+35*4) 141 | fcPlcfFldFtn := getInt(fib, fibRgFcLcbStart+36*4) 142 | lcbPlcfFldFtn := getInt(fib, fibRgFcLcbStart+37*4) 143 | fcPlcfFldAtn := getInt(fib, fibRgFcLcbStart+38*4) 144 | lcbPlcfFldAtn := getInt(fib, fibRgFcLcbStart+39*4) 145 | fcClx := getInt(fib, fibRgFcLcbStart+66*4) 146 | lcbClx := getInt(fib, fibRgFcLcbStart+67*4) 147 | return &fibRgFcLcb{fcPlcfFldMom: fcPlcfFldMom, lcbPlcfFldMom: lcbPlcfFldMom, fcPlcfFldHdr: fcPlcfFldHdr, lcbPlcfFldHdr: lcbPlcfFldHdr, 148 | fcPlcfFldFtn: fcPlcfFldFtn, lcbPlcfFldFtn: lcbPlcfFldFtn, fcPlcfFldAtn: fcPlcfFldAtn, lcbPlcfFldAtn: 
lcbPlcfFldAtn, 149 | fcClx: fcClx, lcbClx: lcbClx}, cbRgFcLcb, nil 150 | } 151 | 152 | func getInt16(buf []byte, start int) int { 153 | return int(binary.LittleEndian.Uint16(buf[start : start+2])) 154 | } 155 | func getInt(buf []byte, start int) int { 156 | return int(binary.LittleEndian.Uint32(buf[start : start+4])) 157 | } 158 | -------------------------------------------------------------------------------- /fib_test.go: -------------------------------------------------------------------------------- 1 | package doc2txt 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | 7 | "github.com/richardlehane/mscfb" 8 | ) 9 | 10 | var simpleDoc *mscfb.File 11 | var table *mscfb.File 12 | var reader *mscfb.Reader 13 | 14 | func init() { 15 | f, _ := os.Open(`testData/simpleDoc.doc`) 16 | reader, _ = mscfb.New(f) 17 | simpleDoc, _, table = getWordDocAndTables(reader) 18 | } 19 | 20 | func TestGetFib(t *testing.T) { 21 | _, err := getFib(nil) 22 | if err != errDocEmpty { 23 | t.Error("Expected error due to empty WordDoc") 24 | } 25 | if _, err := getFib(reader.File[2]); err == nil { // short mscfb.File 26 | t.Error("expected error due to short file", err) 27 | } 28 | if _, err = getFib(reader.File[4]); err != errFibInvalid { // use wrong mscfb.File 29 | t.Error("expected error due to corrupt file", err) 30 | } 31 | 32 | fib, _ := getFib(simpleDoc) 33 | if fib.csw != 28 || fib.cslw != 88 || fib.cbRgFcLcb != 0x00B7 { 34 | t.Error("expected valid sizes", fib.csw, fib.cslw, fib.cbRgFcLcb) 35 | } 36 | if fib.base.fWhichTblStm != 1 { 37 | t.Error("expected table 1") 38 | } 39 | // No headers in simpleDoc, just "12345" in the text which apparently makes a ccpText of 6 40 | // cpLength is calculated and should equal ccpText in this scenario 41 | if fib.fibRgLw.ccpAtn != 0 || fib.fibRgLw.ccpEdn != 0 || fib.fibRgLw.ccpFtn != 0 || fib.fibRgLw.ccpHdd != 0 || fib.fibRgLw.ccpHdrTxbx != 0 || 42 | fib.fibRgLw.ccpMcr != 0 || fib.fibRgLw.ccpText != 6 || fib.fibRgLw.cpLength != 6 { 43 | 
t.Error("expected valid fibRgLw", fib.fibRgLw) 44 | } 45 | // These are the values in the byte stream at the correct locations 46 | if fib.fibRgFcLcb.fcClx != 5279 || fib.fibRgFcLcb.lcbClx != 21 { 47 | t.Error("expected valid fibRgFcLcb", fib.fibRgFcLcb) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /fibrgfclcb97.txt: -------------------------------------------------------------------------------- 1 | fcStshfOrig 2 | lcbStshfOrig 3 | fcStshf 4 | lcbStshf 5 | fcPlcffndRef 6 | lcbPlcffndRef 7 | fcPlcffndTxt 8 | lcbPlcffndTxt 9 | fcPlcfandRef 10 | lcbPlcfandRef 11 | fcPlcfandTxt 12 | lcbPlcfandTxt 13 | fcPlcfSed 14 | lcbPlcfSed 15 | fcPlcPad 16 | lcbPlcPad 17 | fcPlcfPhe 18 | lcbPlcfPhe 19 | fcSttbfGlsy 20 | lcbSttbfGlsy 21 | fcPlcfGlsy 22 | lcbPlcfGlsy 23 | fcPlcfHdd 24 | lcbPlcfHdd 25 | fcPlcfBteChpx 26 | lcbPlcfBteChpx 27 | fcPlcfBtePapx 28 | lcbPlcfBtePapx 29 | fcPlcfSea 30 | lcbPlcfSea 31 | fcSttbfFfn 32 | lcbSttbfFfn 33 | fcPlcfFldMom * offset of main doc fields. 
Note: could use this to dig further 34 | lcbPlcfFldMom * size of main doc fields (offset calculated by 33 * 4 + 154) 35 | fcPlcfFldHdr * offset of header/footer fields 36 | lcbPlcfFldHdr * size header/footer fields 37 | fcPlcfFldFtn * offset of footnote fields 38 | lcbPlcfFldFtn * size footnote fields 39 | fcPlcfFldAtn * offset of comment fields 40 | lcbPlcfFldAtn * size comment fields 41 | fcPlcfFldMcr * not used 42 | lcbPlcfFldMcr * not used 43 | fcSttbfBkmk 44 | lcbSttbfBkmk 45 | fcPlcfBkf 46 | lcbPlcfBkf 47 | fcPlcfBkl 48 | lcbPlcfBkl 49 | fcCmds 50 | lcbCmds 51 | fcUnused1 52 | lcbUnused1 53 | fcSttbfMcr 54 | lcbSttbfMcr 55 | fcPrDrvr 56 | lcbPrDrvr 57 | fcPrEnvPort 58 | lcbPrEnvPort 59 | fcPrEnvLand 60 | lcbPrEnvLand 61 | fcWss 62 | lcbWss 63 | fcDop 64 | lcbDop 65 | fcSttbfAssoc 66 | lcbSttbfAssoc 67 | fcClx * offset of the Clx portion of the document 68 | lcbClx * size of the Clx portion of the document 69 | fcPlcfPgdFtn 70 | lcbPlcfPgdFtn 71 | fcAutosaveSource 72 | lcbAutosaveSource 73 | fcGrpXstAtnOwners 74 | lcbGrpXstAtnOwners 75 | fcSttbfAtnBkmk 76 | lcbSttbfAtnBkmk 77 | fcUnused2 78 | lcbUnused2 79 | fcUnused3 80 | lcbUnused3 81 | fcPlcSpaMom 82 | lcbPlcSpaMom 83 | fcPlcSpaHdr 84 | lcbPlcSpaHdr 85 | fcPlcfAtnBkf 86 | lcbPlcfAtnBkf 87 | fcPlcfAtnBkl 88 | lcbPlcfAtnBkl 89 | fcPms 90 | lcbPms 91 | fcFormFldSttbs 92 | lcbFormFldSttbs 93 | fcPlcfendRef 94 | lcbPlcfendRef 95 | fcPlcfendTxt 96 | lcbPlcfendTxt 97 | fcPlcfFldEdn * (96*4 + 154) 98 | lcbPlcfFldEdn * 99 | fcUnused4 100 | lcbUnused4 101 | fcDggInfo 102 | lcbDggInfo 103 | fcSttbfRMark 104 | lcbSttbfRMark 105 | fcSttbfCaption 106 | lcbSttbfCaption 107 | fcSttbfAutoCaption 108 | lcbSttbfAutoCaption 109 | fcPlcfWkb 110 | lcbPlcfWkb 111 | fcPlcfSpl 112 | lcbPlcfSpl 113 | fcPlcftxbxTxt 114 | lcbPlcftxbxTxt 115 | fcPlcfFldTxbx * (116*4 + 154) 116 | lcbPlcfFldTxbx * 117 | fcPlcfHdrtxbxTxt 118 | lcbPlcfHdrtxbxTxt 119 | fcPlcffldHdrTxbx * (118*4+154) 120 | lcbPlcffldHdrTxbx * 121 | fcStwUser 122 | lcbStwUser 
123 | fcSttbTtmbd 124 | lcbSttbTtmbd 125 | fcCookieData 126 | lcbCookieData 127 | fcPgdMotherOldOld 128 | lcbPgdMotherOldOld 129 | fcBkdMotherOldOld 130 | lcbBkdMotherOldOld 131 | fcPgdFtnOldOld 132 | lcbPgdFtnOldOld 133 | fcBkdFtnOldOld 134 | lcbBkdFtnOldOld 135 | fcPgdEdnOldOld 136 | lcbPgdEdnOldOld 137 | fcBkdEdnOldOld 138 | lcbBkdEdnOldOld 139 | fcSttbfIntlFld 140 | lcbSttbfIntlFld 141 | fcRouteSlip 142 | lcbRouteSlip 143 | fcSttbSavedBy 144 | lcbSttbSavedBy 145 | fcSttbFnm 146 | lcbSttbFnm 147 | fcPlfLst 148 | lcbPlfLst 149 | fcPlfLfo 150 | lcbPlfLfo 151 | fcPlcfTxbxBkd 152 | lcbPlcfTxbxBkd 153 | fcPlcfTxbxHdrBkd 154 | lcbPlcfTxbxHdrBkd 155 | fcDocUndoWord9 156 | lcbDocUndoWord9 157 | fcRgbUse 158 | lcbRgbUse 159 | fcUsp 160 | lcbUsp 161 | fcUskf 162 | lcbUskf 163 | fcPlcupcRgbUse 164 | lcbPlcupcRgbUse 165 | fcPlcupcUsp 166 | lcbPlcupcUsp 167 | fcSttbGlsyStyle 168 | lcbSttbGlsyStyle 169 | fcPlgosl 170 | lcbPlgosl 171 | fcPlcocx 172 | lcbPlcocx 173 | fcPlcfBteLvc 174 | lcbPlcfBteLvc 175 | dwLowDateTime 176 | dwHighDateTime 177 | fcPlcfLvcPre10 178 | lcbPlcfLvcPre10 179 | fcPlcfAsumy 180 | lcbPlcfAsumy 181 | fcPlcfGram 182 | lcbPlcfGram 183 | fcSttbListNames 184 | lcbSttbListNames 185 | fcSttbfUssr 186 | lcbSttbfUssr -------------------------------------------------------------------------------- /fibrglw97.txt: -------------------------------------------------------------------------------- 1 | cbMac 2 | reserved1 3 | reserved2 4 | ccpText 64+3*4 5 | ccpFtn 64+4*4 6 | ccpHdd 64+5*4 7 | reserved3 8 | ccpAtn 64+7*4 9 | ccpEdn 64+8*4 10 | ccpTxbx 64+9*4 11 | ccpHdrTxbx 64+10*4 12 | reserved4 13 | reserved5 14 | reserved6 15 | reserved7 16 | reserved8 17 | reserved9 18 | reserved10 19 | reserved11 20 | reserved12 21 | reserved13 22 | reserved14 -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/EndFirstCorp/doc2txt 2 | 3 | go 
1.13 4 | 5 | require ( 6 | github.com/mattetti/filebuffer v1.0.0 7 | github.com/richardlehane/mscfb v1.0.3 8 | ) 9 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/mattetti/filebuffer v1.0.0 h1:ixTvQ0JjBTwWbdpDZ98lLrydo7KRi8xNRIi5RFszsbY= 2 | github.com/mattetti/filebuffer v1.0.0/go.mod h1:X6nyAIge2JGVmuJt2MFCqmHrb/5IHiphfHtot0s5cnI= 3 | github.com/richardlehane/mscfb v1.0.3 h1:rD8TBkYWkObWO0oLDFCbwMeZ4KoalxQy+QgniCj3nKI= 4 | github.com/richardlehane/mscfb v1.0.3/go.mod h1:YzVpcZg9czvAuhk9T+a3avCpcFPMUWm7gK3DypaEsUk= 5 | github.com/richardlehane/msoleps v1.0.1 h1:RfrALnSNXzmXLbGct/P2b4xkFz4e8Gmj/0Vj9M9xC1o= 6 | github.com/richardlehane/msoleps v1.0.1/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg= 7 | -------------------------------------------------------------------------------- /plcFld.go: -------------------------------------------------------------------------------- 1 | package doc2txt 2 | 3 | /* I don't think I'm going to need this 4 | type plcFld struct { 5 | aCp []int 6 | aFld []fld 7 | } 8 | 9 | type fld struct { 10 | fldch int 11 | grffld int 12 | fieldtype string 13 | fNested bool 14 | fHasSep bool 15 | } 16 | 17 | func getPlcFld(table *mscfb.File, offset, size int) (*plcFld, error) { 18 | if table == nil { 19 | return nil, errInvalidArgument 20 | } 21 | b := make([]byte, size) 22 | _, err := table.ReadAt(b, int64(offset)) 23 | if err != nil { 24 | return nil, err 25 | } 26 | 27 | f, err := getFld(b) 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | return f, nil 33 | } 34 | 35 | func getFld(plc []byte) (*plcFld, error) { 36 | return nil, nil 37 | } 38 | 39 | func getFieldType(grffld byte) string { 40 | switch grffld { 41 | case 0x01: 42 | return "Not Named" 43 | case 0x02: 44 | return "Not Named" 45 | case 0x03: 46 | return "REF" 47 | case 0x05: 48 | return "FTNREF" 49 | case 0x06: 50 | return "SET" 
51 | case 0x07: 52 | return "IF" 53 | case 0x08: 54 | return "INDEX" 55 | case 0x0A: 56 | return "STYLEREF" 57 | case 0x0C: 58 | return "SEQ" 59 | case 0x0D: 60 | return "TOC" 61 | case 0x0E: 62 | return "INFO" 63 | case 0x0F: 64 | return "TITLE" 65 | case 0x10: 66 | return "SUBJECT" 67 | case 0x11: 68 | return "AUTHOR" 69 | case 0x12: 70 | return "KEYWORDS" 71 | case 0x13: 72 | return "COMMENTS" 73 | case 0x14: 74 | return "LASTSAVEDBY" 75 | case 0x15: 76 | return "CREATEDATE" 77 | case 0x16: 78 | return "SAVEDATE" 79 | case 0x17: 80 | return "PRINTDATE" 81 | case 0x18: 82 | return "REVNUM" 83 | case 0x19: 84 | return "EDITTIME" 85 | case 0x1A: 86 | return "NUMPAGES" 87 | case 0x1B: 88 | return "NUMWORDS" 89 | case 0x1C: 90 | return "NUMCHARS" 91 | case 0x1D: 92 | return "FILENAME" 93 | case 0x1E: 94 | return "TEMPLATE" 95 | case 0x1F: 96 | return "DATE" 97 | case 0x20: 98 | return "TIME" 99 | case 0x21: 100 | return "PAGE" 101 | case 0x22: 102 | return "=" 103 | case 0x23: 104 | return "QUOTE" 105 | case 0x24: 106 | return "INCLUDE" 107 | case 0x25: 108 | return "PAGEREF" 109 | case 0x26: 110 | return "ASK" 111 | case 0x27: 112 | return "FILLIN" 113 | case 0x28: 114 | return "DATA" 115 | case 0x29: 116 | return "NEXT" 117 | case 0x2A: 118 | return "NEXTIF" 119 | case 0x2B: 120 | return "SKIPIF" 121 | case 0x2C: 122 | return "MERGEREC" 123 | case 0x2D: 124 | return "DDE" 125 | case 0x2E: 126 | return "DDEAUTO" 127 | case 0x2F: 128 | return "GLOSSARY" 129 | case 0x30: 130 | return "PRINT" 131 | case 0x31: 132 | return "EQ" 133 | case 0x32: 134 | return "GOTOBUTTON" 135 | case 0x33: 136 | return "MACROBUTTON" 137 | case 0x34: 138 | return "AUTONUMOUT" 139 | case 0x35: 140 | return "AUTONUMLGL" 141 | case 0x36: 142 | return "AUTONUM" 143 | case 0x37: 144 | return "IMPORT" 145 | case 0x38: 146 | return "LINK" 147 | case 0x39: 148 | return "SYMBOL" 149 | case 0x3A: 150 | return "EMBED" 151 | case 0x3B: 152 | return "MERGEFIELD" 153 | case 0x3C: 154 | return "USERNAME" 
155 | case 0x3D: 156 | return "USERINITIALS" 157 | case 0x3E: 158 | return "USERADDRESS" 159 | case 0x3F: 160 | return "BARCODE" 161 | case 0x40: 162 | return "DOCVARIABLE" 163 | case 0x41: 164 | return "SECTION" 165 | case 0x42: 166 | return "SECTIONPAGES" 167 | case 0x43: 168 | return "INCLUDEPICTURE" 169 | case 0x44: 170 | return "INCLUDETEXT" 171 | case 0x45: 172 | return "FILESIZE" 173 | case 0x46: 174 | return "FORMTEXT" 175 | case 0x47: 176 | return "FORMCHECKBOX" 177 | case 0x48: 178 | return "NOTEREF" 179 | case 0x49: 180 | return "TOA" 181 | case 0x4B: 182 | return "MERGESEQ" 183 | case 0x4F: 184 | return "AUTOTEXT" 185 | case 0x50: 186 | return "COMPARE" 187 | case 0x51: 188 | return "ADDIN" 189 | case 0x53: 190 | return "FORMDROPDOWN" 191 | case 0x54: 192 | return "ADVANCE" 193 | case 0x55: 194 | return "DOCPROPERTY" 195 | case 0x57: 196 | return "CONTROL" 197 | case 0x58: 198 | return "HYPERLINK" 199 | case 0x59: 200 | return "AUTOTEXTLIST" 201 | case 0x5A: 202 | return "LISTNUM" 203 | case 0x5B: 204 | return "HTMLCONTROL" 205 | case 0x5C: 206 | return "BIDIOUTLINE" 207 | case 0x5D: 208 | return "ADDRESSBLOCK" 209 | case 0x5E: 210 | return "GREETINGLINE" 211 | case 0x5F: 212 | return "SHAPE" 213 | default: 214 | return "UNKNOWN" 215 | } 216 | } 217 | */ 218 | -------------------------------------------------------------------------------- /testData/docFile.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robarchibald/doc2txt/5d2d4043bb03dcfe653a2c03fc8b863b90282911/testData/docFile.doc -------------------------------------------------------------------------------- /testData/simpleDoc.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robarchibald/doc2txt/5d2d4043bb03dcfe653a2c03fc8b863b90282911/testData/simpleDoc.doc --------------------------------------------------------------------------------