├── .github └── workflows │ └── test.yaml ├── LICENSE.txt ├── README.md ├── file.go ├── file_test.go ├── fuzz.go ├── go.mod ├── go.sum ├── mscfb.go ├── mscfb_test.go └── test ├── .gitattributes ├── novpapplan.doc ├── test.doc ├── test.msg ├── test.ppt └── test.xls /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | # This workflow runs all the tests in ubuntu, windows and mac 2 | # environments. 3 | # 4 | # Trigger this workflow by pushing commits or by opening 5 | # a pull request. 6 | name: Test 7 | on: 8 | push: 9 | pull_request: 10 | types: 11 | - opened 12 | jobs: 13 | test: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ubuntu-latest, windows-latest, macos-latest] 18 | go: [ '1.x', '1.20', '1.19', '1.18'] 19 | steps: 20 | - name: Check out repository code 21 | uses: actions/checkout@v4 22 | - name: Install latest version of go 23 | uses: actions/setup-go@v5 24 | with: 25 | go-version: ${{ matrix.go }} 26 | - name: Run tests 27 | run: go test -v ./... -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A reader for Microsoft's Compound File Binary File Format. 2 | 3 | Example usage: 4 | 5 | file, _ := os.Open("test/test.doc") 6 | defer file.Close() 7 | doc, err := mscfb.New(file) 8 | if err != nil { 9 | log.Fatal(err) 10 | } 11 | for entry, err := doc.Next(); err == nil; entry, err = doc.Next() { 12 | buf := make([]byte, 512) 13 | i, _ := doc.Read(buf) 14 | if i > 0 { 15 | fmt.Println(buf[:i]) 16 | } 17 | fmt.Println(entry.Name) 18 | } 19 | 20 | The Compound File Binary File Format is also known as the Object Linking and Embedding (OLE) or Component Object Model (COM) format and was used by early MS software such as MS Office. 
See [http://msdn.microsoft.com/en-us/library/dd942138.aspx](http://msdn.microsoft.com/en-us/library/dd942138.aspx) for more details 21 | 22 | Install with `go get github.com/richardlehane/mscfb` 23 | 24 | [![Build Status](https://travis-ci.org/richardlehane/mscfb.png?branch=master)](https://travis-ci.org/richardlehane/mscfb) -------------------------------------------------------------------------------- /file.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Richard Lehane. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package mscfb 16 | 17 | import ( 18 | "encoding/binary" 19 | "io" 20 | "os" 21 | "time" 22 | "unicode" 23 | "unicode/utf16" 24 | 25 | "github.com/richardlehane/msoleps/types" 26 | ) 27 | 28 | // objectType types 29 | const ( 30 | unknown uint8 = 0x0 // this means unallocated - typically zeroed dir entries 31 | storage uint8 = 0x1 // this means dir 32 | stream uint8 = 0x2 // this means file 33 | rootStorage uint8 = 0x5 // this means root 34 | ) 35 | 36 | type directoryEntryFields struct { 37 | rawName [32]uint16 //64 bytes, unicode string encoded in UTF-16. If root, "Root Entry\0" w 38 | nameLength uint16 //2 bytes 39 | objectType uint8 //1 byte Must be one of the types specified above 40 | color uint8 //1 byte Must be 0x00 RED or 0x01 BLACK 41 | leftSibID uint32 //4 bytes, Dir? 
Stream ID of left sibling, if none set to NOSTREAM 42 | rightSibID uint32 //4 bytes, Dir? Stream ID of right sibling, if none set to NOSTREAM 43 | childID uint32 //4 bytes, Dir? Stream ID of child object, if none set to NOSTREAM 44 | clsid types.Guid // Contains an object class GUID (must be set to zeroes for stream object) 45 | stateBits [4]byte // user-defined flags for storage object 46 | create types.FileTime // Windows FILETIME structure 47 | modify types.FileTime // Windows FILETIME structure 48 | startingSectorLoc uint32 // if a stream object, first sector location. If root, first sector of ministream 49 | streamSize [8]byte // if a stream, size of user-defined data. If root, size of ministream 50 | } 51 | 52 | func makeDirEntry(b []byte) *directoryEntryFields { 53 | d := &directoryEntryFields{} 54 | for i := range d.rawName { 55 | d.rawName[i] = binary.LittleEndian.Uint16(b[i*2 : i*2+2]) 56 | } 57 | d.nameLength = binary.LittleEndian.Uint16(b[64:66]) 58 | d.objectType = uint8(b[66]) 59 | d.color = uint8(b[67]) 60 | d.leftSibID = binary.LittleEndian.Uint32(b[68:72]) 61 | d.rightSibID = binary.LittleEndian.Uint32(b[72:76]) 62 | d.childID = binary.LittleEndian.Uint32(b[76:80]) 63 | d.clsid = types.MustGuid(b[80:96]) 64 | copy(d.stateBits[:], b[96:100]) 65 | d.create = types.MustFileTime(b[100:108]) 66 | d.modify = types.MustFileTime(b[108:116]) 67 | d.startingSectorLoc = binary.LittleEndian.Uint32(b[116:120]) 68 | copy(d.streamSize[:], b[120:128]) 69 | return d 70 | } 71 | 72 | func (r *Reader) setDirEntries() error { 73 | c := 20 74 | if r.header.numDirectorySectors > 0 { 75 | c = int(r.header.numDirectorySectors) 76 | } 77 | de := make([]*File, 0, c) 78 | cycles := make(map[uint32]bool) 79 | num := int(r.sectorSize / 128) 80 | sn := r.header.directorySectorLoc 81 | for sn != endOfChain { 82 | buf, err := r.readAt(fileOffset(r.sectorSize, sn), int(r.sectorSize)) 83 | if err != nil { 84 | return Error{ErrRead, "directory entries read error (" + err.Error() + 
")", fileOffset(r.sectorSize, sn)} 85 | } 86 | for i := 0; i < num; i++ { 87 | f := &File{r: r} 88 | f.directoryEntryFields = makeDirEntry(buf[i*128:]) 89 | fixFile(r.header.majorVersion, f) 90 | f.curSector = f.startingSectorLoc 91 | de = append(de, f) 92 | } 93 | nsn, err := r.findNext(sn, false) 94 | if err != nil { 95 | return Error{ErrRead, "directory entries error finding sector (" + err.Error() + ")", int64(nsn)} 96 | } 97 | if nsn <= sn { 98 | if nsn == sn || cycles[nsn] { 99 | return Error{ErrRead, "directory entries sector cycle", int64(nsn)} 100 | } 101 | cycles[nsn] = true 102 | } 103 | sn = nsn 104 | } 105 | r.direntries = de 106 | return nil 107 | } 108 | 109 | func fixFile(v uint16, f *File) { 110 | fixName(f) 111 | if f.objectType != stream { 112 | return 113 | } 114 | // if the MSCFB major version is 4, then this can be a uint64 otherwise is a uint32 and the least signficant bits can contain junk 115 | if v > 3 { 116 | f.Size = int64(binary.LittleEndian.Uint64(f.streamSize[:])) 117 | } else { 118 | f.Size = int64(binary.LittleEndian.Uint32(f.streamSize[:4])) 119 | } 120 | } 121 | 122 | func fixName(f *File) { 123 | // From the spec: 124 | // "The length [name] MUST be a multiple of 2, and include the terminating null character in the count. 125 | // This length MUST NOT exceed 64, the maximum size of the Directory Entry Name field." 
126 | if f.nameLength < 4 || f.nameLength > 64 { 127 | return 128 | } 129 | nlen := int(f.nameLength/2 - 1) 130 | f.Initial = f.rawName[0] 131 | var slen int 132 | if !unicode.IsPrint(rune(f.Initial)) { 133 | slen = 1 134 | } 135 | f.Name = string(utf16.Decode(f.rawName[slen:nlen])) 136 | } 137 | 138 | func (r *Reader) traverse() error { 139 | r.File = make([]*File, 0, len(r.direntries)) 140 | var ( 141 | recurse func(int, []string) 142 | err error 143 | counter int 144 | ) 145 | recurse = func(i int, path []string) { 146 | // prevent cycles, number of recurse calls can't exceed number of directory entries 147 | counter++ 148 | if counter > len(r.direntries) { 149 | err = Error{ErrTraverse, "traversal counter overflow", int64(i)} 150 | return 151 | } 152 | if i < 0 || i >= len(r.direntries) { 153 | err = Error{ErrTraverse, "illegal traversal index", int64(i)} 154 | return 155 | } 156 | file := r.direntries[i] 157 | if file.leftSibID != noStream { 158 | recurse(int(file.leftSibID), path) 159 | } 160 | r.File = append(r.File, file) 161 | file.Path = path 162 | if file.childID != noStream { 163 | if i > 0 { 164 | recurse(int(file.childID), append(path, file.Name)) 165 | } else { 166 | recurse(int(file.childID), path) 167 | } 168 | } 169 | if file.rightSibID != noStream { 170 | recurse(int(file.rightSibID), path) 171 | } 172 | } 173 | recurse(0, []string{}) 174 | return err 175 | } 176 | 177 | // File represents a MSCFB directory entry 178 | type File struct { 179 | Name string // stream or directory name 180 | Initial uint16 // the first character in the name (identifies special streams such as MSOLEPS property sets) 181 | Path []string // file path 182 | Size int64 // size of stream 183 | i int64 // bytes read 184 | curSector uint32 // next sector for Read | Write 185 | rem int64 // offset in current sector remaining previous Read | Write 186 | *directoryEntryFields 187 | r *Reader 188 | } 189 | 190 | type fileInfo struct{ *File } 191 | 192 | func (fi fileInfo) 
Name() string { return fi.File.Name } 193 | func (fi fileInfo) Size() int64 { 194 | if fi.objectType != stream { 195 | return 0 196 | } 197 | return fi.File.Size 198 | } 199 | func (fi fileInfo) IsDir() bool { return fi.mode().IsDir() } 200 | func (fi fileInfo) ModTime() time.Time { return fi.Modified() } 201 | func (fi fileInfo) Mode() os.FileMode { return fi.File.mode() } 202 | func (fi fileInfo) Sys() interface{} { return nil } 203 | 204 | func (f *File) mode() os.FileMode { 205 | if f.objectType != stream { 206 | return os.ModeDir | 0777 207 | } 208 | return 0666 209 | } 210 | 211 | // FileInfo for this directory entry. Useful for IsDir() (whether a directory entry is a stream (file) or a storage object (dir)) 212 | func (f *File) FileInfo() os.FileInfo { 213 | return fileInfo{f} 214 | } 215 | 216 | // ID returns this directory entry's CLSID field 217 | func (f *File) ID() string { 218 | return f.clsid.String() 219 | } 220 | 221 | // Created returns this directory entry's created field 222 | func (f *File) Created() time.Time { 223 | return f.create.Time() 224 | } 225 | 226 | // Created returns this directory entry's modified field 227 | func (f *File) Modified() time.Time { 228 | return f.modify.Time() 229 | } 230 | 231 | // Read this directory entry 232 | // Returns 0, io.EOF if no stream is available (i.e. 
for a storage object) 233 | func (f *File) Read(b []byte) (int, error) { 234 | if f.Size < 1 || f.i >= f.Size { 235 | return 0, io.EOF 236 | } 237 | sz := len(b) 238 | if int64(sz) > f.Size-f.i { 239 | sz = int(f.Size - f.i) 240 | } 241 | // get sectors and lengths for reads 242 | str, err := f.stream(sz) 243 | if err != nil { 244 | return 0, err 245 | } 246 | // now read 247 | var idx, i int 248 | for _, v := range str { 249 | jdx := idx + int(v[1]) 250 | if jdx < idx || jdx > sz { 251 | return 0, Error{ErrRead, "bad read length", int64(jdx)} 252 | } 253 | j, err := f.r.ra.ReadAt(b[idx:jdx], v[0]) 254 | i = i + j 255 | if err != nil { 256 | f.i += int64(i) 257 | return i, Error{ErrRead, "underlying reader fail (" + err.Error() + ")", int64(idx)} 258 | } 259 | idx = jdx 260 | } 261 | f.i += int64(i) 262 | if i != sz { 263 | err = Error{ErrRead, "bytes read do not match expected read size", int64(i)} 264 | } else if i < len(b) { 265 | err = io.EOF 266 | } 267 | return i, err 268 | } 269 | 270 | // Write to this directory entry 271 | // Depends on the io.ReaderAt supplied to mscfb.New() being a WriterAt too 272 | // Returns 0, io.EOF if no stream is available (i.e. 
for a storage object) 273 | func (f *File) Write(b []byte) (int, error) { 274 | if f.Size < 1 || f.i >= f.Size { 275 | return 0, io.EOF 276 | } 277 | if f.r.wa == nil { 278 | wa, ok := f.r.ra.(io.WriterAt) 279 | if !ok { 280 | return 0, Error{ErrWrite, "mscfb.New must be given ReaderAt convertible to a io.WriterAt in order to write", 0} 281 | } 282 | f.r.wa = wa 283 | } 284 | sz := len(b) 285 | if int64(sz) > f.Size-f.i { 286 | sz = int(f.Size - f.i) 287 | } 288 | // get sectors and lengths for writes 289 | str, err := f.stream(sz) 290 | if err != nil { 291 | return 0, err 292 | } 293 | // now read 294 | var idx, i int 295 | for _, v := range str { 296 | jdx := idx + int(v[1]) 297 | if jdx < idx || jdx > sz { 298 | return 0, Error{ErrWrite, "bad write length", int64(jdx)} 299 | } 300 | j, err := f.r.wa.WriteAt(b[idx:jdx], v[0]) 301 | i = i + j 302 | if err != nil { 303 | f.i += int64(i) 304 | return i, Error{ErrWrite, "underlying writer fail (" + err.Error() + ")", int64(idx)} 305 | } 306 | idx = jdx 307 | } 308 | f.i += int64(i) 309 | if i != sz { 310 | err = Error{ErrWrite, "bytes written do not match expected write size", int64(i)} 311 | } else if i < len(b) { 312 | err = io.EOF 313 | } 314 | return i, err 315 | } 316 | 317 | // ReadAt reads p bytes at offset off from start of file. Does not affect seek place for other reads/writes. 318 | func (f *File) ReadAt(p []byte, off int64) (n int, err error) { 319 | // memorize place 320 | mi, mrem, mcur := f.i, f.rem, f.curSector 321 | _, err = f.Seek(off, 0) 322 | if err == nil { 323 | n, err = f.Read(p) 324 | } 325 | f.i, f.rem, f.curSector = mi, mrem, mcur 326 | return n, err 327 | } 328 | 329 | // WriteAt reads p bytes at offset off from start of file. Does not affect seek place for other reads/writes. 
330 | func (f *File) WriteAt(p []byte, off int64) (n int, err error) { 331 | // memorize place 332 | mi, mrem, mcur := f.i, f.rem, f.curSector 333 | _, err = f.Seek(off, 0) 334 | if err == nil { 335 | n, err = f.Write(p) 336 | } 337 | f.i, f.rem, f.curSector = mi, mrem, mcur 338 | return n, err 339 | } 340 | 341 | // Seek sets the offset for the next Read or Write to offset, interpreted according to whence: 0 means relative to the 342 | // start of the file, 1 means relative to the current offset, and 2 means relative to the end. Seek returns the new 343 | // offset relative to the start of the file and an error, if any. 344 | func (f *File) Seek(offset int64, whence int) (int64, error) { 345 | var abs int64 346 | switch whence { 347 | default: 348 | return 0, Error{ErrSeek, "invalid whence", int64(whence)} 349 | case 0: 350 | abs = offset 351 | case 1: 352 | abs = f.i + offset 353 | case 2: 354 | abs = f.Size - offset 355 | } 356 | switch { 357 | case abs < 0: 358 | return f.i, Error{ErrSeek, "can't seek before start of File", abs} 359 | case abs >= f.Size: 360 | return f.i, Error{ErrSeek, "can't seek past File length", abs} 361 | case abs == f.i: 362 | return abs, nil 363 | case abs > f.i: 364 | t := f.i 365 | f.i = abs 366 | return f.i, f.seek(abs - t) 367 | } 368 | if f.rem >= f.i-abs { 369 | f.rem = f.rem - (f.i - abs) 370 | f.i = abs 371 | return f.i, nil 372 | } 373 | f.rem = 0 374 | f.curSector = f.startingSectorLoc 375 | f.i = abs 376 | return f.i, f.seek(abs) 377 | } 378 | 379 | func (f *File) seek(sz int64) error { 380 | // calculate ministream and sector size 381 | var mini bool 382 | var ss int64 383 | if f.Size < miniStreamCutoffSize { 384 | mini = true 385 | ss = 64 386 | } else { 387 | ss = int64(f.r.sectorSize) 388 | } 389 | 390 | var j int64 391 | var err error 392 | // if we have a remainder in the current sector, use it first 393 | if f.rem > 0 { 394 | if ss-f.rem <= sz { 395 | f.curSector, err = f.r.findNext(f.curSector, mini) 396 | if err != 
nil { 397 | return err 398 | } 399 | j += ss - f.rem 400 | f.rem = 0 401 | if j == sz { 402 | return nil 403 | } 404 | } else { 405 | f.rem += sz 406 | return nil 407 | } 408 | if f.curSector == endOfChain { 409 | return Error{ErrRead, "unexpected early end of chain", int64(f.curSector)} 410 | } 411 | } 412 | 413 | for { 414 | // check if we are at the last sector 415 | if sz-j < ss { 416 | f.rem = sz - j 417 | return nil 418 | } else { 419 | j += ss 420 | f.curSector, err = f.r.findNext(f.curSector, mini) 421 | if err != nil { 422 | return err 423 | } 424 | // we might be at the last sector if there is no remainder, if so can return 425 | if j == sz { 426 | return nil 427 | } 428 | } 429 | } 430 | } 431 | 432 | // return offsets and lengths for read or write 433 | func (f *File) stream(sz int) ([][2]int64, error) { 434 | // calculate ministream, cap for sector slice, and sector size 435 | var mini bool 436 | var l int 437 | var ss int64 438 | if f.Size < miniStreamCutoffSize { 439 | mini = true 440 | l = sz/64 + 2 441 | ss = 64 442 | } else { 443 | l = sz/int(f.r.sectorSize) + 2 444 | ss = int64(f.r.sectorSize) 445 | } 446 | 447 | sectors := make([][2]int64, 0, l) 448 | var i, j int 449 | 450 | // if we have a remainder from a previous read, use it first 451 | if f.rem > 0 { 452 | offset, err := f.r.getOffset(f.curSector, mini) 453 | if err != nil { 454 | return nil, err 455 | } 456 | if ss-f.rem >= int64(sz) { 457 | sectors = append(sectors, [2]int64{offset + f.rem, int64(sz)}) 458 | } else { 459 | sectors = append(sectors, [2]int64{offset + f.rem, ss - f.rem}) 460 | } 461 | if ss-f.rem <= int64(sz) { 462 | f.curSector, err = f.r.findNext(f.curSector, mini) 463 | if err != nil { 464 | return nil, err 465 | } 466 | j += int(ss - f.rem) 467 | f.rem = 0 468 | } else { 469 | f.rem += int64(sz) 470 | } 471 | if sectors[0][1] == int64(sz) { 472 | return sectors, nil 473 | } 474 | if f.curSector == endOfChain { 475 | return nil, Error{ErrRead, "unexpected early end of 
chain", int64(f.curSector)} 476 | } 477 | i++ 478 | } 479 | 480 | for { 481 | // emergency brake! 482 | if i >= cap(sectors) { 483 | return nil, Error{ErrRead, "index overruns sector length", int64(i)} 484 | } 485 | // grab the next offset 486 | offset, err := f.r.getOffset(f.curSector, mini) 487 | if err != nil { 488 | return nil, err 489 | } 490 | // check if we are at the last sector 491 | if sz-j < int(ss) { 492 | sectors = append(sectors, [2]int64{offset, int64(sz - j)}) 493 | f.rem = int64(sz - j) 494 | return compressChain(sectors), nil 495 | } else { 496 | sectors = append(sectors, [2]int64{offset, ss}) 497 | j += int(ss) 498 | f.curSector, err = f.r.findNext(f.curSector, mini) 499 | if err != nil { 500 | return nil, err 501 | } 502 | // we might be at the last sector if there is no remainder, if so can return 503 | if j == sz { 504 | return compressChain(sectors), nil 505 | } 506 | } 507 | i++ 508 | } 509 | } 510 | 511 | func compressChain(locs [][2]int64) [][2]int64 { 512 | l := len(locs) 513 | for i, x := 0, 0; i < l && x+1 < len(locs); i++ { 514 | if locs[x][0]+locs[x][1] == locs[x+1][0] { 515 | locs[x][1] = locs[x][1] + locs[x+1][1] 516 | for j := range locs[x+1 : len(locs)-1] { 517 | locs[x+1+j] = locs[j+x+2] 518 | } 519 | locs = locs[:len(locs)-1] 520 | } else { 521 | x += 1 522 | } 523 | } 524 | return locs 525 | } 526 | -------------------------------------------------------------------------------- /file_test.go: -------------------------------------------------------------------------------- 1 | package mscfb 2 | 3 | import "testing" 4 | 5 | func equal(a [][2]int64, b [][2]int64) bool { 6 | if len(a) != len(b) { 7 | return false 8 | } 9 | for i, v := range a { 10 | if v[0] != b[i][0] || v[1] != b[i][1] { 11 | return false 12 | } 13 | } 14 | return true 15 | } 16 | 17 | func TestCompress(t *testing.T) { 18 | a := [][2]int64{{4608, 1024}, {5632, 1024}, {6656, 1024}, {7680, 1024}, {8704, 1024}, {9728, 1024}, {10752, 512}} 19 | ar := 
[][2]int64{{4608, 6656}} 20 | a = compressChain(a) 21 | if !equal(a, ar) { 22 | t.Errorf("Streams compress fail; Expecting: %v, Got: %v", ar, a) 23 | } 24 | b := [][2]int64{{4608, 1024}, {6656, 1024}, {7680, 1024}, {8704, 1024}, {10752, 512}} 25 | br := [][2]int64{{4608, 1024}, {6656, 3072}, {10752, 512}} 26 | b = compressChain(b) 27 | if !equal(b, br) { 28 | t.Errorf("Streams compress fail; Expecting: %v, Got: %v", br, b) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /fuzz.go: -------------------------------------------------------------------------------- 1 | //go:build gofuzz 2 | // +build gofuzz 3 | 4 | // fuzzing with https://github.com/dvyukov/go-fuzz 5 | package mscfb 6 | 7 | import ( 8 | "bytes" 9 | "io" 10 | ) 11 | 12 | // todo: replace with Fuzzing from go test package 13 | func Fuzz(data []byte) int { 14 | doc, err := New(bytes.NewReader(data)) 15 | if err != nil { 16 | if doc != nil { 17 | panic("doc != nil on error " + err.Error()) 18 | } 19 | return 0 20 | } 21 | buf := &bytes.Buffer{} 22 | for entry, err := doc.Next(); ; entry, err = doc.Next() { 23 | if err != nil { 24 | if err == io.EOF { 25 | return 1 26 | } 27 | if entry != nil { 28 | panic("entry != nil on error " + err.Error()) 29 | } 30 | } 31 | buf.Reset() 32 | buf.ReadFrom(entry) 33 | } 34 | return 1 35 | } 36 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/richardlehane/mscfb 2 | 3 | go 1.18 4 | 5 | require github.com/richardlehane/msoleps v1.0.3 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/richardlehane/msoleps v1.0.3 h1:aznSZzrwYRl3rLKRT3gUk9am7T/mLNSnJINvN0AQoVM= 2 | github.com/richardlehane/msoleps v1.0.3/go.mod 
h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg= 3 | -------------------------------------------------------------------------------- /mscfb.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Richard Lehane. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package mscfb implements a reader for Microsoft's Compound File Binary File Format (http://msdn.microsoft.com/en-us/library/dd942138.aspx). 16 | // 17 | // The Compound File Binary File Format is also known as the Object Linking and Embedding (OLE) or Component Object Model (COM) format and was used by many 18 | // early MS software such as MS Office. 
// fileOffset converts a sector number sn to an absolute byte offset within
// the compound file, given the sector size ss in bytes. The first
// sector-sized slot of the file holds the header, so sector 0 begins one
// sector in — hence the +1.
//
// The arithmetic is widened to int64 before multiplying: the previous
// int64((sn + 1) * ss) form performed the multiplication in uint32 and
// silently wrapped for sector numbers addressing data beyond 4 GiB / ss.
func fileOffset(ss, sn uint32) int64 {
	return (int64(sn) + 1) * int64(ss)
}
This field specifies the sector size of the compound file as a power of 2. If Major Version is 3, then the Sector Shift MUST be 0x0009, specifying a sector size of 512 bytes. If Major Version is 4, then the Sector Shift MUST be 0x000C, specifying a sector size of 4096 bytes. 75 | _ [2]byte // ministream sector size - ignore, must be 64 bytes 76 | _ [6]byte // reserved - ignore, not used 77 | numDirectorySectors uint32 //This integer field contains the count of the number of directory sectors in the compound file. If Major Version is 3, then the Number of Directory Sectors MUST be zero. This field is not supported for version 3 compound files. 78 | numFatSectors uint32 //This integer field contains the count of the number of FAT sectors in the compound file. 79 | directorySectorLoc uint32 //This integer field contains the starting sector number for the directory stream. 80 | _ [4]byte // transaction - ignore, not used 81 | _ [4]byte // mini stream size cutooff - ignore, must be 4096 bytes 82 | miniFatSectorLoc uint32 //This integer field contains the starting sector number for the mini FAT. 83 | numMiniFatSectors uint32 //This integer field contains the count of the number of mini FAT sectors in the compound file. 84 | difatSectorLoc uint32 //This integer field contains the starting sector number for the DIFAT. 85 | numDifatSectors uint32 //This integer field contains the count of the number of DIFAT sectors in the compound file. 
86 | initialDifats [109]uint32 //The first 109 difat sectors are included in the header 87 | } 88 | 89 | func makeHeader(b []byte) *headerFields { 90 | h := &headerFields{} 91 | h.signature = binary.LittleEndian.Uint64(b[:8]) 92 | h.minorVersion = binary.LittleEndian.Uint16(b[24:26]) 93 | h.majorVersion = binary.LittleEndian.Uint16(b[26:28]) 94 | h.sectorSize = binary.LittleEndian.Uint16(b[30:32]) 95 | h.numDirectorySectors = binary.LittleEndian.Uint32(b[40:44]) 96 | h.numFatSectors = binary.LittleEndian.Uint32(b[44:48]) 97 | h.directorySectorLoc = binary.LittleEndian.Uint32(b[48:52]) 98 | h.miniFatSectorLoc = binary.LittleEndian.Uint32(b[60:64]) 99 | h.numMiniFatSectors = binary.LittleEndian.Uint32(b[64:68]) 100 | h.difatSectorLoc = binary.LittleEndian.Uint32(b[68:72]) 101 | h.numDifatSectors = binary.LittleEndian.Uint32(b[72:76]) 102 | var idx int 103 | for i := 76; i < 512; i = i + 4 { 104 | h.initialDifats[idx] = binary.LittleEndian.Uint32(b[i : i+4]) 105 | idx++ 106 | } 107 | return h 108 | } 109 | 110 | type header struct { 111 | *headerFields 112 | difats []uint32 113 | miniFatLocs []uint32 114 | miniStreamLocs []uint32 // chain of sectors containing the ministream 115 | } 116 | 117 | func (r *Reader) setHeader() error { 118 | buf, err := r.readAt(0, lenHeader) 119 | if err != nil { 120 | return err 121 | } 122 | r.header = &header{headerFields: makeHeader(buf)} 123 | // sanity check - check signature 124 | if r.header.signature != signature { 125 | return Error{ErrFormat, "bad signature", int64(r.header.signature)} 126 | } 127 | // check for legal sector size 128 | if r.header.sectorSize == 0x0009 || r.header.sectorSize == 0x000c { 129 | r.sectorSize = uint32(1 << r.header.sectorSize) 130 | } else { 131 | return Error{ErrFormat, "illegal sector size", int64(r.header.sectorSize)} 132 | } 133 | // check for DIFAT overflow 134 | if r.header.numDifatSectors > 0 { 135 | sz := (r.sectorSize / 4) - 1 136 | if int(r.header.numDifatSectors*sz+109) < 0 { 137 | return 
Error{ErrFormat, "DIFAT int overflow", int64(r.header.numDifatSectors)} 138 | } 139 | if r.header.numDifatSectors*sz+109 > r.header.numFatSectors+sz { 140 | return Error{ErrFormat, "num DIFATs exceeds FAT sectors", int64(r.header.numDifatSectors)} 141 | } 142 | } 143 | // check for mini FAT overflow 144 | if r.header.numMiniFatSectors > 0 { 145 | if int(r.sectorSize/4*r.header.numMiniFatSectors) < 0 { 146 | return Error{ErrFormat, "mini FAT int overflow", int64(r.header.numMiniFatSectors)} 147 | } 148 | if r.header.numMiniFatSectors > r.header.numFatSectors*(r.sectorSize/miniStreamSectorSize) { 149 | return Error{ErrFormat, "num mini FATs exceeds FAT sectors", int64(r.header.numFatSectors)} 150 | } 151 | } 152 | return nil 153 | } 154 | 155 | func (r *Reader) setDifats() error { 156 | r.header.difats = r.header.initialDifats[:] 157 | // return early if no extra DIFAT sectors 158 | if r.header.numDifatSectors == 0 { 159 | return nil 160 | } 161 | sz := (r.sectorSize / 4) - 1 162 | n := make([]uint32, 109, r.header.numDifatSectors*sz+109) 163 | copy(n, r.header.difats) 164 | r.header.difats = n 165 | off := r.header.difatSectorLoc 166 | for i := 0; i < int(r.header.numDifatSectors); i++ { 167 | buf, err := r.readAt(fileOffset(r.sectorSize, off), int(r.sectorSize)) 168 | if err != nil { 169 | return Error{ErrFormat, "error setting DIFAT(" + err.Error() + ")", int64(off)} 170 | } 171 | for j := 0; j < int(sz); j++ { 172 | r.header.difats = append(r.header.difats, binary.LittleEndian.Uint32(buf[j*4:j*4+4])) 173 | } 174 | off = binary.LittleEndian.Uint32(buf[len(buf)-4:]) 175 | } 176 | return nil 177 | } 178 | 179 | // set the ministream FAT and sector slices in the header 180 | func (r *Reader) setMiniStream() error { 181 | // do nothing if there is no ministream 182 | if r.direntries[0].startingSectorLoc == endOfChain || r.header.miniFatSectorLoc == endOfChain || r.header.numMiniFatSectors == 0 { 183 | return nil 184 | } 185 | // build a slice of minifat sectors (akin 
to the DIFAT slice) 186 | c := int(r.header.numMiniFatSectors) 187 | r.header.miniFatLocs = make([]uint32, c) 188 | r.header.miniFatLocs[0] = r.header.miniFatSectorLoc 189 | for i := 1; i < c; i++ { 190 | loc, err := r.findNext(r.header.miniFatLocs[i-1], false) 191 | if err != nil { 192 | return Error{ErrFormat, "setting mini stream (" + err.Error() + ")", int64(r.header.miniFatLocs[i-1])} 193 | } 194 | r.header.miniFatLocs[i] = loc 195 | } 196 | // build a slice of ministream sectors 197 | c = int(r.sectorSize / 4 * r.header.numMiniFatSectors) 198 | r.header.miniStreamLocs = make([]uint32, 0, c) 199 | cycles := make(map[uint32]bool) 200 | sn := r.direntries[0].startingSectorLoc 201 | for sn != endOfChain { 202 | r.header.miniStreamLocs = append(r.header.miniStreamLocs, sn) 203 | nsn, err := r.findNext(sn, false) 204 | if err != nil { 205 | return Error{ErrFormat, "setting mini stream (" + err.Error() + ")", int64(sn)} 206 | } 207 | if nsn <= sn { 208 | if nsn == sn || cycles[nsn] { 209 | return Error{ErrRead, "cycle detected in mini stream", int64(nsn)} 210 | } 211 | cycles[nsn] = true 212 | } 213 | sn = nsn 214 | } 215 | return nil 216 | } 217 | 218 | func (r *Reader) readAt(offset int64, length int) ([]byte, error) { 219 | if r.slicer { 220 | b, err := r.ra.(slicer).Slice(offset, length) 221 | if err != nil { 222 | return nil, Error{ErrRead, "slicer read error (" + err.Error() + ")", offset} 223 | } 224 | return b, nil 225 | } 226 | if length > len(r.buf) { 227 | return nil, Error{ErrRead, "read length greater than read buffer", int64(length)} 228 | } 229 | if _, err := r.ra.ReadAt(r.buf[:length], offset); err != nil { 230 | return nil, Error{ErrRead, err.Error(), offset} 231 | } 232 | return r.buf[:length], nil 233 | } 234 | 235 | func (r *Reader) getOffset(sn uint32, mini bool) (int64, error) { 236 | if mini { 237 | num := r.sectorSize / 64 238 | sec := int(sn / num) 239 | if sec >= len(r.header.miniStreamLocs) { 240 | return 0, Error{ErrRead, "minisector 
number is outside minisector range", int64(sec)} 241 | } 242 | dif := sn % num 243 | return int64((r.header.miniStreamLocs[sec]+1)*r.sectorSize + dif*64), nil 244 | } 245 | return fileOffset(r.sectorSize, sn), nil 246 | } 247 | 248 | // check the FAT sector for the next sector in a chain 249 | func (r *Reader) findNext(sn uint32, mini bool) (uint32, error) { 250 | entries := r.sectorSize / 4 251 | index := int(sn / entries) // find position in DIFAT or minifat array 252 | var sect uint32 253 | if mini { 254 | if index < 0 || index >= len(r.header.miniFatLocs) { 255 | return 0, Error{ErrRead, "minisector index is outside miniFAT range", int64(index)} 256 | } 257 | sect = r.header.miniFatLocs[index] 258 | } else { 259 | if index < 0 || index >= len(r.header.difats) { 260 | return 0, Error{ErrRead, "FAT index is outside DIFAT range", int64(index)} 261 | } 262 | sect = r.header.difats[index] 263 | } 264 | fatIndex := sn % entries // find position within FAT or MiniFAT sector 265 | offset := fileOffset(r.sectorSize, sect) + int64(fatIndex*4) 266 | buf := make([]byte, 4) 267 | _, err := r.ra.ReadAt(buf, offset) 268 | if err != nil { 269 | return 0, Error{ErrRead, "bad read finding next sector (" + err.Error() + ")", offset} 270 | } 271 | return binary.LittleEndian.Uint32(buf), nil 272 | } 273 | 274 | // Reader provides sequential access to the contents of a MS compound file (MSCFB) 275 | type Reader struct { 276 | slicer bool 277 | sectorSize uint32 278 | buf []byte 279 | header *header 280 | File []*File // File is an ordered slice of final directory entries. 
281 | direntries []*File // unordered raw directory entries 282 | entry int 283 | 284 | ra io.ReaderAt 285 | wa io.WriterAt 286 | } 287 | 288 | // New returns a MSCFB reader 289 | func New(ra io.ReaderAt) (*Reader, error) { 290 | r := &Reader{ra: ra} 291 | if _, ok := ra.(slicer); ok { 292 | r.slicer = true 293 | } else { 294 | r.buf = make([]byte, lenHeader) 295 | } 296 | if err := r.setHeader(); err != nil { 297 | return nil, err 298 | } 299 | // resize the buffer to 4096 if sector size isn't 512 300 | if !r.slicer && int(r.sectorSize) > len(r.buf) { 301 | r.buf = make([]byte, r.sectorSize) 302 | } 303 | if err := r.setDifats(); err != nil { 304 | return nil, err 305 | } 306 | if err := r.setDirEntries(); err != nil { 307 | return nil, err 308 | } 309 | if err := r.setMiniStream(); err != nil { 310 | return nil, err 311 | } 312 | if err := r.traverse(); err != nil { 313 | return nil, err 314 | } 315 | return r, nil 316 | } 317 | 318 | // ID returns the CLSID (class ID) field from the root directory entry 319 | func (r *Reader) ID() string { 320 | return r.File[0].ID() 321 | } 322 | 323 | // Created returns the created field from the root directory entry 324 | func (r *Reader) Created() time.Time { 325 | return r.File[0].Created() 326 | } 327 | 328 | // Modified returns the last modified field from the root directory entry 329 | func (r *Reader) Modified() time.Time { 330 | return r.File[0].Modified() 331 | } 332 | 333 | // Next iterates to the next directory entry. 334 | // This isn't necessarily an adjacent *File within the File slice, but is based on the Left Sibling, Right Sibling and Child information in directory entries. 
// Debug provides granular information from an mscfb file to assist with debugging
//
// The returned map holds the sector size, the mini FAT and mini stream
// sector locations, the directory start sector, and a start-sector/size pair
// for the root entry and for each directory entry visited.
//
// NOTE(review): this iterates via r.Next(), so it consumes the Reader's
// internal entry cursor — entries before the current position are skipped
// and the Reader is left at EOF afterwards.
func (r *Reader) Debug() map[string][]uint32 {
	ret := map[string][]uint32{
		"sector size":            {r.sectorSize},
		"mini fat locs":          r.header.miniFatLocs,
		"mini stream locs":       r.header.miniStreamLocs,
		"directory sector":       {r.header.directorySectorLoc},
		"mini stream start/size": {r.File[0].startingSectorLoc, binary.LittleEndian.Uint32(r.File[0].streamSize[:])},
	}
	// walk the remaining entries, recording raw start sector and stream size
	for f, err := r.Next(); err == nil; f, err = r.Next() {
		ret[f.Name+" start/size"] = []uint32{f.startingSectorLoc, binary.LittleEndian.Uint32(f.streamSize[:])}
	}
	return ret
}
397 | Slice(offset int64, length int) ([]byte, error) 398 | } 399 | -------------------------------------------------------------------------------- /mscfb_test.go: -------------------------------------------------------------------------------- 1 | package mscfb 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "log" 7 | "os" 8 | "sync" 9 | "testing" 10 | ) 11 | 12 | var ( 13 | novPapPlan = "test/novpapplan.doc" 14 | testDoc = "test/test.doc" 15 | testXls = "test/test.xls" 16 | testPpt = "test/test.ppt" 17 | testMsg = "test/test.msg" 18 | testEntries = []*File{ 19 | {Name: "Root Node", 20 | directoryEntryFields: &directoryEntryFields{leftSibID: noStream, rightSibID: noStream, childID: 1}, 21 | }, 22 | {Name: "Alpha", 23 | directoryEntryFields: &directoryEntryFields{leftSibID: noStream, rightSibID: 2, childID: noStream}, 24 | }, 25 | {Name: "Bravo", 26 | directoryEntryFields: &directoryEntryFields{leftSibID: noStream, rightSibID: 3, childID: 5}, 27 | }, 28 | {Name: "Charlie", 29 | directoryEntryFields: &directoryEntryFields{leftSibID: noStream, rightSibID: noStream, childID: 7}, 30 | }, 31 | {Name: "Delta", 32 | directoryEntryFields: &directoryEntryFields{leftSibID: noStream, rightSibID: noStream, childID: noStream}, 33 | }, 34 | {Name: "Echo", 35 | directoryEntryFields: &directoryEntryFields{leftSibID: 4, rightSibID: 6, childID: 9}, 36 | }, 37 | {Name: "Foxtrot", 38 | directoryEntryFields: &directoryEntryFields{leftSibID: noStream, rightSibID: noStream, childID: noStream}, 39 | }, 40 | {Name: "Golf", 41 | directoryEntryFields: &directoryEntryFields{leftSibID: noStream, rightSibID: noStream, childID: 10}, 42 | }, 43 | {Name: "Hotel", 44 | directoryEntryFields: &directoryEntryFields{leftSibID: noStream, rightSibID: noStream, childID: noStream}, 45 | }, 46 | {Name: "Indigo", 47 | directoryEntryFields: &directoryEntryFields{leftSibID: 8, rightSibID: noStream, childID: 11}, 48 | }, 49 | {Name: "Jello", 50 | directoryEntryFields: &directoryEntryFields{leftSibID: noStream, 
rightSibID: noStream, childID: noStream}, 51 | }, 52 | {Name: "Kilo", 53 | directoryEntryFields: &directoryEntryFields{leftSibID: noStream, rightSibID: noStream, childID: noStream}, 54 | }, 55 | } 56 | ) 57 | 58 | func testFile(t *testing.T, path string) { 59 | file, _ := os.Open(path) 60 | defer file.Close() 61 | doc, err := New(file) 62 | if err != nil { 63 | t.Fatalf("Error opening file; Returns error: %v", err) 64 | } 65 | if len(doc.File) < 3 { 66 | t.Fatalf("Expecting several directory entries, only got %d", len(doc.File)) 67 | } 68 | buf := make([]byte, 512) 69 | for entry, _ := doc.Next(); entry != nil; entry, _ = doc.Next() { 70 | _, err := doc.Read(buf) 71 | if err != nil && err != io.EOF { 72 | t.Errorf("Error reading entry name, %v", entry.Name) 73 | } 74 | if len(entry.Name) < 1 { 75 | t.Errorf("Error reading entry name") 76 | } 77 | } 78 | } 79 | 80 | func TestTraverse(t *testing.T) { 81 | r := new(Reader) 82 | r.direntries = testEntries 83 | if r.traverse() != nil { 84 | t.Error("Error traversing") 85 | } 86 | expect := []int{0, 1, 2, 4, 5, 8, 9, 11, 6, 3, 7, 10} 87 | if len(r.File) != len(expect) { 88 | t.Fatalf("Error traversing: expecting %d entries, got %d", len(expect), len(r.File)) 89 | } 90 | for i, v := range r.File { 91 | if v != testEntries[expect[i]] { 92 | t.Errorf("Error traversing: expecting %d at index %d; got %v", expect[i], i, v) 93 | } 94 | } 95 | if len(r.File[len(r.File)-1].Path) != 2 { 96 | t.Fatalf("Error traversing: expecting a path length of %d, got %d", 2, len(r.File[len(r.File)-1].Path)) 97 | } 98 | if r.File[len(r.File)-1].Path[0] != "Charlie" { 99 | t.Errorf("Error traversing: expecting Charlie got %s", r.File[expect[10]].Path[0]) 100 | } 101 | if r.File[len(r.File)-1].Path[1] != "Golf" { 102 | t.Errorf("Error traversing: expecting Golf got %s", r.File[expect[10]].Path[1]) 103 | } 104 | } 105 | 106 | func TestNovPapPlan(t *testing.T) { 107 | testFile(t, novPapPlan) 108 | } 109 | 110 | func TestWord(t *testing.T) { 111 | 
testFile(t, testDoc) 112 | } 113 | 114 | func TestMsg(t *testing.T) { 115 | testFile(t, testMsg) 116 | } 117 | 118 | func TestPpt(t *testing.T) { 119 | testFile(t, testPpt) 120 | } 121 | 122 | func TestXls(t *testing.T) { 123 | testFile(t, testXls) 124 | } 125 | 126 | func TestConcurrentAccess(t *testing.T) { 127 | file, _ := os.Open(testXls) 128 | defer file.Close() 129 | doc, err := New(file) 130 | if err != nil { 131 | log.Fatal(err) 132 | } 133 | var wg sync.WaitGroup 134 | wg.Add(len(doc.File)) 135 | for _, f := range doc.File { 136 | go func(ff *File) { 137 | defer wg.Done() 138 | _, err := io.Copy(io.Discard, ff) 139 | if err != nil { 140 | log.Println(err) 141 | } 142 | }(f) 143 | } 144 | wg.Wait() 145 | } 146 | 147 | func TestSeek(t *testing.T) { 148 | file, _ := os.Open(testXls) 149 | defer file.Close() 150 | doc, _ := New(file) 151 | // the third entry in the XLS file is 2719 bytes 152 | f := doc.File[3] 153 | if f.Size != 2719 { 154 | t.Fatalf("Expecting the third entry of the XLS file to be 2719 bytes long; it is %d", f.Size) 155 | } 156 | buf := make([]byte, 2719) 157 | i, err := f.Read(buf) 158 | if i != 2719 || err != nil { 159 | t.Fatalf("Expecting 2719 length and no error; got %d and %v", i, err) 160 | } 161 | s, err := f.Seek(50, 1) 162 | if s != 2719 || err == nil { 163 | t.Fatalf("%v, %d", err, s) 164 | } 165 | s, err = f.Seek(1500, 0) 166 | if s != 1500 || err != nil { 167 | t.Fatalf("Seek error: %v, %d", err, s) 168 | } 169 | nbuf := make([]byte, 475) 170 | i, err = f.Read(nbuf) 171 | if i != 475 || err != nil { 172 | t.Fatalf("Expecting 475 length and no error; got %d and %v", i, err) 173 | } 174 | if !bytes.Equal(buf[1500:1975], nbuf) { 175 | t.Fatalf("Slices not equal: %s, %s", string(buf[1500:1975]), string(nbuf)) 176 | } 177 | s, err = f.Seek(5, 1) 178 | if s != 1980 || err != nil { 179 | t.Fatalf("Seek error: %v, %d", err, s) 180 | } 181 | i, err = f.Read(nbuf[:5]) 182 | if i != 5 || err != nil { 183 | t.Fatalf("Expecting 5 length, and 
// TestWrite exercises the read/write path against the third entry of the XLS
// fixture: four bytes at stream offset 30 are overwritten via Seek+Write and
// again via WriteAt, verified by reading back, and restored each time so the
// fixture on disk is left unchanged.
//
// NOTE(review): this test mutates test/test.xls in place; a failure midway
// can leave the fixture corrupted for later runs.
func TestWrite(t *testing.T) {
	file, err := os.OpenFile(testXls, os.O_RDWR, 0666)
	if err != nil {
		t.Fatalf("error opening file for read/write %v", err)
	}
	defer file.Close()
	doc, err := New(file)
	if err != nil {
		t.Fatalf("Error opening file; Returns error: %v", err)
	}
	// the third entry in the XLS file is 2719 bytes
	f := doc.File[3]
	// save the original four bytes at offset 30 so they can be restored
	s, err := f.Seek(30, 0)
	if s != 30 || err != nil {
		t.Fatalf("Seek error: %v, %d", err, s)
	}
	orig := make([]byte, 4)
	i, err := f.Read(orig)
	if i != 4 || err != nil {
		t.Fatalf("Expecting read length 4, and no error, got %d %v", i, err)
	}
	// overwrite via Seek+Write and confirm the new bytes read back
	s, err = f.Seek(30, 0)
	if s != 30 || err != nil {
		t.Fatalf("Seek error: %v, %d", err, s)
	}
	i, err = f.Write([]byte("test"))
	if i != 4 || err != nil {
		t.Errorf("error writing, got %d %v", i, err)
	}
	s, err = f.Seek(30, 0)
	if s != 30 || err != nil {
		t.Fatalf("Seek error: %v, %d", err, s)
	}
	res := make([]byte, 4)
	i, err = f.Read(res)
	if i != 4 || err != nil {
		t.Errorf("error reading, got %d %v", i, err)
	}
	if string(res) != "test" {
		t.Errorf("expecting test, got %s", string(res))
	}
	// restore the original bytes and confirm the round trip
	s, err = f.Seek(30, 0)
	if s != 30 || err != nil {
		t.Fatalf("Seek error: %v, %d", err, s)
	}
	i, err = f.Write(orig)
	if i != 4 || err != nil {
		t.Errorf("error writing, got %d %v", i, err)
	}
	s, err = f.Seek(30, 0)
	if s != 30 || err != nil {
		t.Fatalf("Seek error: %v, %d", err, s)
	}
	i, err = f.Read(res)
	if i != 4 || err != nil {
		t.Errorf("error reading, got %d %v", i, err)
	}
	if string(res) != string(orig) {
		t.Errorf("bad result, expected %s, got %s", string(orig), string(res))
	}
	// repeat the overwrite/restore cycle through WriteAt/ReadAt, which do
	// not move the stream cursor
	i, err = f.WriteAt([]byte("test"), 30)
	if i != 4 || err != nil {
		t.Errorf("error writing, got %d %v", i, err)
	}
	i, err = f.ReadAt(res, 30)
	if i != 4 || err != nil {
		t.Errorf("error reading, got %d %v", i, err)
	}
	if string(res) != "test" {
		t.Errorf("expecting test, got %s", string(res))
	}
	i, err = f.WriteAt(orig, 30)
	if i != 4 || err != nil {
		t.Errorf("error writing, got %d %v", i, err)
	}
	i, err = f.ReadAt(res, 30)
	if i != 4 || err != nil {
		t.Errorf("error reading, got %d %v", i, err)
	}
	if string(res) != string(orig) {
		t.Errorf("bad result, expected %s, got %s", string(orig), string(res))
	}
}
BenchmarkXls(b *testing.B) { 316 | benchFile(b, testXls) 317 | } 318 | 319 | /* 320 | 22/12 321 | BenchmarkNovPapPlan 50000 31676 ns/op 322 | BenchmarkWord 20000 65693 ns/op 323 | BenchmarkMsg 10000 198380 ns/op 324 | BenchmarkPpt 50000 30156 ns/op 325 | BenchmarkXls 100000 20327 ns/op 326 | */ 327 | -------------------------------------------------------------------------------- /test/.gitattributes: -------------------------------------------------------------------------------- 1 | # Treat all files as binary 2 | 3 | * binary -------------------------------------------------------------------------------- /test/novpapplan.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/richardlehane/mscfb/473ed4156da572ed0af3e476e227eca7efad5ceb/test/novpapplan.doc -------------------------------------------------------------------------------- /test/test.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/richardlehane/mscfb/473ed4156da572ed0af3e476e227eca7efad5ceb/test/test.doc -------------------------------------------------------------------------------- /test/test.msg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/richardlehane/mscfb/473ed4156da572ed0af3e476e227eca7efad5ceb/test/test.msg -------------------------------------------------------------------------------- /test/test.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/richardlehane/mscfb/473ed4156da572ed0af3e476e227eca7efad5ceb/test/test.ppt -------------------------------------------------------------------------------- /test/test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/richardlehane/mscfb/473ed4156da572ed0af3e476e227eca7efad5ceb/test/test.xls 
--------------------------------------------------------------------------------