├── .github └── workflows │ ├── build.yml │ └── codeql-analysis.yml ├── .gitignore ├── LICENSE ├── README.md ├── apilink.go ├── apipara.go ├── apirun.go ├── docxlib.go ├── empty.go ├── empty_constants.go ├── getstructure └── main.go ├── go.mod ├── go.sum ├── main └── main.go ├── pack.go ├── structdoc.go ├── structdoc_test.go ├── structnodes.go ├── structrel.go ├── structrun.go └── unpack.go /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | - name: Set up Go 17 | uses: actions/setup-go@v2 18 | with: 19 | go-version: 1.16 20 | 21 | - name: Build 22 | run: go build -v ./... 23 | 24 | - name: Test 25 | run: go test -v ./... 26 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '38 4 * * 1' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'go' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 37 | # Learn more: 38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v2 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 
60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v1 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | docxlib 2 | .vscode/ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 gingfrederik 4 | Copyright (c) 2021 Gonzalo Fernandez-Victorio 5 | Copyright (c) 2021 Basement Crowd Ltd (https://www.basementcrowd.com) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Docx library 2 | 3 | Yet another library to read and write .docx (a.k.a. Microsoft Word documents or ECMA-376 Office Open XML) files in Go. 4 | 5 | ## Introduction 6 | 7 | As part of my work for [Basement Crowd](https://www.basementcrowd.com) and [FromCounsel](https://www.fromcounsel.com), we were in need of a basic library to manipulate (both read and write) Microsoft Word documents. 8 | 9 | The differences from other projects are the following: 10 | 11 | - [UniOffice](https://github.com/unidoc/unioffice) is probably the most complete, but it is also commercial (you need to pay) and offers far more than I need. 12 | 13 | - [gingfrederik/docx](https://github.com/gingfrederik/docx) only supports writing documents, not reading them. 14 | 15 | There are also a couple of other projects: [kingzbauer/docx](https://github.com/kingzbauer/docx) and [nguyenthenguyen/docx](https://github.com/nguyenthenguyen/docx). 16 | 17 | [gingfrederik/docx](https://github.com/gingfrederik/docx) was a heavy influence (the original structures and the main method come from that project). 18 | 19 | However, those original structures didn't handle reading, and extending them was particularly difficult because Go's XML parser is somewhat limited, including a [6 year old bug](https://github.com/golang/go/issues/9519). 20 | 21 | Additionally, my requirements go beyond the original structure and a hard fork seemed more sensible. 22 | 23 | The plan is to evolve the library, so the API is likely to change according to my company's needs. 
But please do feel free to send patches, reports and PRs (or fork). 24 | 25 | In the mean time, shared as an example in case somebody finds it useful. 26 | 27 | ## Getting Started 28 | 29 | ### Install 30 | 31 | Go modules supported 32 | 33 | ```sh 34 | go get github.com/gonfva/docxlib 35 | ``` 36 | 37 | ### Usage 38 | 39 | See [main](main/main.go) for an example 40 | 41 | ``` 42 | $ go build -o docxlib ./main 43 | $ ./docxlib 44 | Preparing new document to write at /tmp/new-file.docx 45 | Document writen. 46 | Now trying to read it 47 | We've found a new run with the text ->test 48 | We've found a new run with the text ->test font size 49 | We've found a new run with the text ->test color 50 | We've found a new run with the text ->test font size and color 51 | We've found a new hyperlink with ref http://google.com and the text google 52 | End of main 53 | ``` 54 | You can also increase the log level (-logtostderr=true -v=0) and just dump a specific file(-file /tmp/new-file.docx). See [getstructure/main](getstructure/main.go) 55 | ``` 56 | $ go build -o docxlib ./getstructure/ && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx 57 | I0511 12:37:40.898493 18466 unpack.go:69] Relations: [...] 58 | I0511 12:37:40.898787 18466 unpack.go:47] Doc: [...] 59 | I0511 12:37:40.899330 18466 unpack.go:58] Paragraph [0xc000026d40 0xc000027d00 0xc000172340] 60 | I0511 12:37:40.899369 18466 main.go:31] There is a new paragraph [...] 61 | We've found a new run with the text ->test 62 | We've found a new run with the text ->test font size 63 | We've found a new run with the text ->test color 64 | I0511 12:37:40.899389 18466 main.go:31] There is a new paragraph [...] 65 | We've found a new run with the text ->test font size and color 66 | I0511 12:37:40.899396 18466 main.go:31] There is a new paragraph [...] 67 | We've found a new hyperlink with ref http://google.com and the text google 68 | End of main 69 | ``` 70 | ### Build 71 | 72 | ``` 73 | $ go build ./... 
74 | ``` 75 | 76 | ## License 77 | 78 | MIT. See [LICENSE](LICENSE) 79 | -------------------------------------------------------------------------------- /apilink.go: -------------------------------------------------------------------------------- 1 | package docxlib 2 | 3 | import "strconv" 4 | 5 | // when adding an hyperlink we need to store a reference in the relationship field 6 | func (f *DocxLib) addLinkRelation(link string) string { 7 | rel := &Relationship{ 8 | ID: "rId" + strconv.Itoa(f.rId), 9 | Type: REL_HYPERLINK, 10 | Target: link, 11 | TargetMode: REL_TARGETMODE, 12 | } 13 | 14 | f.rId += 1 15 | 16 | f.DocRelation.Relationships = append(f.DocRelation.Relationships, rel) 17 | 18 | return rel.ID 19 | } 20 | 21 | // AddLink adds an hyperlink to paragraph 22 | func (p *Paragraph) AddLink(text string, link string) *Hyperlink { 23 | rId := p.file.addLinkRelation(link) 24 | hyperlink := &Hyperlink{ 25 | ID: rId, 26 | Run: Run{ 27 | RunProperties: &RunProperties{ 28 | RunStyle: &RunStyle{ 29 | Val: HYPERLINK_STYLE, 30 | }, 31 | }, 32 | InstrText: text, 33 | }, 34 | } 35 | 36 | p.Data = append(p.Data, ParagraphChild{Link: hyperlink}) 37 | 38 | return hyperlink 39 | } 40 | -------------------------------------------------------------------------------- /apipara.go: -------------------------------------------------------------------------------- 1 | package docxlib 2 | 3 | // AddParagraph adds a new paragraph 4 | func (f *DocxLib) AddParagraph() *Paragraph { 5 | p := &Paragraph{ 6 | Data: make([]ParagraphChild, 0), 7 | file: f, 8 | } 9 | 10 | f.Document.Body.Paragraphs = append(f.Document.Body.Paragraphs, p) 11 | return p 12 | } 13 | 14 | func (f *DocxLib) Paragraphs() []*Paragraph { 15 | return f.Document.Body.Paragraphs 16 | } 17 | 18 | func (p *Paragraph) Children() (ret []ParagraphChild) { 19 | return p.Data 20 | } 21 | -------------------------------------------------------------------------------- /apirun.go: 
-------------------------------------------------------------------------------- 1 | package docxlib 2 | 3 | // Color allows to set run color 4 | func (r *Run) Color(color string) *Run { 5 | r.RunProperties.Color = &Color{ 6 | Val: color, 7 | } 8 | 9 | return r 10 | } 11 | 12 | // Size allows to set run size 13 | func (r *Run) Size(size int) *Run { 14 | r.RunProperties.Size = &Size{ 15 | Val: size * 2, 16 | } 17 | return r 18 | } 19 | 20 | // AddText adds text to paragraph 21 | func (p *Paragraph) AddText(text string) *Run { 22 | t := &Text{ 23 | Text: text, 24 | } 25 | 26 | run := &Run{ 27 | Text: t, 28 | RunProperties: &RunProperties{}, 29 | } 30 | 31 | p.Data = append(p.Data, ParagraphChild{Run: run}) 32 | 33 | return run 34 | } 35 | -------------------------------------------------------------------------------- /docxlib.go: -------------------------------------------------------------------------------- 1 | package docxlib 2 | 3 | import ( 4 | "archive/zip" 5 | "errors" 6 | "io" 7 | ) 8 | 9 | // DocxLib is the structure that allow to access the internal represntation 10 | // in memory of the doc (either read or about to be written) 11 | type DocxLib struct { 12 | Document Document 13 | DocRelation Relationships 14 | 15 | rId int 16 | } 17 | 18 | // New generates a new empty docx file that we can manipulate and 19 | // later on, save 20 | func New() *DocxLib { 21 | return emptyFile() 22 | } 23 | 24 | // Parse generates a new docx file in memory from a reader 25 | // You can it invoke from a file 26 | // readFile, err := os.Open(FILE_PATH) 27 | // if err != nil { 28 | // panic(err) 29 | // } 30 | // fileinfo, err := readFile.Stat() 31 | // if err != nil { 32 | // panic(err) 33 | // } 34 | // size := fileinfo.Size() 35 | // doc, err := docxlib.Parse(readFile, int64(size)) 36 | // but also you can invoke from a webform (BEWARE of trusting users data!!!) 
37 | // 38 | // func uploadFile(w http.ResponseWriter, r *http.Request) { 39 | // r.ParseMultipartForm(10 << 20) 40 | // 41 | // file, handler, err := r.FormFile("file") 42 | // if err != nil { 43 | // fmt.Println("Error Retrieving the File") 44 | // fmt.Println(err) 45 | // http.Error(w, err.Error(), http.StatusBadRequest) 46 | // return 47 | // } 48 | // defer file.Close() 49 | // docxlib.Parse(file, handler.Size) 50 | // } 51 | func Parse(reader io.ReaderAt, size int64) (doc *DocxLib, err error) { 52 | zipReader, err := zip.NewReader(reader, size) 53 | if err != nil { 54 | return nil, err 55 | } 56 | doc, err = unpack(zipReader) 57 | return 58 | } 59 | 60 | // Write allows to save a docx to a writer 61 | func (f *DocxLib) Write(writer io.Writer) (err error) { 62 | zipWriter := zip.NewWriter(writer) 63 | defer zipWriter.Close() 64 | 65 | return f.pack(zipWriter) 66 | } 67 | 68 | // References gets the url for a reference 69 | func (f *DocxLib) References(id string) (href string, err error) { 70 | for _, a := range f.DocRelation.Relationships { 71 | if a.ID == id { 72 | href = a.Target 73 | return 74 | } 75 | } 76 | err = errors.New("id not found") 77 | return 78 | } 79 | -------------------------------------------------------------------------------- /empty.go: -------------------------------------------------------------------------------- 1 | package docxlib 2 | 3 | import "encoding/xml" 4 | 5 | func emptyRelationships() []*Relationship { 6 | defaultRel := []*Relationship{ 7 | { 8 | ID: "rId1", 9 | Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`, 10 | Target: "styles.xml", 11 | }, 12 | { 13 | ID: "rId2", 14 | Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`, 15 | Target: "theme/theme1.xml", 16 | }, 17 | { 18 | ID: "rId3", 19 | Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`, 20 | Target: "fontTable.xml", 21 | }, 22 | } 23 | return defaultRel 24 | } 25 | 26 
| func emptyFile() *DocxLib { 27 | docx := &DocxLib{ 28 | Document: Document{ 29 | XMLName: xml.Name{ 30 | Space: "w", 31 | }, 32 | XMLW: XMLNS_W, 33 | XMLR: XMLNS_R, 34 | Body: &Body{ 35 | XMLName: xml.Name{ 36 | Space: "w", 37 | }, 38 | Paragraphs: make([]*Paragraph, 0), 39 | }, 40 | }, 41 | DocRelation: Relationships{ 42 | Xmlns: XMLNS, 43 | Relationships: emptyRelationships(), 44 | }, 45 | rId: 4, 46 | } 47 | return docx 48 | } 49 | -------------------------------------------------------------------------------- /empty_constants.go: -------------------------------------------------------------------------------- 1 | package docxlib 2 | 3 | const ( 4 | TEMP_REL = ` 5 | 6 | 7 | 8 | 9 | ` 10 | TEMP_DOCPROPS_APP = `Go DOCX` 11 | TEMP_DOCPROPS_CORE = `` 12 | TEMP_CONTENT = ` 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | ` 22 | TEMP_WORD_STYLE = ` 23 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | ` 71 | TEMP_WORD_THEME_THEME = ` 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 
| 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | ` 389 | ) 390 | -------------------------------------------------------------------------------- /getstructure/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/golang/glog" 9 | "github.com/gonfva/docxlib" 10 | ) 11 | 12 | var fileLocation *string 13 | 14 | func init() { 15 | fileLocation = flag.String("file", "/tmp/new-file.docx", "file location") 16 | flag.Parse() 17 | } 18 | 19 | func main() { 20 | //Now let's try to read the file 21 | readFile, err := os.Open(*fileLocation) 22 | if err != nil { 23 | panic(err) 24 | } 25 | fileinfo, err := readFile.Stat() 26 | if err != nil { 27 | panic(err) 28 | } 29 | size := fileinfo.Size() 30 | doc, err := docxlib.Parse(readFile, int64(size)) 31 | if err != nil { 32 | panic(err) 33 | } 34 | for _, para := range doc.Paragraphs() { 35 | glog.Infoln("There is a new paragraph", para) 36 | for _, child := range para.Children() { 37 | if child.Run != nil 
&& child.Run.Text != nil { 38 | fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Run.Text.Text) 39 | } 40 | if child.Link != nil { 41 | id := child.Link.ID 42 | text := child.Link.Run.InstrText 43 | link, err := doc.References(id) 44 | if err != nil { 45 | fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text) 46 | } else { 47 | fmt.Printf("\tWe've found a new hyperlink with ref %s and the text %s\n", link, text) 48 | } 49 | 50 | } 51 | } 52 | } 53 | fmt.Println("End of main") 54 | } 55 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/gonfva/docxlib 2 | 3 | go 1.16 4 | 5 | require github.com/golang/glog v0.0.0-20210429001901-424d2337a529 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/golang/glog v0.0.0-20210429001901-424d2337a529 h1:2voWjNECnrZRbfwXxHB1/j8wa6xdKn85B5NzgVL/pTU= 2 | github.com/golang/glog v0.0.0-20210429001901-424d2337a529/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= 3 | -------------------------------------------------------------------------------- /main/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/gonfva/docxlib" 9 | ) 10 | 11 | var fileLocation *string 12 | 13 | func init() { 14 | fileLocation = flag.String("file", "/tmp/new-file.docx", "file location") 15 | flag.Parse() 16 | } 17 | func main() { 18 | fmt.Printf("Preparing new document to write at %s\n", *fileLocation) 19 | 20 | w := docxlib.New() 21 | // add new paragraph 22 | para1 := w.AddParagraph() 23 | // add text 24 | para1.AddText("test") 25 | 26 | para1.AddText("test font size").Size(22) 27 | para1.AddText("test 
color").Color("808080") 28 | para2 := w.AddParagraph() 29 | para2.AddText("test font size and color").Size(22).Color("ff0000") 30 | 31 | nextPara := w.AddParagraph() 32 | nextPara.AddLink("google", `http://google.com`) 33 | 34 | f, err := os.Create(*fileLocation) 35 | if err != nil { 36 | panic(err) 37 | } 38 | defer f.Close() 39 | w.Write(f) 40 | fmt.Println("Document writen. \nNow trying to read it") 41 | // Now let's try to read the file 42 | readFile, err := os.Open(*fileLocation) 43 | if err != nil { 44 | panic(err) 45 | } 46 | fileinfo, err := readFile.Stat() 47 | if err != nil { 48 | panic(err) 49 | } 50 | size := fileinfo.Size() 51 | doc, err := docxlib.Parse(readFile, int64(size)) 52 | if err != nil { 53 | panic(err) 54 | } 55 | for _, para := range doc.Paragraphs() { 56 | for _, child := range para.Children() { 57 | if child.Run != nil { 58 | fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Run.Text.Text) 59 | } 60 | if child.Link != nil { 61 | id := child.Link.ID 62 | text := child.Link.Run.InstrText 63 | link, err := doc.References(id) 64 | if err != nil { 65 | fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text) 66 | } else { 67 | fmt.Printf("\tWe've found a new hyperlink with ref %s and the text %s\n", link, text) 68 | } 69 | 70 | } 71 | } 72 | } 73 | fmt.Println("End of main") 74 | } 75 | -------------------------------------------------------------------------------- /pack.go: -------------------------------------------------------------------------------- 1 | package docxlib 2 | 3 | import ( 4 | "archive/zip" 5 | "encoding/xml" 6 | 7 | "github.com/golang/glog" 8 | ) 9 | 10 | // This receives a zip file writer (word documents are a zip with multiple xml inside) 11 | // and writes the relevant files. 
Some of them come from the empty_constants file, 12 | // others from the actual in-memory structure 13 | func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) { 14 | files := map[string]string{} 15 | 16 | files["_rels/.rels"] = TEMP_REL 17 | files["docProps/app.xml"] = TEMP_DOCPROPS_APP 18 | files["docProps/core.xml"] = TEMP_DOCPROPS_CORE 19 | files["word/theme/theme1.xml"] = TEMP_WORD_THEME_THEME 20 | files["word/styles.xml"] = TEMP_WORD_STYLE 21 | files["[Content_Types].xml"] = TEMP_CONTENT 22 | files["word/_rels/document.xml.rels"], err = marshal(f.DocRelation) 23 | if err != nil { 24 | return err 25 | } 26 | files["word/document.xml"], err = marshal(f.Document) 27 | if err != nil { 28 | return err 29 | } 30 | 31 | for path, data := range files { 32 | w, err := zipWriter.Create(path) 33 | if err != nil { 34 | return err 35 | } 36 | 37 | _, err = w.Write([]byte(data)) 38 | if err != nil { 39 | return err 40 | } 41 | } 42 | 43 | return 44 | } 45 | 46 | func marshal(data interface{}) (out string, err error) { 47 | body, err := xml.Marshal(data) 48 | if err != nil { 49 | glog.Errorln("Error marshalling", err) 50 | return 51 | } 52 | 53 | out = xml.Header + string(body) 54 | return 55 | } 56 | -------------------------------------------------------------------------------- /structdoc.go: -------------------------------------------------------------------------------- 1 | package docxlib 2 | 3 | import "encoding/xml" 4 | 5 | const ( 6 | XMLNS_W = `http://schemas.openxmlformats.org/wordprocessingml/2006/main` 7 | XMLNS_R = `http://schemas.openxmlformats.org/officeDocument/2006/relationships` 8 | ) 9 | 10 | type Body struct { 11 | XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main body"` 12 | Paragraphs []*Paragraph `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"` 13 | } 14 | 15 | type Document struct { 16 | XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main document"` 17 | XMLW 
string `xml:"xmlns:w,attr"` 18 | XMLR string `xml:"xmlns:r,attr"` 19 | Body *Body 20 | } 21 | -------------------------------------------------------------------------------- /structdoc_test.go: -------------------------------------------------------------------------------- 1 | package docxlib 2 | 3 | import ( 4 | "encoding/xml" 5 | "testing" 6 | ) 7 | 8 | const decoded_doc_1 = `testtest font sizetest colorNew style 1New style 2test font size and colorgoogle` 9 | const decoded_doc_2 = `Table of Contents TOC \h \z \t "Heading 1,2,S6,1,S0,1,S1,1,S2,1,S3,1,S4,1,S5,1" Holy Grail [xref:bRJduW6hNR] PAGEREF _Toc420414504 \h 21.What is your name? [xref:TH7u7QDqhD] PAGEREF _Toc420414505 \h 22.What is your quest? [xref:bC62HkFATC] PAGEREF _Toc420414506 \h 23.What is your favourite colour? [xref:I3TphuHX6N] PAGEREF _Toc420414507 \h 2Holy Grail [ FORMTEXT xref:bRJduW6hNR]What is your name? [ FORMTEXT xref:TH7u7QDqhD]My name is Sir Launcelot of Camelot.What is your quest? [ FORMTEXT xref:bC62HkFATC]To seek the Holy Grail[or a grail shaped beacon]. What is your favourite colour? 
[ FORMTEXT xref:I3TphuHX6N]Blue.How many paragraphs here then?` 10 | const NUM_PARAGRAPHS = 5 11 | 12 | func TestStructure(t *testing.T) { 13 | doc := Document{ 14 | XMLW: XMLNS_W, 15 | XMLR: XMLNS_R, 16 | XMLName: xml.Name{Space: XMLNS_W, Local: "document"}} 17 | testCases := []struct { 18 | content string 19 | numParagraphs int 20 | }{ 21 | {decoded_doc_1, 5}, 22 | {decoded_doc_2, 19}, 23 | } 24 | for _, tc := range testCases { 25 | err := xml.Unmarshal([]byte(tc.content), &doc) 26 | if err != nil { 27 | t.Errorf("We expected to be able to decode %s but we didn't", 28 | tc.content) 29 | } 30 | if len(doc.Body.Paragraphs) != tc.numParagraphs { 31 | t.Errorf("We expected %d paragraphs, we got %d", 32 | NUM_PARAGRAPHS, len(doc.Body.Paragraphs)) 33 | } 34 | for _, p := range doc.Body.Paragraphs { 35 | if len(p.Children()) == 0 { 36 | t.Errorf("We were not able to parse paragraph %v", 37 | p) 38 | } 39 | for _, child := range p.Children() { 40 | if child.Link == nil && child.Properties == nil && child.Run == nil { 41 | t.Errorf("There are Paragraph children with all fields nil") 42 | } 43 | if child.Run != nil && child.Run.Text == nil && child.Run.InstrText == "" { 44 | t.Errorf("We have a run with no text") 45 | } 46 | if child.Link != nil && child.Link.ID == "" { 47 | t.Errorf("We have a link without ID") 48 | } 49 | } 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /structnodes.go: -------------------------------------------------------------------------------- 1 | package docxlib 2 | 3 | import ( 4 | "encoding/xml" 5 | "io" 6 | 7 | "github.com/golang/glog" 8 | ) 9 | 10 | type ParagraphChild struct { 11 | Link *Hyperlink `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main hyperlink,omitempty"` 12 | Run *Run `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main r,omitempty"` 13 | Properties *RunProperties `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main 
rPr,omitempty"` 14 | } 15 | 16 | type Paragraph struct { 17 | XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"` 18 | Data []ParagraphChild 19 | 20 | file *DocxLib 21 | } 22 | 23 | func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { 24 | children := make([]ParagraphChild, 0) 25 | for { 26 | t, err := d.Token() 27 | if err == io.EOF { 28 | break 29 | } 30 | switch tt := t.(type) { 31 | case xml.StartElement: 32 | var elem ParagraphChild 33 | if tt.Name.Local == "hyperlink" { 34 | var value Hyperlink 35 | d.DecodeElement(&value, &start) 36 | id := getAtt(tt.Attr, "id") 37 | anchor := getAtt(tt.Attr, "anchor") 38 | if id != "" { 39 | value.ID = id 40 | } 41 | if anchor != "" { 42 | value.ID = anchor 43 | } 44 | elem = ParagraphChild{Link: &value} 45 | } else if tt.Name.Local == "r" { 46 | var value Run 47 | d.DecodeElement(&value, &start) 48 | elem = ParagraphChild{Run: &value} 49 | if value.InstrText == "" && value.Text == nil { 50 | glog.V(0).Infof("Empty run, we ignore") 51 | continue 52 | } 53 | } else if tt.Name.Local == "rPr" { 54 | var value RunProperties 55 | d.DecodeElement(&value, &start) 56 | elem = ParagraphChild{Properties: &value} 57 | } else { 58 | continue 59 | } 60 | children = append(children, elem) 61 | } 62 | 63 | } 64 | *p = Paragraph{Data: children} 65 | return nil 66 | 67 | } 68 | -------------------------------------------------------------------------------- /structrel.go: -------------------------------------------------------------------------------- 1 | package docxlib 2 | 3 | import "encoding/xml" 4 | 5 | const ( 6 | XMLNS = `http://schemas.openxmlformats.org/package/2006/relationships` 7 | REL_HYPERLINK = `http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink` 8 | 9 | REL_TARGETMODE = "External" 10 | ) 11 | 12 | type Relationships struct { 13 | XMLName xml.Name `xml:"Relationships"` 14 | Xmlns string `xml:"xmlns,attr"` 15 | Relationships 
const (
	// HYPERLINK_STYLE is the run style applied to hyperlinks created by AddLink.
	HYPERLINK_STYLE = "a1"
)

// A Run is part of a paragraph that has its own style. It could be
// a piece of text in bold, or a link
type Run struct {
	XMLName       xml.Name       `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main r,omitempty"`
	RunProperties *RunProperties `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr,omitempty"`
	InstrText     string         `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main instrText,omitempty"`
	Text          *Text
}

// The Text object contains the actual text
type Text struct {
	XMLName  xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main t"`
	XMLSpace string   `xml:"xml:space,attr,omitempty"`
	Text     string   `xml:",chardata"`
}

// The hyperlink element contains links
type Hyperlink struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main hyperlink,omitempty"`
	ID      string   `xml:"http://schemas.openxmlformats.org/officeDocument/2006/relationships id,attr"`
	Run     Run
}

// RunProperties encapsulates visual properties of a run
type RunProperties struct {
	XMLName  xml.Name  `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr,omitempty"`
	Color    *Color    `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main color,omitempty"`
	Size     *Size     `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main sz,omitempty"`
	RunStyle *RunStyle `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rStyle,omitempty"`
	Style    *Style    `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main pStyle,omitempty"`
}

// RunStyle contains styling for a run
type RunStyle struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rStyle,omitempty"`
	Val     string   `xml:"w:val,attr"`
}

// Style contains styling for a paragraph
type Style struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main pStyle,omitempty"`
	Val     string   `xml:"w:val,attr"`
}

// Color contains the color of a run, as found in the w:val attribute.
type Color struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main color"`
	Val     string   `xml:"w:val,attr"`
}

// Size contains the font size
type Size struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main sz"`
	Val     int      `xml:"w:val,attr"`
}

// UnmarshalXML implements xml.Unmarshaler. It reads the whole run element and
// keeps its properties (rPr), field instructions (instrText) and text (t).
// Other children are ignored. When an element occurs more than once, the
// last occurrence wins.
//
// Every child is decoded from its own start element, and any decoding error
// is returned: the decoder repeats the same error on every call, so ignoring
// it would loop forever on a truncated or malformed document.
func (r *Run) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	var elem Run
	for {
		t, err := d.Token()
		if err == io.EOF {
			// The decoder reports EOF at the end of the run element.
			break
		}
		if err != nil {
			return err
		}
		tt, ok := t.(xml.StartElement)
		if !ok {
			continue
		}

		switch tt.Name.Local {
		case "rPr":
			var value RunProperties
			if err := d.DecodeElement(&value, &tt); err != nil {
				return err
			}
			elem.RunProperties = &value
		case "instrText":
			var value string
			if err := d.DecodeElement(&value, &tt); err != nil {
				return err
			}
			elem.InstrText = value
		case "t":
			var value Text
			if err := d.DecodeElement(&value, &tt); err != nil {
				return err
			}
			elem.Text = &value
		}
	}
	*r = elem

	return nil
}

// UnmarshalXML implements xml.Unmarshaler. It stores the character data of
// the text element, joined across tokens in case the parser splits it, and
// the xml:space attribute so that it survives a read/write round trip.
func (r *Text) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	var elem Text
	// Only trust the attributes when start really is the text element.
	if start.Name.Local == "t" {
		elem.XMLSpace = getAtt(start.Attr, "space")
	}

	var content []byte
	for {
		t, err := d.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		if cd, ok := t.(xml.CharData); ok {
			content = append(content, cd...)
		}
	}
	elem.Text = string(content)

	*r = elem
	return nil
}

// UnmarshalXML implements xml.Unmarshaler. It reads the hyperlink element and
// keeps the run found inside it (the last one, if there are several). The ID
// is taken from the element's id attribute, or from its anchor attribute
// when present, which takes precedence.
func (r *Hyperlink) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	var elem Hyperlink
	// Only trust the attributes when start really is the hyperlink element.
	if start.Name.Local == "hyperlink" {
		elem.ID = getAtt(start.Attr, "id")
		if anchor := getAtt(start.Attr, "anchor"); anchor != "" {
			elem.ID = anchor
		}
	}

	for {
		t, err := d.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		tt, ok := t.(xml.StartElement)
		if !ok || tt.Name.Local != "r" {
			continue
		}
		if err := d.DecodeElement(&elem.Run, &tt); err != nil {
			return err
		}
	}
	*r = elem
	return nil
}

// UnmarshalXML implements xml.Unmarshaler. The field tag names the attribute
// "w:val" so that it is written with its prefix, which never matches while
// reading; the value is therefore read here from any attribute whose local
// name is val.
func (r *RunStyle) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	var aux struct {
		Val string `xml:"val,attr"`
	}
	if err := d.DecodeElement(&aux, &start); err != nil {
		return err
	}
	*r = RunStyle{Val: aux.Val}
	return nil
}

// UnmarshalXML implements xml.Unmarshaler; see RunStyle.UnmarshalXML for why
// the val attribute has to be read by hand.
func (s *Style) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	var aux struct {
		Val string `xml:"val,attr"`
	}
	if err := d.DecodeElement(&aux, &start); err != nil {
		return err
	}
	*s = Style{Val: aux.Val}
	return nil
}

// UnmarshalXML implements xml.Unmarshaler; see RunStyle.UnmarshalXML for why
// the val attribute has to be read by hand.
func (c *Color) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	var aux struct {
		Val string `xml:"val,attr"`
	}
	if err := d.DecodeElement(&aux, &start); err != nil {
		return err
	}
	*c = Color{Val: aux.Val}
	return nil
}

// UnmarshalXML implements xml.Unmarshaler; see RunStyle.UnmarshalXML for why
// the val attribute has to be read by hand. The value is kept exactly as
// stored in the document; a non-numeric value is reported as an error.
func (s *Size) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	var aux struct {
		Val int `xml:"val,attr"`
	}
	if err := d.DecodeElement(&aux, &start); err != nil {
		return err
	}
	*s = Size{Val: aux.Val}
	return nil
}

// getAtt returns the value of the first attribute whose local name is name,
// whatever its namespace, or the empty string when there is none.
func getAtt(atts []xml.Attr, name string) string {
	for _, at := range atts {
		if at.Name.Local == name {
			return at.Value
		}
	}
	return ""
}
*Relationships 19 | for _, f := range zipReader.File { 20 | if f.Name == "word/_rels/document.xml.rels" { 21 | relations, err = processRelations(f) 22 | if err != nil { 23 | return nil, err 24 | } 25 | } 26 | if f.Name == "word/document.xml" { 27 | doc, err = processDoc(f) 28 | if err != nil { 29 | return nil, err 30 | } 31 | } 32 | } 33 | docx = &DocxLib{ 34 | Document: *doc, 35 | DocRelation: *relations, 36 | } 37 | return docx, nil 38 | } 39 | 40 | // Processes one of the relevant files, the one with the actual document 41 | func processDoc(file *zip.File) (*Document, error) { 42 | filebytes, err := readZipFile(file) 43 | if err != nil { 44 | glog.Errorln("Error reading from internal zip file") 45 | return nil, err 46 | } 47 | glog.V(0).Infoln("Doc:", string(filebytes)) 48 | 49 | doc := Document{ 50 | XMLW: XMLNS_W, 51 | XMLR: XMLNS_R, 52 | XMLName: xml.Name{Space: XMLNS_W, Local: "document"}} 53 | err = xml.Unmarshal(filebytes, &doc) 54 | if err != nil { 55 | glog.Errorln("Error unmarshalling doc", string(filebytes)) 56 | return nil, err 57 | } 58 | glog.V(0).Infoln("Paragraph", doc.Body.Paragraphs) 59 | return &doc, nil 60 | } 61 | 62 | // Processes one of the relevant files, the one with the relationships 63 | func processRelations(file *zip.File) (*Relationships, error) { 64 | filebytes, err := readZipFile(file) 65 | if err != nil { 66 | glog.Errorln("Error reading from internal zip file") 67 | return nil, err 68 | } 69 | glog.V(0).Infoln("Relations:", string(filebytes)) 70 | 71 | rels := Relationships{Xmlns: XMLNS_R} 72 | err = xml.Unmarshal(filebytes, &rels) 73 | if err != nil { 74 | glog.Errorln("Error unmarshalling relationships") 75 | return nil, err 76 | } 77 | return &rels, nil 78 | } 79 | 80 | // From a zip file structure, we return a byte array 81 | func readZipFile(zf *zip.File) ([]byte, error) { 82 | f, err := zf.Open() 83 | if err != nil { 84 | return nil, err 85 | } 86 | defer f.Close() 87 | return ioutil.ReadAll(f) 88 | } 89 | 
--------------------------------------------------------------------------------