├── .github
└── workflows
│ ├── build.yml
│ └── codeql-analysis.yml
├── .gitignore
├── LICENSE
├── README.md
├── apilink.go
├── apipara.go
├── apirun.go
├── docxlib.go
├── empty.go
├── empty_constants.go
├── getstructure
└── main.go
├── go.mod
├── go.sum
├── main
└── main.go
├── pack.go
├── structdoc.go
├── structdoc_test.go
├── structnodes.go
├── structrel.go
├── structrun.go
└── unpack.go
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Go
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | pull_request:
7 | branches: [ master ]
8 |
9 | jobs:
10 |
11 | build:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v2
15 |
16 | - name: Set up Go
17 | uses: actions/setup-go@v2
18 | with:
19 | go-version: 1.16
20 |
21 | - name: Build
22 | run: go build -v ./...
23 |
24 | - name: Test
25 | run: go test -v ./...
26 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ master ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ master ]
20 | schedule:
21 | - cron: '38 4 * * 1'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ 'go' ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
37 | # Learn more:
38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
39 |
40 | steps:
41 | - name: Checkout repository
42 | uses: actions/checkout@v2
43 |
44 | # Initializes the CodeQL tools for scanning.
45 | - name: Initialize CodeQL
46 | uses: github/codeql-action/init@v1
47 | with:
48 | languages: ${{ matrix.language }}
49 | # If you wish to specify custom queries, you can do so here or in a config file.
50 | # By default, queries listed here will override any specified in a config file.
51 | # Prefix the list here with "+" to use these queries and those in the config file.
52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
53 |
54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
55 | # If this step fails, then you should remove it and run the build manually (see below)
56 | - name: Autobuild
57 | uses: github/codeql-action/autobuild@v1
58 |
59 | # ℹ️ Command-line programs to run using the OS shell.
60 | # 📚 https://git.io/JvXDl
61 |
62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
63 | # and modify them (or add more) to build your code if your project
64 | # uses a compiled language
65 |
66 | #- run: |
67 | # make bootstrap
68 | # make release
69 |
70 | - name: Perform CodeQL Analysis
71 | uses: github/codeql-action/analyze@v1
72 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | docxlib
2 | .vscode/
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 gingfrederik
4 | Copyright (c) 2021 Gonzalo Fernandez-Victorio
5 | Copyright (c) 2021 Basement Crowd Ltd (https://www.basementcrowd.com)
6 |
7 | Permission is hereby granted, free of charge, to any person obtaining a copy
8 | of this software and associated documentation files (the "Software"), to deal
9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Docx library
2 |
3 | Yet another library to read and write .docx (a.k.a. Microsoft Word documents or ECMA-376 Office Open XML) files in Go.
4 |
5 | ## Introduction
6 |
7 | As part of my work for [Basement Crowd](https://www.basementcrowd.com) and [FromCounsel](https://www.fromcounsel.com), we were in need of a basic library to manipulate (both read and write) Microsoft Word documents.
8 |
9 | The difference with other projects is the following:
10 |
11 | - [UniOffice](https://github.com/unidoc/unioffice) is probably the most complete but it is also commercial (you need to pay). It is also very complete, but too much for my needs.
12 |
13 | - [gingfrederik/docx](https://github.com/gingfrederik/docx) only allows writing.
14 |
15 | There are also a couple of other projects [kingzbauer/docx](https://github.com/kingzbauer/docx) and [nguyenthenguyen/docx](https://github.com/nguyenthenguyen/docx)
16 |
17 | [gingfrederik/docx](https://github.com/gingfrederik/docx) was a heavy influence (the original structures and the main method come from that project).
18 |
19 | However, those original structures didn't handle reading and extending them was particularly difficult due to Go xml parser being a bit limited including a [6 year old bug](https://github.com/golang/go/issues/9519).
20 |
21 | Additionally, my requirements go beyond the original structure and a hard fork seemed more sensible.
22 |
23 | The plan is to evolve the library, so the API is likely to change according to my company's needs. But please do feel free to send patches, reports and PRs (or fork).
24 |
25 | In the meantime, it is shared as an example in case somebody finds it useful.
26 |
27 | ## Getting Started
28 |
29 | ### Install
30 |
31 | Go modules supported
32 |
33 | ```sh
34 | go get github.com/gonfva/docxlib
35 | ```
36 |
37 | ### Usage
38 |
39 | See [main](main/main.go) for an example
40 |
41 | ```
42 | $ go build -o docxlib ./main
43 | $ ./docxlib
44 | Preparing new document to write at /tmp/new-file.docx
45 | Document writen.
46 | Now trying to read it
47 | We've found a new run with the text ->test
48 | We've found a new run with the text ->test font size
49 | We've found a new run with the text ->test color
50 | We've found a new run with the text ->test font size and color
51 | We've found a new hyperlink with ref http://google.com and the text google
52 | End of main
53 | ```
54 | You can also increase the log level (-logtostderr=true -v=0) and just dump a specific file (-file /tmp/new-file.docx). See [getstructure/main](getstructure/main.go)
55 | ```
56 | $ go build -o docxlib ./getstructure/ && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx
57 | I0511 12:37:40.898493 18466 unpack.go:69] Relations: [...]
58 | I0511 12:37:40.898787 18466 unpack.go:47] Doc: [...]
59 | I0511 12:37:40.899330 18466 unpack.go:58] Paragraph [0xc000026d40 0xc000027d00 0xc000172340]
60 | I0511 12:37:40.899369 18466 main.go:31] There is a new paragraph [...]
61 | We've found a new run with the text ->test
62 | We've found a new run with the text ->test font size
63 | We've found a new run with the text ->test color
64 | I0511 12:37:40.899389 18466 main.go:31] There is a new paragraph [...]
65 | We've found a new run with the text ->test font size and color
66 | I0511 12:37:40.899396 18466 main.go:31] There is a new paragraph [...]
67 | We've found a new hyperlink with ref http://google.com and the text google
68 | End of main
69 | ```
70 | ### Build
71 |
72 | ```
73 | $ go build ./...
74 | ```
75 |
76 | ## License
77 |
78 | MIT. See [LICENSE](LICENSE)
79 |
--------------------------------------------------------------------------------
/apilink.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | import "strconv"
4 |
5 | // when adding an hyperlink we need to store a reference in the relationship field
6 | func (f *DocxLib) addLinkRelation(link string) string {
7 | rel := &Relationship{
8 | ID: "rId" + strconv.Itoa(f.rId),
9 | Type: REL_HYPERLINK,
10 | Target: link,
11 | TargetMode: REL_TARGETMODE,
12 | }
13 |
14 | f.rId += 1
15 |
16 | f.DocRelation.Relationships = append(f.DocRelation.Relationships, rel)
17 |
18 | return rel.ID
19 | }
20 |
21 | // AddLink adds an hyperlink to paragraph
22 | func (p *Paragraph) AddLink(text string, link string) *Hyperlink {
23 | rId := p.file.addLinkRelation(link)
24 | hyperlink := &Hyperlink{
25 | ID: rId,
26 | Run: Run{
27 | RunProperties: &RunProperties{
28 | RunStyle: &RunStyle{
29 | Val: HYPERLINK_STYLE,
30 | },
31 | },
32 | InstrText: text,
33 | },
34 | }
35 |
36 | p.Data = append(p.Data, ParagraphChild{Link: hyperlink})
37 |
38 | return hyperlink
39 | }
40 |
--------------------------------------------------------------------------------
/apipara.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | // AddParagraph adds a new paragraph
4 | func (f *DocxLib) AddParagraph() *Paragraph {
5 | p := &Paragraph{
6 | Data: make([]ParagraphChild, 0),
7 | file: f,
8 | }
9 |
10 | f.Document.Body.Paragraphs = append(f.Document.Body.Paragraphs, p)
11 | return p
12 | }
13 |
14 | func (f *DocxLib) Paragraphs() []*Paragraph {
15 | return f.Document.Body.Paragraphs
16 | }
17 |
18 | func (p *Paragraph) Children() (ret []ParagraphChild) {
19 | return p.Data
20 | }
21 |
--------------------------------------------------------------------------------
/apirun.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | // Color allows to set run color
4 | func (r *Run) Color(color string) *Run {
5 | r.RunProperties.Color = &Color{
6 | Val: color,
7 | }
8 |
9 | return r
10 | }
11 |
12 | // Size allows to set run size
13 | func (r *Run) Size(size int) *Run {
14 | r.RunProperties.Size = &Size{
15 | Val: size * 2,
16 | }
17 | return r
18 | }
19 |
20 | // AddText adds text to paragraph
21 | func (p *Paragraph) AddText(text string) *Run {
22 | t := &Text{
23 | Text: text,
24 | }
25 |
26 | run := &Run{
27 | Text: t,
28 | RunProperties: &RunProperties{},
29 | }
30 |
31 | p.Data = append(p.Data, ParagraphChild{Run: run})
32 |
33 | return run
34 | }
35 |
--------------------------------------------------------------------------------
/docxlib.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | import (
4 | "archive/zip"
5 | "errors"
6 | "io"
7 | )
8 |
// DocxLib is the structure that gives access to the in-memory representation
// of the document (either read from a file or about to be written).
type DocxLib struct {
	Document    Document      // marshalled to word/document.xml when packing
	DocRelation Relationships // marshalled to word/_rels/document.xml.rels when packing

	rId int // numeric suffix used for the next "rId<n>" relationship ID
}
17 |
18 | // New generates a new empty docx file that we can manipulate and
19 | // later on, save
20 | func New() *DocxLib {
21 | return emptyFile()
22 | }
23 |
24 | // Parse generates a new docx file in memory from a reader
25 | // You can it invoke from a file
26 | // readFile, err := os.Open(FILE_PATH)
27 | // if err != nil {
28 | // panic(err)
29 | // }
30 | // fileinfo, err := readFile.Stat()
31 | // if err != nil {
32 | // panic(err)
33 | // }
34 | // size := fileinfo.Size()
35 | // doc, err := docxlib.Parse(readFile, int64(size))
36 | // but also you can invoke from a webform (BEWARE of trusting users data!!!)
37 | //
38 | // func uploadFile(w http.ResponseWriter, r *http.Request) {
39 | // r.ParseMultipartForm(10 << 20)
40 | //
41 | // file, handler, err := r.FormFile("file")
42 | // if err != nil {
43 | // fmt.Println("Error Retrieving the File")
44 | // fmt.Println(err)
45 | // http.Error(w, err.Error(), http.StatusBadRequest)
46 | // return
47 | // }
48 | // defer file.Close()
49 | // docxlib.Parse(file, handler.Size)
50 | // }
51 | func Parse(reader io.ReaderAt, size int64) (doc *DocxLib, err error) {
52 | zipReader, err := zip.NewReader(reader, size)
53 | if err != nil {
54 | return nil, err
55 | }
56 | doc, err = unpack(zipReader)
57 | return
58 | }
59 |
60 | // Write allows to save a docx to a writer
61 | func (f *DocxLib) Write(writer io.Writer) (err error) {
62 | zipWriter := zip.NewWriter(writer)
63 | defer zipWriter.Close()
64 |
65 | return f.pack(zipWriter)
66 | }
67 |
68 | // References gets the url for a reference
69 | func (f *DocxLib) References(id string) (href string, err error) {
70 | for _, a := range f.DocRelation.Relationships {
71 | if a.ID == id {
72 | href = a.Target
73 | return
74 | }
75 | }
76 | err = errors.New("id not found")
77 | return
78 | }
79 |
--------------------------------------------------------------------------------
/empty.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | import "encoding/xml"
4 |
5 | func emptyRelationships() []*Relationship {
6 | defaultRel := []*Relationship{
7 | {
8 | ID: "rId1",
9 | Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`,
10 | Target: "styles.xml",
11 | },
12 | {
13 | ID: "rId2",
14 | Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`,
15 | Target: "theme/theme1.xml",
16 | },
17 | {
18 | ID: "rId3",
19 | Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`,
20 | Target: "fontTable.xml",
21 | },
22 | }
23 | return defaultRel
24 | }
25 |
26 | func emptyFile() *DocxLib {
27 | docx := &DocxLib{
28 | Document: Document{
29 | XMLName: xml.Name{
30 | Space: "w",
31 | },
32 | XMLW: XMLNS_W,
33 | XMLR: XMLNS_R,
34 | Body: &Body{
35 | XMLName: xml.Name{
36 | Space: "w",
37 | },
38 | Paragraphs: make([]*Paragraph, 0),
39 | },
40 | },
41 | DocRelation: Relationships{
42 | Xmlns: XMLNS,
43 | Relationships: emptyRelationships(),
44 | },
45 | rId: 4,
46 | }
47 | return docx
48 | }
49 |
--------------------------------------------------------------------------------
/empty_constants.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | const (
4 | TEMP_REL = `
5 |
6 |
7 |
8 |
9 | `
10 | TEMP_DOCPROPS_APP = `Go DOCX`
11 | TEMP_DOCPROPS_CORE = ``
12 | TEMP_CONTENT = `
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 | `
22 | TEMP_WORD_STYLE = `
23 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 | `
71 | TEMP_WORD_THEME_THEME = `
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 | `
389 | )
390 |
--------------------------------------------------------------------------------
/getstructure/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "fmt"
6 | "os"
7 |
8 | "github.com/golang/glog"
9 | "github.com/gonfva/docxlib"
10 | )
11 |
// fileLocation points at the value of the -file flag: the docx file to read.
var fileLocation *string

// init registers the -file flag (default /tmp/new-file.docx) and parses the
// command line, so the flag value is available when main starts.
func init() {
	fileLocation = flag.String("file", "/tmp/new-file.docx", "file location")
	flag.Parse()
}
18 |
19 | func main() {
20 | //Now let's try to read the file
21 | readFile, err := os.Open(*fileLocation)
22 | if err != nil {
23 | panic(err)
24 | }
25 | fileinfo, err := readFile.Stat()
26 | if err != nil {
27 | panic(err)
28 | }
29 | size := fileinfo.Size()
30 | doc, err := docxlib.Parse(readFile, int64(size))
31 | if err != nil {
32 | panic(err)
33 | }
34 | for _, para := range doc.Paragraphs() {
35 | glog.Infoln("There is a new paragraph", para)
36 | for _, child := range para.Children() {
37 | if child.Run != nil && child.Run.Text != nil {
38 | fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Run.Text.Text)
39 | }
40 | if child.Link != nil {
41 | id := child.Link.ID
42 | text := child.Link.Run.InstrText
43 | link, err := doc.References(id)
44 | if err != nil {
45 | fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text)
46 | } else {
47 | fmt.Printf("\tWe've found a new hyperlink with ref %s and the text %s\n", link, text)
48 | }
49 |
50 | }
51 | }
52 | }
53 | fmt.Println("End of main")
54 | }
55 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/gonfva/docxlib
2 |
3 | go 1.16
4 |
5 | require github.com/golang/glog v0.0.0-20210429001901-424d2337a529
6 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/golang/glog v0.0.0-20210429001901-424d2337a529 h1:2voWjNECnrZRbfwXxHB1/j8wa6xdKn85B5NzgVL/pTU=
2 | github.com/golang/glog v0.0.0-20210429001901-424d2337a529/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
3 |
--------------------------------------------------------------------------------
/main/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "fmt"
6 | "os"
7 |
8 | "github.com/gonfva/docxlib"
9 | )
10 |
// fileLocation points at the value of the -file flag: the docx file that is
// written and then read back.
var fileLocation *string

// init registers the -file flag (default /tmp/new-file.docx) and parses the
// command line, so the flag value is available when main starts.
func init() {
	fileLocation = flag.String("file", "/tmp/new-file.docx", "file location")
	flag.Parse()
}
17 | func main() {
18 | fmt.Printf("Preparing new document to write at %s\n", *fileLocation)
19 |
20 | w := docxlib.New()
21 | // add new paragraph
22 | para1 := w.AddParagraph()
23 | // add text
24 | para1.AddText("test")
25 |
26 | para1.AddText("test font size").Size(22)
27 | para1.AddText("test color").Color("808080")
28 | para2 := w.AddParagraph()
29 | para2.AddText("test font size and color").Size(22).Color("ff0000")
30 |
31 | nextPara := w.AddParagraph()
32 | nextPara.AddLink("google", `http://google.com`)
33 |
34 | f, err := os.Create(*fileLocation)
35 | if err != nil {
36 | panic(err)
37 | }
38 | defer f.Close()
39 | w.Write(f)
40 | fmt.Println("Document writen. \nNow trying to read it")
41 | // Now let's try to read the file
42 | readFile, err := os.Open(*fileLocation)
43 | if err != nil {
44 | panic(err)
45 | }
46 | fileinfo, err := readFile.Stat()
47 | if err != nil {
48 | panic(err)
49 | }
50 | size := fileinfo.Size()
51 | doc, err := docxlib.Parse(readFile, int64(size))
52 | if err != nil {
53 | panic(err)
54 | }
55 | for _, para := range doc.Paragraphs() {
56 | for _, child := range para.Children() {
57 | if child.Run != nil {
58 | fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Run.Text.Text)
59 | }
60 | if child.Link != nil {
61 | id := child.Link.ID
62 | text := child.Link.Run.InstrText
63 | link, err := doc.References(id)
64 | if err != nil {
65 | fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text)
66 | } else {
67 | fmt.Printf("\tWe've found a new hyperlink with ref %s and the text %s\n", link, text)
68 | }
69 |
70 | }
71 | }
72 | }
73 | fmt.Println("End of main")
74 | }
75 |
--------------------------------------------------------------------------------
/pack.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | import (
4 | "archive/zip"
5 | "encoding/xml"
6 |
7 | "github.com/golang/glog"
8 | )
9 |
10 | // This receives a zip file writer (word documents are a zip with multiple xml inside)
11 | // and writes the relevant files. Some of them come from the empty_constants file,
12 | // others from the actual in-memory structure
13 | func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) {
14 | files := map[string]string{}
15 |
16 | files["_rels/.rels"] = TEMP_REL
17 | files["docProps/app.xml"] = TEMP_DOCPROPS_APP
18 | files["docProps/core.xml"] = TEMP_DOCPROPS_CORE
19 | files["word/theme/theme1.xml"] = TEMP_WORD_THEME_THEME
20 | files["word/styles.xml"] = TEMP_WORD_STYLE
21 | files["[Content_Types].xml"] = TEMP_CONTENT
22 | files["word/_rels/document.xml.rels"], err = marshal(f.DocRelation)
23 | if err != nil {
24 | return err
25 | }
26 | files["word/document.xml"], err = marshal(f.Document)
27 | if err != nil {
28 | return err
29 | }
30 |
31 | for path, data := range files {
32 | w, err := zipWriter.Create(path)
33 | if err != nil {
34 | return err
35 | }
36 |
37 | _, err = w.Write([]byte(data))
38 | if err != nil {
39 | return err
40 | }
41 | }
42 |
43 | return
44 | }
45 |
46 | func marshal(data interface{}) (out string, err error) {
47 | body, err := xml.Marshal(data)
48 | if err != nil {
49 | glog.Errorln("Error marshalling", err)
50 | return
51 | }
52 |
53 | out = xml.Header + string(body)
54 | return
55 | }
56 |
--------------------------------------------------------------------------------
/structdoc.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | import "encoding/xml"
4 |
// XML namespaces written as the xmlns:w and xmlns:r attributes of the
// document element.
const (
	// XMLNS_W is the WordprocessingML main namespace.
	XMLNS_W = `http://schemas.openxmlformats.org/wordprocessingml/2006/main`
	// XMLNS_R is the office-document relationships namespace.
	XMLNS_R = `http://schemas.openxmlformats.org/officeDocument/2006/relationships`
)
9 |
// Body maps the body element of the document: the ordered list of its
// paragraphs.
type Body struct {
	XMLName    xml.Name     `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main body"`
	Paragraphs []*Paragraph `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"` // one entry per p element
}
14 |
// Document maps the root document element that is marshalled to
// word/document.xml.
type Document struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main document"`
	XMLW    string   `xml:"xmlns:w,attr"` // value of the xmlns:w attribute (XMLNS_W for new documents)
	XMLR    string   `xml:"xmlns:r,attr"` // value of the xmlns:r attribute (XMLNS_R for new documents)
	Body    *Body    // document body; holds the paragraphs
}
21 |
--------------------------------------------------------------------------------
/structdoc_test.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | import (
4 | "encoding/xml"
5 | "testing"
6 | )
7 |
8 | const decoded_doc_1 = `testtest font sizetest colorNew style 1New style 2test font size and colorgoogle`
9 | const decoded_doc_2 = `Table of Contents TOC \h \z \t "Heading 1,2,S6,1,S0,1,S1,1,S2,1,S3,1,S4,1,S5,1" Holy Grail [xref:bRJduW6hNR] PAGEREF _Toc420414504 \h 21.What is your name? [xref:TH7u7QDqhD] PAGEREF _Toc420414505 \h 22.What is your quest? [xref:bC62HkFATC] PAGEREF _Toc420414506 \h 23.What is your favourite colour? [xref:I3TphuHX6N] PAGEREF _Toc420414507 \h 2Holy Grail [ FORMTEXT xref:bRJduW6hNR]What is your name? [ FORMTEXT xref:TH7u7QDqhD]My name is Sir Launcelot of Camelot.What is your quest? [ FORMTEXT xref:bC62HkFATC]To seek the Holy Grail[or a grail shaped beacon]. What is your favourite colour? [ FORMTEXT xref:I3TphuHX6N]Blue.How many paragraphs here then?`
10 | const NUM_PARAGRAPHS = 5
11 |
12 | func TestStructure(t *testing.T) {
13 | doc := Document{
14 | XMLW: XMLNS_W,
15 | XMLR: XMLNS_R,
16 | XMLName: xml.Name{Space: XMLNS_W, Local: "document"}}
17 | testCases := []struct {
18 | content string
19 | numParagraphs int
20 | }{
21 | {decoded_doc_1, 5},
22 | {decoded_doc_2, 19},
23 | }
24 | for _, tc := range testCases {
25 | err := xml.Unmarshal([]byte(tc.content), &doc)
26 | if err != nil {
27 | t.Errorf("We expected to be able to decode %s but we didn't",
28 | tc.content)
29 | }
30 | if len(doc.Body.Paragraphs) != tc.numParagraphs {
31 | t.Errorf("We expected %d paragraphs, we got %d",
32 | NUM_PARAGRAPHS, len(doc.Body.Paragraphs))
33 | }
34 | for _, p := range doc.Body.Paragraphs {
35 | if len(p.Children()) == 0 {
36 | t.Errorf("We were not able to parse paragraph %v",
37 | p)
38 | }
39 | for _, child := range p.Children() {
40 | if child.Link == nil && child.Properties == nil && child.Run == nil {
41 | t.Errorf("There are Paragraph children with all fields nil")
42 | }
43 | if child.Run != nil && child.Run.Text == nil && child.Run.InstrText == "" {
44 | t.Errorf("We have a run with no text")
45 | }
46 | if child.Link != nil && child.Link.ID == "" {
47 | t.Errorf("We have a link without ID")
48 | }
49 | }
50 | }
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/structnodes.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | import (
4 | "encoding/xml"
5 | "io"
6 |
7 | "github.com/golang/glog"
8 | )
9 |
// ParagraphChild is one child of a paragraph. The paragraph decoder and the
// Add* helpers set exactly one of the fields; the others stay nil.
type ParagraphChild struct {
	Link       *Hyperlink     `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main hyperlink,omitempty"` // set for hyperlink children
	Run        *Run           `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main r,omitempty"`         // set for run children
	Properties *RunProperties `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr,omitempty"`       // set for rPr children
}
15 |
// Paragraph maps a p element: an ordered list of runs, hyperlinks and
// properties.
type Paragraph struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"`
	Data    []ParagraphChild // children in document order

	file *DocxLib // owning document; set by AddParagraph, used by AddLink to register relationships
}
22 |
23 | func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
24 | children := make([]ParagraphChild, 0)
25 | for {
26 | t, err := d.Token()
27 | if err == io.EOF {
28 | break
29 | }
30 | switch tt := t.(type) {
31 | case xml.StartElement:
32 | var elem ParagraphChild
33 | if tt.Name.Local == "hyperlink" {
34 | var value Hyperlink
35 | d.DecodeElement(&value, &start)
36 | id := getAtt(tt.Attr, "id")
37 | anchor := getAtt(tt.Attr, "anchor")
38 | if id != "" {
39 | value.ID = id
40 | }
41 | if anchor != "" {
42 | value.ID = anchor
43 | }
44 | elem = ParagraphChild{Link: &value}
45 | } else if tt.Name.Local == "r" {
46 | var value Run
47 | d.DecodeElement(&value, &start)
48 | elem = ParagraphChild{Run: &value}
49 | if value.InstrText == "" && value.Text == nil {
50 | glog.V(0).Infof("Empty run, we ignore")
51 | continue
52 | }
53 | } else if tt.Name.Local == "rPr" {
54 | var value RunProperties
55 | d.DecodeElement(&value, &start)
56 | elem = ParagraphChild{Properties: &value}
57 | } else {
58 | continue
59 | }
60 | children = append(children, elem)
61 | }
62 |
63 | }
64 | *p = Paragraph{Data: children}
65 | return nil
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/structrel.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | import "encoding/xml"
4 |
// Constants used when building the relationships part of the package.
const (
	// XMLNS is the namespace set on the Relationships element.
	XMLNS = `http://schemas.openxmlformats.org/package/2006/relationships`
	// REL_HYPERLINK is the relationship type used for hyperlinks.
	REL_HYPERLINK = `http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink`

	// REL_TARGETMODE is the target mode set on hyperlink relationships.
	REL_TARGETMODE = "External"
)
11 |
// Relationships maps the Relationships element that is marshalled to
// word/_rels/document.xml.rels.
type Relationships struct {
	XMLName       xml.Name        `xml:"Relationships"`
	Xmlns         string          `xml:"xmlns,attr"`   // namespace attribute (XMLNS for new documents)
	Relationships []*Relationship `xml:"Relationship"` // one entry per Relationship element
}
17 |
// Relationship maps a single Relationship element: a link from the document
// to another part or to an external target.
type Relationship struct {
	XMLName    xml.Name `xml:"Relationship"`
	ID         string   `xml:"Id,attr"`                   // identifier, e.g. "rId4"; looked up by References
	Type       string   `xml:"Type,attr"`                 // relationship type URI
	Target     string   `xml:"Target,attr"`               // part path or, for hyperlinks, the URL
	TargetMode string   `xml:"TargetMode,attr,omitempty"` // REL_TARGETMODE for hyperlinks, empty otherwise
}
25 |
--------------------------------------------------------------------------------
/structrun.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | import (
4 | "encoding/xml"
5 | "io"
6 | )
7 |
const (
	// HYPERLINK_STYLE is the run style value AddLink assigns to hyperlink
	// runs (presumably a style ID defined in the styles template; confirm).
	HYPERLINK_STYLE = "a1"
)
11 |
// A Run is part of a paragraph that has its own style. It could be
// a piece of text in bold, or a link
type Run struct {
	XMLName       xml.Name       `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main r,omitempty"`
	RunProperties *RunProperties `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr,omitempty"`       // visual properties; may be nil
	InstrText     string         `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main instrText,omitempty"` // content of the instrText element; AddLink stores the link text here
	Text          *Text          // content of the t element; nil when the run has none
}
20 |
// The Text object contains the actual text of a run (the t element).
type Text struct {
	XMLName  xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main t"`
	XMLSpace string   `xml:"xml:space,attr,omitempty"` // value of the xml:space attribute, omitted when empty
	Text     string   `xml:",chardata"`                // character data of the element
}
27 |
// The hyperlink element contains links. ID refers to the relationship that
// holds the link target; use DocxLib.References to resolve it.
type Hyperlink struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main hyperlink,omitempty"`
	ID      string   `xml:"http://schemas.openxmlformats.org/officeDocument/2006/relationships id,attr"` // relationship ID (the paragraph decoder stores the anchor here when one is present)
	Run     Run      // run carrying the link text and style
}
34 |
// RunProperties encapsulates visual properties of a run (the rPr element).
// Every field is optional and omitted from the output when nil.
type RunProperties struct {
	XMLName  xml.Name  `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr,omitempty"`
	Color    *Color    `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main color,omitempty"`  // text color; set by Run.Color
	Size     *Size     `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main sz,omitempty"`     // font size; set by Run.Size
	RunStyle *RunStyle `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rStyle,omitempty"` // run style reference; set by AddLink
	Style    *Style    `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main pStyle,omitempty"` // paragraph style reference
}
43 |
// RunStyle contains styling for a run (the rStyle element).
type RunStyle struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rStyle,omitempty"`
	Val     string   `xml:"w:val,attr"` // style value, e.g. HYPERLINK_STYLE
}
49 |
// Style contains styling for a paragraph. It maps to the pStyle element.
type Style struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main pStyle,omitempty"`
	// Val is the style value, marshalled as the w:val attribute.
	Val string `xml:"w:val,attr"`
}
55 |
// Color contains the color of a run. It maps to the color element.
type Color struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main color"`
	// Val is the color value, marshalled as the w:val attribute.
	Val string `xml:"w:val,attr"`
}
62 |
// Size contains the font size. It maps to the sz element.
type Size struct {
	XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main sz"`
	// Val is the size as an integer, marshalled as the w:val attribute.
	// NOTE(review): WordprocessingML expresses sz in half-points — confirm
	// that callers pass values in that unit.
	Val int `xml:"w:val,attr"`
}
68 |
69 | func (r *Run) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
70 | var elem Run
71 | for {
72 | t, err := d.Token()
73 | if err == io.EOF {
74 | break
75 | }
76 |
77 | switch tt := t.(type) {
78 | case xml.StartElement:
79 | if tt.Name.Local == "rPr" {
80 | var value RunProperties
81 | d.DecodeElement(&value, &start)
82 | elem.RunProperties = &value
83 | } else if tt.Name.Local == "instrText" {
84 | var value string
85 | d.DecodeElement(&value, &start)
86 | elem.InstrText = value
87 | } else if tt.Name.Local == "t" {
88 | var value Text
89 | d.DecodeElement(&value, &start)
90 | elem.Text = &value
91 | } else {
92 | continue
93 | }
94 | }
95 |
96 | }
97 | *r = elem
98 |
99 | return nil
100 |
101 | }
102 | func (r *Text) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
103 | var elem Text
104 | for {
105 | t, err := d.Token()
106 | if err == io.EOF {
107 | break
108 | }
109 |
110 | switch tt := t.(type) {
111 | case xml.CharData:
112 | cd := tt.Copy()
113 | elem.Text = string(cd)
114 | }
115 |
116 | }
117 |
118 | *r = elem
119 | return nil
120 | }
121 | func (r *Hyperlink) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
122 | var elem Hyperlink
123 | for {
124 | t, err := d.Token()
125 | if err == io.EOF {
126 | break
127 | }
128 |
129 | switch tt := t.(type) {
130 | case xml.StartElement:
131 | if tt.Name.Local == "r" {
132 | d.DecodeElement(&elem.Run, &start)
133 | } else {
134 | continue
135 | }
136 | }
137 |
138 | }
139 | *r = elem
140 | return nil
141 |
142 | }
143 | func (r *RunStyle) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
144 | var elem RunStyle
145 | for {
146 | t, err := d.Token()
147 | if err == io.EOF {
148 | break
149 | }
150 |
151 | switch tt := t.(type) {
152 | case xml.StartElement:
153 | elem.Val = getAtt(tt.Attr, "val")
154 | }
155 |
156 | }
157 | *r = elem
158 | return nil
159 |
160 | }
161 |
// getAtt looks an attribute up by its local name, ignoring the namespace, and
// returns the value of the first match. It returns "" when nothing matches.
func getAtt(atts []xml.Attr, name string) string {
	for i := range atts {
		if atts[i].Name.Local != name {
			continue
		}
		return atts[i].Value
	}
	return ""
}
170 |
--------------------------------------------------------------------------------
/unpack.go:
--------------------------------------------------------------------------------
1 | package docxlib
2 |
3 | // This contains internal functions needed to unpack (read) a zip file
import (
	"archive/zip"
	"encoding/xml"
	"errors"
	"io/ioutil"

	"github.com/golang/glog"
)
11 |
12 | // This receives a zip file (word documents are a zip with multiple xml inside)
13 | // and parses the files that are relevant for us:
14 | // 1.-Document
15 | // 2.-Relationships
16 | func unpack(zipReader *zip.Reader) (docx *DocxLib, err error) {
17 | var doc *Document
18 | var relations *Relationships
19 | for _, f := range zipReader.File {
20 | if f.Name == "word/_rels/document.xml.rels" {
21 | relations, err = processRelations(f)
22 | if err != nil {
23 | return nil, err
24 | }
25 | }
26 | if f.Name == "word/document.xml" {
27 | doc, err = processDoc(f)
28 | if err != nil {
29 | return nil, err
30 | }
31 | }
32 | }
33 | docx = &DocxLib{
34 | Document: *doc,
35 | DocRelation: *relations,
36 | }
37 | return docx, nil
38 | }
39 |
40 | // Processes one of the relevant files, the one with the actual document
41 | func processDoc(file *zip.File) (*Document, error) {
42 | filebytes, err := readZipFile(file)
43 | if err != nil {
44 | glog.Errorln("Error reading from internal zip file")
45 | return nil, err
46 | }
47 | glog.V(0).Infoln("Doc:", string(filebytes))
48 |
49 | doc := Document{
50 | XMLW: XMLNS_W,
51 | XMLR: XMLNS_R,
52 | XMLName: xml.Name{Space: XMLNS_W, Local: "document"}}
53 | err = xml.Unmarshal(filebytes, &doc)
54 | if err != nil {
55 | glog.Errorln("Error unmarshalling doc", string(filebytes))
56 | return nil, err
57 | }
58 | glog.V(0).Infoln("Paragraph", doc.Body.Paragraphs)
59 | return &doc, nil
60 | }
61 |
62 | // Processes one of the relevant files, the one with the relationships
63 | func processRelations(file *zip.File) (*Relationships, error) {
64 | filebytes, err := readZipFile(file)
65 | if err != nil {
66 | glog.Errorln("Error reading from internal zip file")
67 | return nil, err
68 | }
69 | glog.V(0).Infoln("Relations:", string(filebytes))
70 |
71 | rels := Relationships{Xmlns: XMLNS_R}
72 | err = xml.Unmarshal(filebytes, &rels)
73 | if err != nil {
74 | glog.Errorln("Error unmarshalling relationships")
75 | return nil, err
76 | }
77 | return &rels, nil
78 | }
79 |
// readZipFile opens a zip entry and returns its whole content as a byte
// slice. The entry is closed again before returning.
func readZipFile(zf *zip.File) ([]byte, error) {
	entry, openErr := zf.Open()
	if openErr != nil {
		return nil, openErr
	}
	defer entry.Close()

	content, readErr := ioutil.ReadAll(entry)
	return content, readErr
}
89 |
--------------------------------------------------------------------------------