├── .github └── workflows │ ├── docd.yml │ └── go.yml ├── .gitignore ├── LICENSE ├── README.md ├── client ├── client.go └── cmd │ └── docconv-client │ └── main.go ├── doc.go ├── doc_test.go ├── docconv.go ├── docconv_test.go ├── docd ├── Dockerfile ├── appengine │ ├── Dockerfile │ ├── README.md │ └── app.yaml ├── convert.go ├── internal │ ├── cloudtrace │ │ ├── context.go │ │ ├── header.go │ │ ├── http_handler.go │ │ └── logger.go │ ├── debug │ │ ├── context.go │ │ ├── http_handler.go │ │ └── logger.go │ ├── error_reporter.go │ └── recovery.go └── main.go ├── docx.go ├── docx_test ├── docx_test.go └── testdata │ ├── decompression_size_limit.docx │ ├── sample.docx │ ├── sample_2.docx │ └── sample_3.docx ├── go.mod ├── go.sum ├── html.go ├── html_appengine.go ├── html_test ├── html_test.go └── testdata │ └── test.html ├── iWork ├── TSPArchiveMessages.pb.go ├── TSPDatabaseMessages.pb.go ├── TSPMessages.pb.go ├── generate.sh └── pb-schema │ ├── KNArchives.proto │ ├── KNCommandArchives.proto │ ├── README.md │ ├── TNArchives.proto │ ├── TNCommandArchives.proto │ ├── TPArchives.proto │ ├── TPCommandArchives.proto │ ├── TSAArchives.proto │ ├── TSCEArchives.proto │ ├── TSCH3DArchives.proto │ ├── TSCHArchives.Common.proto │ ├── TSCHArchives.GEN.proto │ ├── TSCHArchives.proto │ ├── TSCHCommandArchives.proto │ ├── TSCHPreUFFArchives.proto │ ├── TSDArchives.proto │ ├── TSDCommandArchives.proto │ ├── TSKArchives.proto │ ├── TSPArchiveMessages.proto │ ├── TSPDatabaseMessages.proto │ ├── TSPMessages.proto │ ├── TSSArchives.proto │ ├── TSTArchives.proto │ ├── TSTCommandArchives.proto │ ├── TSTStylePropertyArchiving.proto │ ├── TSWPArchives.proto │ └── TSWPCommandArchives.proto ├── image.go ├── image_ocr.go ├── image_ocr_test.go ├── limit.go ├── local.go ├── local_test.go ├── odt.go ├── pages.go ├── pdf.go ├── pdf_ocr.go ├── pdf_ocr_test.go ├── pdf_text.go ├── pptx.go ├── pptx_test ├── pptx_test.go └── testdata │ ├── decompression_size_limit.pptx │ └── sample.pptx ├── rtf.go ├── 
rtf_test ├── rtf_test.go └── testdata │ └── test.rtf ├── snappy ├── LICENSE ├── README ├── decode.go ├── encode.go ├── snappy.go └── snappy_test.go ├── testdata ├── 001-helloworld.png └── 001-test.doc ├── tidy.go ├── url.go └── xml.go /.github/workflows/docd.yml: -------------------------------------------------------------------------------- 1 | name: Release docd 2 | 3 | on: 4 | workflow_dispatch: 5 | release: 6 | types: [published] 7 | 8 | jobs: 9 | docker: 10 | name: Publish Docker image 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v4 15 | - name: Set up QEMU 16 | uses: docker/setup-qemu-action@v3 17 | - name: Set up Docker Buildx 18 | uses: docker/setup-buildx-action@v3 19 | - name: Login to Docker Hub 20 | uses: docker/login-action@v3 21 | with: 22 | username: ${{ secrets.DOCKERHUB_USERNAME }} 23 | password: ${{ secrets.DOCKERHUB_PASSWORD }} 24 | - id: meta 25 | uses: docker/metadata-action@v5 26 | with: 27 | images: sajari/docd 28 | labels: | 29 | org.opencontainers.image.description=A tool which exposes code.sajari.com/docconv/v2 as a service 30 | org.opencontainers.image.title=docd 31 | tags: | 32 | type=semver,pattern={{version}} 33 | type=semver,pattern={{major}}.{{minor}} 34 | type=semver,pattern={{major}} 35 | type=sha,format=long 36 | - name: Build and push 37 | uses: docker/build-push-action@v5 38 | with: 39 | context: . 
40 | file: docd/Dockerfile 41 | platforms: linux/amd64,linux/arm64 42 | labels: ${{ steps.meta.outputs.labels }} 43 | tags: ${{ steps.meta.outputs.tags }} 44 | push: true 45 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | go: ["1.21"] 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: Install dependencies 19 | run: sudo apt install wv unrtf tidy 20 | 21 | - name: Set up Go ${{ matrix.go }} 22 | uses: actions/setup-go@v2 23 | with: 24 | go-version: ${{ matrix.go }} 25 | 26 | - name: Build ${{ matrix.go }} 27 | run: go build -v ./... 28 | 29 | - name: Test ${{ matrix.go }} 30 | run: go test -v -race ./... 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | sajari-convert 3 | *tests/ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Sajari Pty Ltd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions 
of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # docconv 2 | 3 | [![Go reference](https://pkg.go.dev/badge/code.sajari.com/docconv/v2.svg)](https://pkg.go.dev/code.sajari.com/docconv/v2) 4 | [![Build status](https://github.com/sajari/docconv/workflows/Go/badge.svg?branch=master)](https://github.com/sajari/docconv/actions) 5 | [![Report card](https://goreportcard.com/badge/code.sajari.com/docconv/v2)](https://goreportcard.com/report/code.sajari.com/docconv/v2) 6 | [![Sourcegraph](https://sourcegraph.com/github.com/sajari/docconv/v2/-/badge.svg)](https://sourcegraph.com/github.com/sajari/docconv/v2) 7 | 8 | A Go wrapper library to convert PDF, DOC, DOCX, XML, HTML, RTF, ODT, Pages documents and images (see optional dependencies below) to plain text. 9 | 10 | ## Installation 11 | 12 | If you haven't setup Go before, you first need to [install Go](https://golang.org/doc/install). 13 | 14 | To fetch and build the code: 15 | 16 | ```console 17 | $ go install code.sajari.com/docconv/v2/docd@latest 18 | ``` 19 | 20 | See `go help install` for details on the installation location of the installed `docd` executable. Make sure that the full path to the executable is in your `PATH` environment variable. 
21 | 22 | ## Dependencies 23 | 24 | - tidy 25 | - wv 26 | - poppler-utils 27 | - unrtf 28 | - https://github.com/JalfResi/justext 29 | 30 | ### Debian-based Linux 31 | 32 | ```console 33 | $ sudo apt-get install poppler-utils wv unrtf tidy 34 | $ go get github.com/JalfResi/justext 35 | ``` 36 | 37 | ### macOS 38 | 39 | ```console 40 | $ brew install poppler-qt5 wv unrtf tidy-html5 41 | $ go get github.com/JalfResi/justext 42 | ``` 43 | 44 | ### Optional dependencies 45 | 46 | To add image support to the `docconv` library you first need to [install and build gosseract](https://github.com/otiai10/gosseract/tree/v2.2.4). 47 | 48 | Now you can add `-tags ocr` to any `go` command when building/fetching/testing `docconv` to include support for processing images: 49 | 50 | ```console 51 | $ go get -tags ocr code.sajari.com/docconv/v2/... 52 | ``` 53 | 54 | This may complain on macOS, which you can fix by installing [tesseract](https://tesseract-ocr.github.io) via brew: 55 | 56 | ```console 57 | $ brew install tesseract 58 | ``` 59 | 60 | ## docd tool 61 | 62 | The `docd` tool runs as either: 63 | 64 | 1. a service on port 8888 (by default) 65 | 66 | Documents can be sent as a multipart POST request and the plain text (body) and meta information are then returned as a JSON object. 67 | 68 | 2. a service exposed from within a Docker container 69 | 70 | This also runs as a service, but from within a Docker container. 71 | Official images are published at https://hub.docker.com/r/sajari/docd. 72 | 73 | Optionally you can build it yourself: 74 | 75 | ```console 76 | $ # Run this from the repository root 77 | $ docker build -t docd -f docd/Dockerfile . 78 | ``` 79 | 80 | 3. via the command line. 81 | 82 | Documents can be sent as an argument, e.g.
83 | 84 | ```console 85 | $ docd -input document.pdf 86 | ``` 87 | 88 | ### Optional flags 89 | 90 | - `addr` - the bind address for the HTTP server, default is ":8888" 91 | - `readability-length-low` - sets the readability length low if the ?readability=1 parameter is set 92 | - `readability-length-high` - sets the readability length high if the ?readability=1 parameter is set 93 | - `readability-stopwords-low` - sets the readability stopwords low if the ?readability=1 parameter is set 94 | - `readability-stopwords-high` - sets the readability stopwords high if the ?readability=1 parameter is set 95 | - `readability-max-link-density` - sets the readability max link density if the ?readability=1 parameter is set 96 | - `readability-max-heading-distance` - sets the readability max heading distance if the ?readability=1 parameter is set 97 | - `readability-use-classes` - comma separated list of readability classes to use if the ?readability=1 parameter is set 98 | 99 | ### How to start the service 100 | 101 | ```console 102 | $ # This runs on port 8000 103 | $ docd -addr :8000 104 | ``` 105 | 106 | ## Example usage (code) 107 | 108 | Some basic code is shown below, but normally you would accept the file by HTTP or open it from the file system. 109 | 110 | This should be enough to get you started though. 111 | 112 | ### Use case 1: run locally 113 | 114 | > Note: this assumes you have the [dependencies](#dependencies) installed. 
115 | 116 | ```go 117 | package main 118 | 119 | import ( 120 | "fmt" 121 | 122 | "code.sajari.com/docconv/v2" 123 | ) 124 | 125 | func main() { 126 | res, err := docconv.ConvertPath("your-file.pdf") 127 | if err != nil { 128 | // TODO: handle 129 | } 130 | fmt.Println(res) 131 | } 132 | ``` 133 | 134 | ### Use case 2: request over the network 135 | 136 | ```go 137 | package main 138 | 139 | import ( 140 | "fmt" 141 | 142 | "code.sajari.com/docconv/v2/client" 143 | ) 144 | 145 | func main() { 146 | // Create a new client, using the default endpoint (localhost:8888) 147 | c := client.New() 148 | 149 | res, err := client.ConvertPath(c, "your-file.pdf") 150 | if err != nil { 151 | // TODO: handle 152 | } 153 | fmt.Println(res) 154 | } 155 | ``` 156 | 157 | Alternatively, via a `curl`: 158 | 159 | ```console 160 | $ curl -s -F input=@your-file.pdf http://localhost:8888/convert 161 | ``` 162 | -------------------------------------------------------------------------------- /client/client.go: -------------------------------------------------------------------------------- 1 | // Package client defines types and functions for interacting with 2 | // docconv HTTP servers. 3 | package client 4 | 5 | import ( 6 | "bytes" 7 | "encoding/json" 8 | "fmt" 9 | "io" 10 | "mime/multipart" 11 | "net/http" 12 | "os" 13 | ) 14 | 15 | // DefaultProtocol is the default protocol used to construct paths 16 | // when making docconv requests. 17 | const DefaultProtocol = "http://" 18 | 19 | // DefaultEndpoint is the default endpoint address (host:port) for 20 | // docconv HTTP servers. 21 | const DefaultEndpoint = "localhost:8888" 22 | 23 | // DefaultHTTPClient is the default HTTP client used to make requests 24 | // to docconv HTTP servers. 25 | var DefaultHTTPClient = http.DefaultClient 26 | 27 | // Opt is an option used in New to create Clients. 28 | type Opt func(*Client) 29 | 30 | // WithEndpoint set the endpoint on a Client. 
31 | func WithEndpoint(endpoint string) Opt { 32 | return func(c *Client) { 33 | c.endpoint = endpoint 34 | } 35 | } 36 | 37 | // WithHTTPClient sets the *http.Client used for all underlying 38 | // calls. 39 | func WithHTTPClient(client *http.Client) Opt { 40 | return func(c *Client) { 41 | c.httpClient = client 42 | } 43 | } 44 | 45 | // WithProtocol sets the protocol used in HTTP requests. Currently this 46 | // must be either http:// or https://. 47 | func WithProtocol(protocol string) Opt { 48 | return func(c *Client) { 49 | c.protocol = protocol 50 | } 51 | } 52 | 53 | // New creates a new docconv client for interacting with a docconv HTTP 54 | // server. 55 | func New(opts ...Opt) *Client { 56 | c := &Client{ 57 | endpoint: DefaultEndpoint, 58 | protocol: DefaultProtocol, 59 | httpClient: DefaultHTTPClient, 60 | } 61 | 62 | for _, opt := range opts { 63 | opt(c) 64 | } 65 | return c 66 | } 67 | 68 | // Client is a docconv HTTP client. Use New to make new Clients. 69 | type Client struct { 70 | endpoint string 71 | protocol string 72 | httpClient *http.Client 73 | } 74 | 75 | // Response is from docconv.Response copied here to avoid dependency on 76 | // the docconv package. 
77 | type Response struct { 78 | Body string `json:"body"` 79 | Meta map[string]string `json:"meta"` 80 | MSecs uint32 `json:"msecs"` 81 | Error string `json:"error"` 82 | } 83 | 84 | // Convert a file from a local path using the http client 85 | func (c *Client) Convert(r io.Reader, filename string) (*Response, error) { 86 | buf := &bytes.Buffer{} 87 | w := multipart.NewWriter(buf) 88 | part, err := w.CreateFormFile("input", filename) 89 | if err != nil { 90 | return nil, err 91 | } 92 | if n, err := io.Copy(part, r); err != nil { 93 | return nil, fmt.Errorf("could not copy file data into request (failed after %d bytes): %w", n, err) 94 | } 95 | if err := w.Close(); err != nil { 96 | return nil, err 97 | } 98 | 99 | req, err := http.NewRequest("POST", fmt.Sprintf("%v%v/convert", c.protocol, c.endpoint), buf) 100 | if err != nil { 101 | return nil, err 102 | } 103 | req.Header.Set("Content-Type", w.FormDataContentType()) 104 | 105 | resp, err := c.httpClient.Do(req) 106 | if err != nil { 107 | return nil, err 108 | } 109 | defer resp.Body.Close() 110 | 111 | res := &Response{} 112 | if resp.StatusCode != http.StatusOK { 113 | err := json.NewDecoder(resp.Body).Decode(&res) 114 | if err != nil { 115 | // Invalid JSON can come from proxies etc, so try 116 | // to give something meaningful. 117 | return nil, fmt.Errorf("non-OK status from convert server: %d (%v)", resp.StatusCode, http.StatusText(resp.StatusCode)) 118 | } 119 | return nil, fmt.Errorf("non-OK status from convert server: %d (%v) with error: %v", resp.StatusCode, http.StatusText(resp.StatusCode), res.Error) 120 | } 121 | 122 | if err := json.NewDecoder(resp.Body).Decode(&res); err != nil { 123 | return nil, err 124 | } 125 | return res, nil 126 | } 127 | 128 | // ConvertPath uses the docconv Client to convert the local file 129 | // found at path. 
130 | func ConvertPath(c *Client, path string) (*Response, error) { 131 | f, err := os.Open(path) 132 | if err != nil { 133 | return nil, err 134 | } 135 | defer f.Close() 136 | 137 | return c.Convert(f, f.Name()) 138 | } 139 | -------------------------------------------------------------------------------- /client/cmd/docconv-client/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "fmt" 7 | "os" 8 | 9 | "code.sajari.com/docconv/v2/client" 10 | ) 11 | 12 | var ( 13 | path = flag.String("path", "", "`path` to file to convert") 14 | endpoint = flag.String("endpoint", "", "docconv HTTP server to use for conversion") 15 | ) 16 | 17 | func main() { 18 | flag.Parse() 19 | 20 | if *path == "" { 21 | fmt.Println("must set -path") 22 | os.Exit(1) 23 | } 24 | 25 | var opts []client.Opt 26 | if *endpoint != "" { 27 | opts = append(opts, client.WithEndpoint(*endpoint)) 28 | } 29 | 30 | c := client.New(opts...) 31 | resp, err := client.ConvertPath(c, *path) 32 | if err != nil { 33 | fmt.Printf("Could not convert: %v", err) 34 | os.Exit(1) 35 | } 36 | 37 | enc := json.NewEncoder(os.Stdout) 38 | enc.SetIndent("", " ") 39 | if err := enc.Encode(resp); err != nil { 40 | fmt.Printf("Could not encode response: %v", err) 41 | os.Exit(1) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "os" 8 | "os/exec" 9 | "time" 10 | 11 | "github.com/richardlehane/mscfb" 12 | "github.com/richardlehane/msoleps" 13 | ) 14 | 15 | // ConvertDoc converts an MS Word .doc to text. 
16 | func ConvertDoc(r io.Reader) (string, map[string]string, error) { 17 | f, err := NewLocalFile(r) 18 | if err != nil { 19 | return "", nil, fmt.Errorf("error creating local file: %v", err) 20 | } 21 | defer f.Done() 22 | 23 | // Meta data 24 | mc := make(chan map[string]string, 1) 25 | go func() { 26 | defer func() { 27 | if e := recover(); e != nil { 28 | // TODO: Propagate error. 29 | } 30 | }() 31 | 32 | meta := make(map[string]string) 33 | 34 | doc, err := mscfb.New(f) 35 | if err != nil { 36 | // TODO: Propagate error. 37 | mc <- meta 38 | return 39 | } 40 | 41 | props := msoleps.New() 42 | for entry, err := doc.Next(); err == nil; entry, err = doc.Next() { 43 | if msoleps.IsMSOLEPS(entry.Initial) { 44 | if err := props.Reset(doc); err != nil { 45 | // TODO: Propagate error. 46 | break 47 | } 48 | 49 | for _, prop := range props.Property { 50 | meta[prop.Name] = prop.String() 51 | } 52 | } 53 | } 54 | 55 | const defaultTimeFormat = "2006-01-02 15:04:05.999999999 -0700 MST" 56 | 57 | // Convert parsed meta 58 | if tmp, ok := meta["LastSaveTime"]; ok { 59 | if t, err := time.Parse(defaultTimeFormat, tmp); err == nil { 60 | meta["ModifiedDate"] = fmt.Sprintf("%d", t.Unix()) 61 | } 62 | } 63 | if tmp, ok := meta["CreateTime"]; ok { 64 | if t, err := time.Parse(defaultTimeFormat, tmp); err == nil { 65 | meta["CreatedDate"] = fmt.Sprintf("%d", t.Unix()) 66 | } 67 | } 68 | 69 | mc <- meta 70 | }() 71 | 72 | // Document body 73 | bc := make(chan string, 1) 74 | go func() { 75 | // Save output to a file 76 | var buf bytes.Buffer 77 | outputFile, err := os.CreateTemp("/tmp", "sajari-convert-") 78 | if err != nil { 79 | bc <- buf.String() 80 | return 81 | } 82 | defer os.Remove(outputFile.Name()) 83 | 84 | err = exec.Command("wvText", f.Name(), outputFile.Name()).Run() 85 | if err != nil { 86 | // TODO: Propagate error. 87 | } 88 | 89 | _, err = buf.ReadFrom(outputFile) 90 | if err != nil { 91 | // TODO: Propagate error. 
92 | } 93 | 94 | bc <- buf.String() 95 | }() 96 | 97 | // TODO: Should errors in either of the above Goroutines stop things from progressing? 98 | body := <-bc 99 | meta := <-mc 100 | 101 | // TODO: Check for errors instead of len(body) == 0? 102 | if len(body) == 0 { 103 | f.Seek(0, 0) 104 | return ConvertDocx(f) 105 | } 106 | return body, meta, nil 107 | } 108 | -------------------------------------------------------------------------------- /doc_test.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "os" 5 | "os/exec" 6 | "path" 7 | "strings" 8 | "testing" 9 | "time" 10 | 11 | "github.com/google/go-cmp/cmp" 12 | ) 13 | 14 | func TestConvertDoc(t *testing.T) { 15 | if _, err := exec.LookPath("wvText"); err != nil { 16 | t.Skip("wvText not installed") 17 | return 18 | } 19 | 20 | tests := []struct { 21 | file string 22 | wantTrimmedText string 23 | wantMeta map[string]string 24 | wantErr bool 25 | }{ 26 | { 27 | file: "001-test.doc", 28 | wantTrimmedText: "test", 29 | wantMeta: map[string]string{ 30 | "AppName": "Microsoft Office Word", 31 | "CharCount": "4", 32 | "Character count": "4", 33 | "CodePage": "1252", 34 | "Company": "", 35 | "CreateTime": "2023-09-13 01:54:00 +0000 UTC", 36 | "CreatedDate": "1694570040", 37 | "Dirty links": "false", 38 | "DocSecurity": "0", 39 | "Document parts": "0", 40 | "EditTime": "1970-01-01 00:00:00 +0000 UTC", 41 | "Heading pair": "0", 42 | "Hyperlinks changed": "false", 43 | "LastAuthor": "cloudconvert_7", 44 | "LastSaveTime": "2023-09-13 01:54:00 +0000 UTC", 45 | "Line count": "1", 46 | "ModifiedDate": "1694570040", 47 | "PageCount": "1", 48 | "Paragraph count": "1", 49 | "RevNumber": "1", 50 | "Scale": "false", 51 | "Shared document": "false", 52 | "Template": "Normal", 53 | "Version": "1048576", 54 | "WordCount": "0", 55 | }, 56 | }, 57 | } 58 | for _, tt := range tests { 59 | t.Run(tt.file, func(t *testing.T) { 60 | f, err := 
os.Open(path.Join("testdata", tt.file)) 61 | if err != nil { 62 | t.Fatal(err) 63 | } 64 | defer f.Close() 65 | 66 | gotText, gotMeta, err := ConvertDoc(f) 67 | if (err != nil) != tt.wantErr { 68 | t.Errorf("ConvertDoc() error = %v, wantErr %v", err, tt.wantErr) 69 | return 70 | } 71 | gotText = strings.TrimSpace(gotText) 72 | if gotText != tt.wantTrimmedText { 73 | t.Errorf("ConvertDoc() text = %v, want %v", gotText, tt.wantTrimmedText) 74 | } 75 | if !cmp.Equal(tt.wantMeta, gotMeta, maybeTimeComparer) { 76 | t.Errorf("ConvertDoc() meta mismatch (-want +got):\n%v", cmp.Diff(tt.wantMeta, gotMeta, maybeTimeComparer)) 77 | } 78 | }) 79 | } 80 | } 81 | 82 | // Compares strings as time.Times if they look like times. Required because 83 | // wvText returns different time formats depending on system clock. 84 | var maybeTimeComparer = cmp.Comparer(func(x, y string) bool { 85 | xt, xterr := time.Parse("2006-01-02 15:04:05 -0700 MST", x) 86 | yt, yterr := time.Parse("2006-01-02 15:04:05 -0700 MST", y) 87 | if xterr == nil && yterr == nil { 88 | return xt.Equal(yt) 89 | } 90 | return x == y 91 | }) 92 | -------------------------------------------------------------------------------- /docconv.go: -------------------------------------------------------------------------------- 1 | package docconv // import "code.sajari.com/docconv/v2" 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io" 7 | "os" 8 | "path" 9 | "strings" 10 | "time" 11 | ) 12 | 13 | // Response payload sent back to the requestor 14 | type Response struct { 15 | Body string `json:"body"` 16 | Meta map[string]string `json:"meta"` 17 | MSecs uint32 `json:"msecs"` 18 | Error string `json:"error"` 19 | } 20 | 21 | // MimeTypeByExtension returns a mimetype for the given extension, or 22 | // application/octet-stream if none can be determined. 
func MimeTypeByExtension(filename string) string {
	// The extension is taken from the last slash-separated element of
	// filename and compared case-insensitively.
	ext := strings.ToLower(path.Ext(filename))
	if mimeType, known := extensionMimeTypes[ext]; known {
		return mimeType
	}
	return "application/octet-stream"
}

// extensionMimeTypes maps a lower-cased file extension (including the leading
// dot) to the mimetype reported by MimeTypeByExtension.
var extensionMimeTypes = map[string]string{
	".doc":       "application/msword",
	".docx":      "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	".odt":       "application/vnd.oasis.opendocument.text",
	".pages":     "application/vnd.apple.pages",
	".pdf":       "application/pdf",
	".pptx":      "application/vnd.openxmlformats-officedocument.presentationml.presentation",
	".rtf":       "application/rtf",
	".xml":       "text/xml",
	".xhtml":     "text/html",
	".html":      "text/html",
	".htm":       "text/html",
	".jpg":       "image/jpeg",
	".jpeg":      "image/jpeg",
	".jpe":       "image/jpeg",
	".jfif":      "image/jpeg",
	".jfif-tbnl": "image/jpeg",
	".png":       "image/png",
	".tif":       "image/tif",
	".tiff":      "image/tiff",
	".txt":       "text/plain",
}

// Convert a file to plain text.
58 | func Convert(r io.Reader, mimeType string, readability bool) (*Response, error) { 59 | start := time.Now() 60 | 61 | var body string 62 | var meta map[string]string 63 | var err error 64 | switch mimeType { 65 | case "application/msword", "application/vnd.ms-word": 66 | body, meta, err = ConvertDoc(r) 67 | 68 | case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": 69 | body, meta, err = ConvertDocx(r) 70 | 71 | case "application/vnd.openxmlformats-officedocument.presentationml.presentation": 72 | body, meta, err = ConvertPptx(r) 73 | 74 | case "application/vnd.oasis.opendocument.text": 75 | body, meta, err = ConvertODT(r) 76 | 77 | case "application/vnd.apple.pages", "application/x-iwork-pages-sffpages": 78 | body, meta, err = ConvertPages(r) 79 | 80 | case "application/pdf": 81 | body, meta, err = ConvertPDF(r) 82 | 83 | case "application/rtf", "application/x-rtf", "text/rtf", "text/richtext": 84 | body, meta, err = ConvertRTF(r) 85 | 86 | case "text/html": 87 | body, meta, err = ConvertHTML(r, readability) 88 | 89 | case "text/url": 90 | body, meta, err = ConvertURL(r, readability) 91 | 92 | case "text/xml", "application/xml": 93 | body, meta, err = ConvertXML(r) 94 | 95 | case "image/jpeg", "image/png", "image/tif", "image/tiff": 96 | body, meta, err = ConvertImage(r) 97 | 98 | case "text/plain": 99 | var b []byte 100 | b, err = io.ReadAll(r) 101 | body = string(b) 102 | } 103 | 104 | if err != nil { 105 | return nil, fmt.Errorf("error converting data: %v", err) 106 | } 107 | 108 | return &Response{ 109 | Body: strings.TrimSpace(body), 110 | Meta: meta, 111 | MSecs: uint32(time.Since(start) / time.Millisecond), 112 | }, nil 113 | } 114 | 115 | // ConvertPath converts a local path to text. 
116 | func ConvertPath(path string) (*Response, error) { 117 | mimeType := MimeTypeByExtension(path) 118 | 119 | f, err := os.Open(path) 120 | if err != nil { 121 | return nil, err 122 | } 123 | defer f.Close() 124 | 125 | return Convert(f, mimeType, true) 126 | } 127 | 128 | // ConvertPathReadability converts a local path to text, with the given readability 129 | // option. 130 | func ConvertPathReadability(path string, readability bool) ([]byte, error) { 131 | mimeType := MimeTypeByExtension(path) 132 | 133 | f, err := os.Open(path) 134 | if err != nil { 135 | return nil, err 136 | } 137 | defer f.Close() 138 | 139 | data, err := Convert(f, mimeType, readability) 140 | if err != nil { 141 | return nil, err 142 | } 143 | return json.Marshal(data) 144 | } 145 | -------------------------------------------------------------------------------- /docconv_test.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | func TestConvertTrimsSpace(t *testing.T) { 9 | resp, err := Convert( 10 | strings.NewReader(" \n\n\nthe \n file\n\n"), 11 | "text/plain", 12 | false, 13 | ) 14 | if err != nil { 15 | t.Fatalf("got error = %v, want nil", err) 16 | } 17 | if want := "the \n file"; resp.Body != want { 18 | t.Errorf("body = %v, want %v", resp.Body, want) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /docd/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.21 AS build 2 | 3 | WORKDIR /app 4 | COPY . 
./ 5 | RUN go build -o /bin/docd ./docd 6 | 7 | ################################################################################ 8 | 9 | FROM debian:12-slim AS docd 10 | 11 | RUN apt-get update \ 12 | && apt-get install -y --no-install-recommends \ 13 | ca-certificates \ 14 | lynx \ 15 | poppler-utils \ 16 | tidy \ 17 | unrtf \ 18 | wv \ 19 | zip \ 20 | && apt-get clean \ 21 | && rm -rf /var/lib/apt/lists/* 22 | 23 | RUN update-ca-certificates 24 | 25 | EXPOSE 8888 26 | 27 | COPY --from=build /bin/docd /docd 28 | ENTRYPOINT ["/docd"] 29 | CMD ["--help"] 30 | -------------------------------------------------------------------------------- /docd/appengine/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile which produces an AppEngine custom runtime containing docd and all 2 | # of its runtime dependencies. 3 | # https://cloud.google.com/appengine/docs/flexible/custom-runtimes/about-custom-runtimes 4 | FROM sajari/docd:1.3.8 5 | CMD ["-addr", ":8080", "-json-cloud-logging", "-error-reporting"] 6 | -------------------------------------------------------------------------------- /docd/appengine/README.md: -------------------------------------------------------------------------------- 1 | # Deploying docd to AppEngine 2 | 3 | Within this directory run: 4 | 5 | ``` 6 | gcloud app deploy 7 | ``` 8 | -------------------------------------------------------------------------------- /docd/appengine/app.yaml: -------------------------------------------------------------------------------- 1 | runtime: custom 2 | env: flex 3 | 4 | service: docd 5 | 6 | resources: 7 | memory_gb: 1.2 8 | 9 | manual_scaling: 10 | instances: 2 11 | 12 | handlers: 13 | - url: /.* 14 | script: _go_app 15 | -------------------------------------------------------------------------------- /docd/convert.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 
| "encoding/json" 7 | "errors" 8 | "fmt" 9 | "io" 10 | "log/slog" 11 | "net/http" 12 | "os" 13 | "syscall" 14 | 15 | "cloud.google.com/go/errorreporting" 16 | 17 | "code.sajari.com/docconv/v2" 18 | "code.sajari.com/docconv/v2/docd/internal" 19 | ) 20 | 21 | type convertServer struct { 22 | l *slog.Logger 23 | er internal.ErrorReporter 24 | } 25 | 26 | func (s *convertServer) convert(w http.ResponseWriter, r *http.Request) { 27 | ctx := r.Context() 28 | 29 | // Readability flag. Currently only used for HTML 30 | var readability bool 31 | if r.FormValue("readability") == "1" { 32 | readability = true 33 | s.l.DebugContext(ctx, "Readability is on") 34 | } 35 | 36 | path := r.FormValue("path") 37 | if path != "" { 38 | mimeType := docconv.MimeTypeByExtension(path) 39 | 40 | f, err := os.Open(path) 41 | if err != nil { 42 | s.serverError(ctx, w, r, fmt.Errorf("could not open file: %w", err)) 43 | return 44 | } 45 | defer f.Close() 46 | 47 | data, err := docconv.Convert(f, mimeType, readability) 48 | if err != nil { 49 | s.serverError(ctx, w, r, fmt.Errorf("could not convert file from path %v: %w", path, err)) 50 | return 51 | } 52 | 53 | s.respond(ctx, w, r, http.StatusOK, data) 54 | return 55 | } 56 | 57 | // Get uploaded file 58 | file, info, err := r.FormFile("input") 59 | if err != nil { 60 | s.serverError(ctx, w, r, fmt.Errorf("could not get input file: %w", err)) 61 | return 62 | } 63 | defer file.Close() 64 | 65 | // Abort if file doesn't have a mime type 66 | if len(info.Header["Content-Type"]) == 0 { 67 | s.clientError(ctx, w, r, http.StatusUnprocessableEntity, "input file %v does not have a Content-Type header", info.Filename) 68 | return 69 | } 70 | 71 | // If a generic mime type was provided then use file extension to determine mimetype 72 | mimeType := info.Header["Content-Type"][0] 73 | if mimeType == "application/octet-stream" { 74 | mimeType = docconv.MimeTypeByExtension(info.Filename) 75 | } 76 | 77 | s.l.InfoContext(ctx, "Received file", "filename", 
info.Filename, "mimeType", mimeType) 78 | 79 | data, err := docconv.Convert(file, mimeType, readability) 80 | if err != nil { 81 | s.serverError(ctx, w, r, fmt.Errorf("could not convert file: %w", err)) 82 | return 83 | } 84 | 85 | s.respond(ctx, w, r, http.StatusOK, data) 86 | } 87 | 88 | func (s *convertServer) clientError(ctx context.Context, w http.ResponseWriter, r *http.Request, code int, pattern string, args ...interface{}) { 89 | s.respond(ctx, w, r, code, &docconv.Response{ 90 | Error: fmt.Sprintf(pattern, args...), 91 | }) 92 | 93 | s.l.InfoContext(ctx, fmt.Sprintf(pattern, args...)) 94 | } 95 | 96 | func (s *convertServer) serverError(ctx context.Context, w http.ResponseWriter, r *http.Request, err error) { 97 | w.WriteHeader(http.StatusInternalServerError) 98 | w.Write([]byte(`{"error":"internal server error"}`)) 99 | 100 | e := errorreporting.Entry{ 101 | Error: err, 102 | Req: r, 103 | } 104 | s.er.Report(e) 105 | 106 | s.l.ErrorContext(ctx, err.Error(), "error", err) 107 | } 108 | 109 | func (s *convertServer) respond(ctx context.Context, w http.ResponseWriter, r *http.Request, code int, resp interface{}) { 110 | buf := &bytes.Buffer{} 111 | err := json.NewEncoder(buf).Encode(resp) 112 | if err != nil { 113 | s.serverError(ctx, w, r, fmt.Errorf("could not marshal JSON response: %w", err)) 114 | return 115 | } 116 | w.WriteHeader(code) 117 | n, err := io.Copy(w, buf) 118 | if err != nil { 119 | // Avoid panicking on broken pipe errors. 
120 | // See https://gosamples.dev/broken-pipe/ 121 | if errors.Is(err, syscall.EPIPE) { 122 | s.l.DebugContext(ctx, err.Error(), "error", err) 123 | return 124 | } 125 | panic(fmt.Errorf("could not write to response (failed after %d bytes): %w", n, err)) 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /docd/internal/cloudtrace/context.go: -------------------------------------------------------------------------------- 1 | package cloudtrace 2 | 3 | import ( 4 | "context" 5 | ) 6 | 7 | type traceKey struct{} 8 | type spanKey struct{} 9 | 10 | func contextWithTraceInfo(ctx context.Context, traceHeader string) context.Context { 11 | traceID, spanID := parseHeader(traceHeader) 12 | if traceID != "" { 13 | ctx = context.WithValue(ctx, traceKey{}, traceID) 14 | } 15 | if spanID != "" { 16 | ctx = context.WithValue(ctx, spanKey{}, spanID) 17 | } 18 | return ctx 19 | } 20 | 21 | func traceInfoFromContext(ctx context.Context) (traceID, spanID string) { 22 | traceID, _ = ctx.Value(traceKey{}).(string) 23 | spanID, _ = ctx.Value(spanKey{}).(string) 24 | return 25 | } 26 | -------------------------------------------------------------------------------- /docd/internal/cloudtrace/header.go: -------------------------------------------------------------------------------- 1 | package cloudtrace 2 | 3 | import "strings" 4 | 5 | // The header specification is: 6 | // "X-Cloud-Trace-Context: TRACE_ID/SPAN_ID;o=TRACE_TRUE" 7 | const CloudTraceContextHeader = "X-Cloud-Trace-Context" 8 | 9 | func parseHeader(value string) (traceID, spanID string) { 10 | var found bool 11 | traceID, after, found := strings.Cut(value, "/") 12 | if found { 13 | spanID, _, _ = strings.Cut(after, ";") 14 | } 15 | return 16 | } 17 | -------------------------------------------------------------------------------- /docd/internal/cloudtrace/http_handler.go: -------------------------------------------------------------------------------- 1 | package 
cloudtrace 2 | 3 | import "net/http" 4 | 5 | type HTTPHandler struct { 6 | // Handler to wrap. 7 | Handler http.Handler 8 | } 9 | 10 | func (h *HTTPHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { 11 | ctx := contextWithTraceInfo(r.Context(), r.Header.Get(CloudTraceContextHeader)) 12 | 13 | h.Handler.ServeHTTP(w, r.WithContext(ctx)) 14 | } 15 | -------------------------------------------------------------------------------- /docd/internal/cloudtrace/logger.go: -------------------------------------------------------------------------------- 1 | package cloudtrace 2 | 3 | // Inspired by https://github.com/remko/cloudrun-slog 4 | 5 | import ( 6 | "context" 7 | "fmt" 8 | "log/slog" 9 | "os" 10 | ) 11 | 12 | // Extra log level supported by Cloud Logging 13 | const LevelCritical = slog.Level(12) 14 | 15 | // Handler that outputs JSON understood by the structured log agent. 16 | // See https://cloud.google.com/logging/docs/agent/logging/configuration#special-fields 17 | type CloudLoggingHandler struct { 18 | handler slog.Handler 19 | projectID string 20 | } 21 | 22 | var _ slog.Handler = (*CloudLoggingHandler)(nil) 23 | 24 | func NewCloudLoggingHandler(projectID string, level slog.Level) *CloudLoggingHandler { 25 | return &CloudLoggingHandler{ 26 | projectID: projectID, 27 | handler: slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{ 28 | AddSource: true, 29 | Level: level, 30 | ReplaceAttr: func(groups []string, a slog.Attr) slog.Attr { 31 | if a.Key == slog.MessageKey { 32 | a.Key = "message" 33 | } else if a.Key == slog.SourceKey { 34 | a.Key = "logging.googleapis.com/sourceLocation" 35 | } else if a.Key == slog.LevelKey { 36 | a.Key = "severity" 37 | level := a.Value.Any().(slog.Level) 38 | if level == LevelCritical { 39 | a.Value = slog.StringValue("CRITICAL") 40 | } 41 | } 42 | return a 43 | }, 44 | }), 45 | } 46 | } 47 | 48 | func (h *CloudLoggingHandler) Enabled(ctx context.Context, level slog.Level) bool { 49 | return h.handler.Enabled(ctx, level) 
50 | } 51 | 52 | func (h *CloudLoggingHandler) Handle(ctx context.Context, rec slog.Record) error { 53 | traceID, spanID := traceInfoFromContext(ctx) 54 | if traceID != "" { 55 | rec = rec.Clone() 56 | // https://cloud.google.com/logging/docs/agent/logging/configuration#special-fields 57 | trace := fmt.Sprintf("projects/%s/traces/%s", h.projectID, traceID) 58 | rec.Add("logging.googleapis.com/trace", slog.StringValue(trace)) 59 | if spanID != "" { 60 | rec.Add("logging.googleapis.com/spanId", slog.StringValue(spanID)) 61 | } 62 | } 63 | return h.handler.Handle(ctx, rec) 64 | } 65 | 66 | func (h *CloudLoggingHandler) WithAttrs(attrs []slog.Attr) slog.Handler { 67 | return &CloudLoggingHandler{handler: h.handler.WithAttrs(attrs)} 68 | } 69 | 70 | func (h *CloudLoggingHandler) WithGroup(name string) slog.Handler { 71 | return &CloudLoggingHandler{handler: h.handler.WithGroup(name)} 72 | } 73 | -------------------------------------------------------------------------------- /docd/internal/debug/context.go: -------------------------------------------------------------------------------- 1 | package debug 2 | 3 | import ( 4 | "context" 5 | ) 6 | 7 | type debugEnabledKey struct{} 8 | 9 | func debugEnabledInContext(ctx context.Context) bool { 10 | enabled, ok := ctx.Value(debugEnabledKey{}).(bool) 11 | if !ok { 12 | return false 13 | } 14 | return enabled 15 | } 16 | -------------------------------------------------------------------------------- /docd/internal/debug/http_handler.go: -------------------------------------------------------------------------------- 1 | package debug 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | "strconv" 7 | ) 8 | 9 | type HTTPHandler struct { 10 | // Handler to wrap. 
11 | Handler http.Handler 12 | } 13 | 14 | func (h *HTTPHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { 15 | ctx := r.Context() 16 | 17 | if ok, _ := strconv.ParseBool(r.Header.Get(DebugHeader)); ok { 18 | ctx = context.WithValue(ctx, debugEnabledKey{}, true) 19 | } 20 | 21 | h.Handler.ServeHTTP(w, r.WithContext(ctx)) 22 | } 23 | -------------------------------------------------------------------------------- /docd/internal/debug/logger.go: -------------------------------------------------------------------------------- 1 | package debug 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | ) 7 | 8 | const DebugHeader = "X-Debug" 9 | 10 | type debugHandler struct { 11 | slog.Handler 12 | } 13 | 14 | func NewDebugHandler(h slog.Handler) *debugHandler { 15 | return &debugHandler{Handler: h} 16 | } 17 | 18 | var _ slog.Handler = (*debugHandler)(nil) 19 | 20 | func (h *debugHandler) Enabled(ctx context.Context, level slog.Level) bool { 21 | if debugEnabledInContext(ctx) { 22 | return true 23 | } 24 | return h.Handler.Enabled(ctx, level) 25 | } 26 | 27 | func (h *debugHandler) WithAttrs(attrs []slog.Attr) slog.Handler { 28 | return &debugHandler{Handler: h.Handler.WithAttrs(attrs)} 29 | } 30 | 31 | func (h *debugHandler) WithGroup(name string) slog.Handler { 32 | return &debugHandler{Handler: h.Handler.WithGroup(name)} 33 | } 34 | -------------------------------------------------------------------------------- /docd/internal/error_reporter.go: -------------------------------------------------------------------------------- 1 | package internal 2 | 3 | import ( 4 | "io" 5 | 6 | "cloud.google.com/go/errorreporting" 7 | ) 8 | 9 | // ErrorReporter reports errors. 10 | type ErrorReporter interface { 11 | Report(errorreporting.Entry) 12 | io.Closer 13 | } 14 | 15 | // NopErrorReporter is a no-op reporter. 16 | type NopErrorReporter struct{} 17 | 18 | var _ ErrorReporter = (*NopErrorReporter)(nil) 19 | 20 | // Report implements ErrorReporter. 
// recovered carries the value obtained from recover together with the stack
// trace captured at that point, and exposes both as an error.
type recovered struct {
	// p is the value that was passed to panic.
	p interface{}
	// stack is the stack trace captured when the panic was recovered.
	stack []byte
}

var _ error = (*recovered)(nil)

// Error implements error. A panic value that is itself an error is reported
// through its own message; any other value is formatted as "panic: <value>".
func (e *recovered) Error() string {
	switch v := e.p.(type) {
	case error:
		return v.Error()
	default:
		return fmt.Sprintf("panic: %v", v)
	}
}

// stackFromRecovered returns the stack trace and true when err is, or
// wraps, a *recovered created by this package.
//
// Otherwise it returns nil and false.
func stackFromRecovered(err error) ([]byte, bool) {
	var target *recovered
	if !errors.As(err, &target) {
		return nil, false
	}
	return target.stack, true
}
// main parses the command-line flags, sets up logging and error reporting,
// publishes the readability flags to the docconv package, and then either
// converts the single file named by -input and prints its body to stdout, or
// starts the HTTP conversion service.
func main() {
	flag.Parse()

	// Default logger: the process-wide slog handler wrapped in the debug
	// handler.
	l := slog.New(debug.NewDebugHandler(slog.Default().Handler()))

	if *jsonCloudLogging {
		// Resolve the GCP project ID: flag first, then the
		// GOOGLE_CLOUD_PROJECT environment variable, then the metadata
		// service.
		gcpProjectID := *cloudTraceGCPProjectID
		if gcpProjectID == "" {
			gcpProjectID = os.Getenv("GOOGLE_CLOUD_PROJECT")
		}
		if gcpProjectID == "" {
			l.Debug("GOOGLE_CLOUD_PROJECT env var not provided, looking up internal metadata service")
			var err error
			gcpProjectID, err = metadata.ProjectID()
			if err != nil {
				l.Error("Could not autodetect GCP project ID", "error", err)
				os.Exit(1)
			}
		}

		// Replace the logger (and the slog default) with the Cloud
		// Logging JSON handler, still wrapped in the debug handler.
		l = slog.New(debug.NewDebugHandler(cloudtrace.NewCloudLoggingHandler(gcpProjectID, slog.LevelInfo)))
		slog.SetDefault(l)
		l.Info("Cloud Trace GCP project ID", "projectID", gcpProjectID)
	}

	// Error reporting is a no-op unless -error-reporting is set.
	var er internal.ErrorReporter = &internal.NopErrorReporter{}
	if *errorReporting {
		// Unset flags fall back to the GOOGLE_CLOUD_PROJECT and
		// GAE_SERVICE environment variables.
		if *errorReportingGCPProjectID == "" {
			*errorReportingGCPProjectID = os.Getenv("GOOGLE_CLOUD_PROJECT")
		}
		if *errorReportingAppEngineService == "" {
			*errorReportingAppEngineService = os.Getenv("GAE_SERVICE")
		}
		var err error
		er, err = errorreporting.NewClient(context.Background(), *errorReportingGCPProjectID, errorreporting.Config{
			ServiceName: *errorReportingAppEngineService,
			// Failures to deliver a report are only logged.
			OnError: func(err error) {
				l.Error("Could not report error to Error Reporting service", "error", err)
			},
		})
		if err != nil {
			l.Error("Could not create Error Reporting client", "error", err)
			os.Exit(1)
		}
	}

	cs := &convertServer{
		l:  l,
		er: er,
	}

	// The readability settings are package-level state in docconv, so they
	// are set before any conversion runs.
	// TODO: Improve this (remove the need for it!)
	docconv.HTMLReadabilityOptionsValues = docconv.HTMLReadabilityOptions{
		LengthLow:             *readabilityLengthLow,
		LengthHigh:            *readabilityLengthHigh,
		StopwordsLow:          *readabilityStopwordsLow,
		StopwordsHigh:         *readabilityStopwordsHigh,
		MaxLinkDensity:        *readabilityMaxLinkDensity,
		MaxHeadingDistance:    *readabilityMaxHeadingDistance,
		ReadabilityUseClasses: *readabilityUseClasses,
	}

	// One-shot mode: convert the given file, print its body and exit
	// without starting the server.
	if *inputPath != "" {
		resp, err := docconv.ConvertPath(*inputPath)
		if err != nil {
			l.Error("Could not convert file", "error", err, "path", *inputPath)
			os.Exit(1)
		}
		fmt.Print(string(resp.Body))
		return
	}

	serve(l, er, cs)
}
docx file to text. 25 | func ConvertDocx(r io.Reader) (string, map[string]string, error) { 26 | var size int64 27 | 28 | // Common case: if the reader is a file (or trivial wrapper), avoid 29 | // loading it all into memory. 30 | var ra io.ReaderAt 31 | if f, ok := r.(interface { 32 | io.ReaderAt 33 | Stat() (os.FileInfo, error) 34 | }); ok { 35 | si, err := f.Stat() 36 | if err != nil { 37 | return "", nil, err 38 | } 39 | size = si.Size() 40 | ra = f 41 | } else { 42 | b, err := io.ReadAll(io.LimitReader(r, maxBytes)) 43 | if err != nil { 44 | return "", nil, nil 45 | } 46 | size = int64(len(b)) 47 | ra = bytes.NewReader(b) 48 | } 49 | 50 | zr, err := zip.NewReader(ra, size) 51 | if err != nil { 52 | return "", nil, fmt.Errorf("error unzipping data: %v", err) 53 | } 54 | 55 | zipFiles := mapZipFiles(zr.File) 56 | 57 | contentTypeDefinition, err := getContentTypeDefinition(zipFiles["[Content_Types].xml"]) 58 | if err != nil { 59 | return "", nil, err 60 | } 61 | 62 | meta := make(map[string]string) 63 | var textHeader, textBody, textFooter string 64 | for _, override := range contentTypeDefinition.Overrides { 65 | f := zipFiles[override.PartName] 66 | 67 | switch { 68 | case override.ContentType == "application/vnd.openxmlformats-package.core-properties+xml": 69 | rc, err := f.Open() 70 | if err != nil { 71 | return "", nil, fmt.Errorf("error opening '%v' from archive: %v", f.Name, err) 72 | } 73 | defer rc.Close() 74 | 75 | meta, err = XMLToMap(rc) 76 | if err != nil { 77 | return "", nil, fmt.Errorf("error parsing '%v': %v", f.Name, err) 78 | } 79 | 80 | if tmp, ok := meta["modified"]; ok { 81 | if t, err := time.Parse(time.RFC3339, tmp); err == nil { 82 | meta["ModifiedDate"] = fmt.Sprintf("%d", t.Unix()) 83 | } 84 | } 85 | if tmp, ok := meta["created"]; ok { 86 | if t, err := time.Parse(time.RFC3339, tmp); err == nil { 87 | meta["CreatedDate"] = fmt.Sprintf("%d", t.Unix()) 88 | } 89 | } 90 | case override.ContentType == 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml": 91 | body, err := parseDocxText(f) 92 | if err != nil { 93 | return "", nil, err 94 | } 95 | textBody += body + "\n" 96 | case override.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml": 97 | footer, err := parseDocxText(f) 98 | if err != nil { 99 | return "", nil, err 100 | } 101 | textFooter += footer + "\n" 102 | case override.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml": 103 | header, err := parseDocxText(f) 104 | if err != nil { 105 | return "", nil, err 106 | } 107 | textHeader += header + "\n" 108 | } 109 | 110 | } 111 | return textHeader + "\n" + textBody + "\n" + textFooter, meta, nil 112 | } 113 | 114 | func getContentTypeDefinition(zf *zip.File) (*contentTypeDefinition, error) { 115 | f, err := zf.Open() 116 | if err != nil { 117 | return nil, err 118 | } 119 | defer f.Close() 120 | 121 | x := &contentTypeDefinition{} 122 | if err := xml.NewDecoder(io.LimitReader(f, maxBytes)).Decode(x); err != nil { 123 | return nil, err 124 | } 125 | return x, nil 126 | } 127 | 128 | func mapZipFiles(files []*zip.File) map[string]*zip.File { 129 | filesMap := make(map[string]*zip.File, 2*len(files)) 130 | for _, f := range files { 131 | filesMap[f.Name] = f 132 | filesMap["/"+f.Name] = f 133 | } 134 | return filesMap 135 | } 136 | 137 | func parseDocxText(f *zip.File) (string, error) { 138 | r, err := f.Open() 139 | if err != nil { 140 | return "", fmt.Errorf("error opening '%v' from archive: %v", f.Name, err) 141 | } 142 | defer r.Close() 143 | 144 | text, err := DocxXMLToText(r) 145 | if err != nil { 146 | return "", fmt.Errorf("error parsing '%v': %v", f.Name, err) 147 | } 148 | return text, nil 149 | } 150 | 151 | // DocxXMLToText converts Docx XML into plain text. 
152 | func DocxXMLToText(r io.Reader) (string, error) { 153 | return XMLToText(r, []string{"br", "p", "tab"}, []string{"instrText", "script"}, true) 154 | } 155 | -------------------------------------------------------------------------------- /docx_test/docx_test.go: -------------------------------------------------------------------------------- 1 | package docx_test 2 | 3 | import ( 4 | "encoding/xml" 5 | "os" 6 | "strings" 7 | "testing" 8 | 9 | "code.sajari.com/docconv/v2" 10 | ) 11 | 12 | func TestConvertDocx(t *testing.T) { 13 | f, err := os.Open("./testdata/sample.docx") 14 | if err != nil { 15 | t.Fatalf("got error = %v, want nil", err) 16 | } 17 | 18 | resp, _, err := docconv.ConvertDocx(f) 19 | if err != nil { 20 | t.Fatalf("got error = %v, want nil", err) 21 | } 22 | 23 | if want := "Header"; !strings.Contains(resp, want) { 24 | t.Errorf("expected %v to contains %v", resp, want) 25 | } 26 | if want := "Footer"; !strings.Contains(resp, want) { 27 | t.Errorf("expected %v to contains %v", resp, want) 28 | } 29 | if want := "Content"; !strings.Contains(resp, want) { 30 | t.Errorf("expected %v to contains %v", resp, want) 31 | } 32 | } 33 | 34 | func TestConvertDocxWithUncommonValidStructure(t *testing.T) { 35 | f, err := os.Open("./testdata/sample_2.docx") 36 | if err != nil { 37 | t.Fatalf("got error = %v, want nil", err) 38 | } 39 | resp, _, err := docconv.ConvertDocx(f) 40 | if err != nil { 41 | t.Fatalf("got error = %v, want nil", err) 42 | } 43 | 44 | if want := "Header"; !strings.Contains(resp, want) { 45 | t.Errorf("expected %v to contains %v", resp, want) 46 | } 47 | if want := "Footer"; !strings.Contains(resp, want) { 48 | t.Errorf("expected %v to contains %v", resp, want) 49 | } 50 | if want := "Content"; !strings.Contains(resp, want) { 51 | t.Errorf("expected %v to contains %v", resp, want) 52 | } 53 | } 54 | 55 | 56 | func TestConvertDocxDecompressionSizeLimit(t *testing.T) { 57 | f, err := os.Open("./testdata/decompression_size_limit.docx") 58 | 
if err != nil { 59 | t.Fatalf("got error = %v, want nil", err) 60 | } 61 | _, _, err = docconv.ConvertDocx(f) 62 | if _, ok := err.(*xml.SyntaxError); !ok { 63 | t.Errorf("got error = %T, want *xml.SyntaxError", err) 64 | } 65 | if want := "EOF"; !strings.Contains(err.Error(), want) { 66 | t.Errorf("got error = %v, want %v", err, want) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /docx_test/testdata/decompression_size_limit.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sajari/docconv/785a29a00de4b976c379fd38299c220307220684/docx_test/testdata/decompression_size_limit.docx -------------------------------------------------------------------------------- /docx_test/testdata/sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sajari/docconv/785a29a00de4b976c379fd38299c220307220684/docx_test/testdata/sample.docx -------------------------------------------------------------------------------- /docx_test/testdata/sample_2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sajari/docconv/785a29a00de4b976c379fd38299c220307220684/docx_test/testdata/sample_2.docx -------------------------------------------------------------------------------- /docx_test/testdata/sample_3.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sajari/docconv/785a29a00de4b976c379fd38299c220307220684/docx_test/testdata/sample_3.docx -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module code.sajari.com/docconv/v2 2 | 3 | go 1.21 4 | 5 | require ( 6 | cloud.google.com/go/compute/metadata v0.2.3 7 | cloud.google.com/go/errorreporting 
v0.3.0 8 | github.com/JalfResi/justext v0.0.0-20170829062021-c0282dea7198 9 | github.com/advancedlogic/GoOse v0.0.0-20191112112754-e742535969c1 10 | github.com/google/go-cmp v0.5.9 11 | github.com/gorilla/mux v1.8.0 12 | github.com/otiai10/gosseract/v2 v2.2.4 13 | github.com/richardlehane/mscfb v1.0.3 14 | github.com/richardlehane/msoleps v1.0.4-0.20231124170528-c8ca5a164365 15 | golang.org/x/net v0.17.0 16 | google.golang.org/protobuf v1.30.0 17 | ) 18 | 19 | require ( 20 | cloud.google.com/go/compute v1.19.1 // indirect 21 | github.com/PuerkitoBio/goquery v1.5.1 // indirect 22 | github.com/andybalholm/cascadia v1.2.0 // indirect 23 | github.com/araddon/dateparse v0.0.0-20200409225146-d820a6159ab1 // indirect 24 | github.com/fatih/set v0.2.1 // indirect 25 | github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 // indirect 26 | github.com/go-resty/resty/v2 v2.3.0 // indirect 27 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect 28 | github.com/golang/protobuf v1.5.3 // indirect 29 | github.com/googleapis/enterprise-certificate-proxy v0.2.3 // indirect 30 | github.com/googleapis/gax-go/v2 v2.7.1 // indirect 31 | github.com/jaytaylor/html2text v0.0.0-20200412013138-3577fbdbcff7 // indirect 32 | github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 // indirect 33 | github.com/mattn/go-runewidth v0.0.9 // indirect 34 | github.com/olekukonko/tablewriter v0.0.4 // indirect 35 | github.com/pkg/errors v0.9.1 // indirect 36 | github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect 37 | go.opencensus.io v0.24.0 // indirect 38 | golang.org/x/oauth2 v0.7.0 // indirect 39 | golang.org/x/sync v0.1.0 // indirect 40 | golang.org/x/sys v0.13.0 // indirect 41 | golang.org/x/text v0.13.0 // indirect 42 | google.golang.org/api v0.114.0 // indirect 43 | google.golang.org/appengine v1.6.7 // indirect 44 | google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect 45 | google.golang.org/grpc v1.56.3 // indirect 46 
| ) 47 | -------------------------------------------------------------------------------- /html.go: -------------------------------------------------------------------------------- 1 | //go:build !appengine 2 | 3 | package docconv 4 | 5 | import ( 6 | "bytes" 7 | "io" 8 | "strings" 9 | 10 | "golang.org/x/net/html" 11 | 12 | "github.com/JalfResi/justext" 13 | ) 14 | 15 | // ConvertHTML converts HTML into text. 16 | func ConvertHTML(r io.Reader, readability bool) (string, map[string]string, error) { 17 | meta := make(map[string]string) 18 | 19 | buf := new(bytes.Buffer) 20 | _, err := buf.ReadFrom(r) 21 | if err != nil { 22 | return "", nil, err 23 | } 24 | 25 | cleanXML, err := Tidy(buf, false) 26 | if err != nil { 27 | // Tidy failed, so we now manually tokenize instead 28 | clean := cleanHTML(buf, true) 29 | cleanXML = []byte(clean) 30 | } 31 | 32 | if readability { 33 | var err error 34 | cleanXML, err = HTMLReadability(bytes.NewReader(cleanXML)) 35 | if err != nil { 36 | return "", nil, err 37 | } 38 | } 39 | text, err := HTMLToText(bytes.NewReader(cleanXML)) 40 | if err != nil { 41 | return "", nil, err 42 | } 43 | return text, meta, nil 44 | } 45 | 46 | var acceptedHTMLTags = [...]string{ 47 | "div", "p", "br", "span", "body", "head", "html", "ul", "ol", "li", "dl", "dt", "dd", "a", "form", "article", 48 | "section", "table", "tr", "td", "tbody", "thead", "th", "tfoot", "col", "colgroup", "caption", "form", "input", 49 | "title", "h1", "h2", "h3", "h4", "h5", "h6", "meta", "strong", "cite", "em", "address", "abbr", "acronym", 50 | "blockquote", "q", "pre", "samp", "select", "fieldset", "legend", "button", "option", "textarea", "label", 51 | } 52 | 53 | // Tests for known friendly HTML parameters that tidy is unlikely to choke on 54 | func acceptedHTMLTag(tagName string) bool { 55 | for _, tag := range acceptedHTMLTags { 56 | if tag == tagName { 57 | return true 58 | } 59 | } 60 | return false 61 | } 62 | 63 | // Removes scripts, comments, styles and parameters 
from HTML. 64 | // Also removes made up tags, e.g. 65 | // Can keep head elements or not. Typically not much in there. 66 | func cleanHTML(r io.Reader, all bool) string { 67 | output := "" 68 | if !all { 69 | output = "" 70 | } 71 | mainSection := false 72 | junkSection := false 73 | 74 | d := html.NewTokenizer(r) 75 | for { 76 | // token type 77 | tokenType := d.Next() 78 | if tokenType == html.ErrorToken { 79 | return output 80 | } 81 | token := d.Token() 82 | 83 | switch tokenType { 84 | case html.StartTagToken: // 85 | if token.Data == "body" || (token.Data == "html" && all) { 86 | mainSection = true 87 | } 88 | if !acceptedHTMLTag(token.Data) { 89 | junkSection = true 90 | } 91 | 92 | if !junkSection && mainSection { 93 | output += "<" + token.Data + ">" 94 | } 95 | 96 | case html.TextToken: // text between start and end tag 97 | if !junkSection && mainSection { 98 | output += token.Data 99 | } 100 | 101 | case html.EndTagToken: // 102 | if !junkSection && mainSection { 103 | output += "" 104 | } 105 | if !acceptedHTMLTag(token.Data) { 106 | junkSection = false 107 | } 108 | 109 | case html.SelfClosingTagToken: // 110 | if !junkSection && mainSection { 111 | output += "<" + token.Data + " />" // TODO: Can probably keep attributes from the meta tags 112 | } 113 | } 114 | } 115 | } 116 | 117 | // HTMLReadabilityOptions is a type which defines parameters that are passed to the justext package. 118 | // TODO: Improve this! 119 | type HTMLReadabilityOptions struct { 120 | LengthLow int 121 | LengthHigh int 122 | StopwordsLow float64 123 | StopwordsHigh float64 124 | MaxLinkDensity float64 125 | MaxHeadingDistance int 126 | ReadabilityUseClasses string 127 | } 128 | 129 | // HTMLReadabilityOptionsValues are the global settings used for HTMLReadability. 130 | // TODO: Remove this from global state. 
131 | var HTMLReadabilityOptionsValues HTMLReadabilityOptions 132 | 133 | // HTMLReadability extracts the readable text in an HTML document 134 | func HTMLReadability(r io.Reader) ([]byte, error) { 135 | jr := justext.NewReader(r) 136 | 137 | // TODO: Improve this! 138 | jr.Stoplist = readabilityStopList 139 | jr.LengthLow = HTMLReadabilityOptionsValues.LengthLow 140 | jr.LengthHigh = HTMLReadabilityOptionsValues.LengthHigh 141 | jr.StopwordsLow = HTMLReadabilityOptionsValues.StopwordsLow 142 | jr.StopwordsHigh = HTMLReadabilityOptionsValues.StopwordsHigh 143 | jr.MaxLinkDensity = HTMLReadabilityOptionsValues.MaxLinkDensity 144 | jr.MaxHeadingDistance = HTMLReadabilityOptionsValues.MaxHeadingDistance 145 | 146 | paragraphSet, err := jr.ReadAll() 147 | if err != nil { 148 | return nil, err 149 | } 150 | 151 | useClasses := strings.SplitN(HTMLReadabilityOptionsValues.ReadabilityUseClasses, ",", 10) 152 | 153 | output := "" 154 | for _, paragraph := range paragraphSet { 155 | for _, class := range useClasses { 156 | if paragraph.CfClass == class { 157 | output += paragraph.Text + "\n" 158 | } 159 | } 160 | } 161 | 162 | return []byte(output), nil 163 | } 164 | 165 | // HTMLToText converts HTML to plain text. 
166 | func HTMLToText(input io.Reader) (string, error) { 167 | return XMLToText(input, []string{"br", "p", "h1", "h2", "h3", "h4"}, []string{}, false) 168 | } 169 | 170 | var readabilityStopList = map[string]bool{"and": true, "the": true, "a": true, "about": true, "above": true, "across": true, "after": true, "afterwards": true, "again": true, "against": true, "all": true, "almost": true, "alone": true, 171 | "along": true, "already": true, "also": true, "although": true, "always": true, "am": true, "among": true, "amongst": true, "amoungst": true, "amount": true, "an": true, "another": true, "any": true, 172 | "anyhow": true, "anyone": true, "anything": true, "anyway": true, "anywhere": true, "are": true, "around": true, "as": true, "at": true, "back": true, "be": true, "became": true, "because": true, 173 | "become": true, "becomes": true, "becoming": true, "been": true, "before": true, "beforehand": true, "behind": true, "being": true, "below": true, "beside": true, "besides": true, "between": true, 174 | "beyond": true, "both": true, "bottom": true, "but": true, "by": true, "can": true, "cannot": true, "cant": true, "co": true, "con": true, "could": true, "couldnt": true, "cry": true, 175 | "de": true, "describe": true, "detail": true, "do": true, "done": true, "down": true, "due": true, "during": true, "each": true, "eg": true, "eight": true, "either": true, "eleven": true, "else": true, 176 | "elsewhere": true, "empty": true, "enough": true, "etc": true, "even": true, "ever": true, "every": true, "everyone": true, "everything": true, "everywhere": true, "except": true, "few": true, 177 | "fifteen": true, "fify": true, "fill": true, "find": true, "fire": true, "first": true, "five": true, "for": true, "former": true, "formerly": true, "forty": true, "found": true, "four": true, "from": true, 178 | "front": true, "full": true, "further": true, "get": true, "give": true, "go": true, "had": true, "has": true, "hasnt": true, "have": true, "he": true, "hence": 
true, "her": true, "here": true, "hereafter": true, 179 | "hereby": true, "herein": true, "hereupon": true, "hers": true, "herself": true, "him": true, "himself": true, "his": true, "how": true, "however": true, "hundred": true, "ie": true, "if": true, "in": true, 180 | "inc": true, "indeed": true, "interest": true, "into": true, "is": true, "it": true, "its": true, "itself": true, "keep": true, "last": true, "latter": true, "latterly": true, "least": true, "less": true, 181 | "ltd": true, "made": true, "many": true, "may": true, "me": true, "meanwhile": true, "might": true, "mill": true, "mine": true, "more": true, "moreover": true, "most": true, "mostly": true, "move": true, 182 | "much": true, "must": true, "my": true, "myself": true, "name": true, "namely": true, "neither": true, "never": true, "nevertheless": true, "next": true, "nine": true, "no": true, "nobody": true, 183 | "none": true, "noone": true, "nor": true, "not": true, "nothing": true, "now": true, "nowhere": true, "of": true, "off": true, "often": true, "on": true, "once": true, "one": true, "only": true, "onto": true, 184 | "or": true, "other": true, "others": true, "otherwise": true, "our": true, "ours": true, "ourselves": true, "out": true, "over": true, "own": true, "part": true, "per": true, "perhaps": true, 185 | "please": true, "put": true, "rather": true, "re": true, "same": true, "see": true, "seem": true, "seemed": true, "seeming": true, "seems": true, "serious": true, "several": true, "she": true, 186 | "should": true, "show": true, "side": true, "since": true, "sincere": true, "six": true, "sixty": true, "so": true, "some": true, "somehow": true, "someone": true, "something": true, "sometime": true, 187 | "sometimes": true, "somewhere": true, "still": true, "such": true, "take": true, "ten": true, "than": true, "that": true, "their": true, "them": true, "themselves": true, 188 | "then": true, "thence": true, "there": true, "thereafter": true, "thereby": true, "therefore": true, 
"therein": true, "thereupon": true, "these": true, "they": true, "thickv": true, "thin": true, 189 | "third": true, "this": true, "those": true, "though": true, "three": true, "through": true, "throughout": true, "thru": true, "thus": true, "to": true, "together": true, "too": true, "top": true, 190 | "toward": true, "towards": true, "twelve": true, "twenty": true, "two": true, "un": true, "under": true, "until": true, "up": true, "upon": true, "us": true, "very": true, "via": true, "was": true, "we": true, 191 | "well": true, "were": true, "what": true, "whatever": true, "when": true, "whence": true, "whenever": true, "where": true, "whereafter": true, "whereas": true, "whereby": true, "wherein": true, 192 | "whereupon": true, "wherever": true, "whether": true, "which": true, "while": true, "whither": true, "who": true, "whoever": true, "whole": true, "whom": true, "whose": true, "why": true, "will": true, 193 | "with": true, "within": true, "without": true, "would": true, "yet": true, "you": true, "your": true, "youre": true, "yours": true, "yourself": true, "yourselves": true, "www": true, "com": true, "http": true} 194 | -------------------------------------------------------------------------------- /html_appengine.go: -------------------------------------------------------------------------------- 1 | //go:build appengine 2 | 3 | package docconv 4 | 5 | import ( 6 | "io" 7 | ) 8 | 9 | func HTMLReadability(r io.Reader) ([]byte, error) { 10 | return io.ReadAll(r) 11 | } 12 | -------------------------------------------------------------------------------- /html_test/html_test.go: -------------------------------------------------------------------------------- 1 | package html_test 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "os" 7 | "testing" 8 | 9 | "github.com/google/go-cmp/cmp" 10 | 11 | "code.sajari.com/docconv/v2" 12 | ) 13 | 14 | func TestConvertHTML_readabilityUseClasses(t *testing.T) { 15 | tests := []struct { 16 | readabilityUseClasses string 17 | want 
string 18 | }{ 19 | // The default value set by this package. 20 | { 21 | readabilityUseClasses: "", 22 | want: ``, 23 | }, 24 | // Enable output with readability true. 25 | { 26 | readabilityUseClasses: "good", 27 | want: `1 28 | word 29 | This is a full sentence. 30 | `, 31 | }, 32 | } 33 | 34 | for _, tt := range tests { 35 | t.Run(fmt.Sprintf("%q", tt.readabilityUseClasses), func(t *testing.T) { 36 | old := docconv.HTMLReadabilityOptionsValues.ReadabilityUseClasses 37 | t.Cleanup(func() { 38 | docconv.HTMLReadabilityOptionsValues.ReadabilityUseClasses = old 39 | }) 40 | docconv.HTMLReadabilityOptionsValues.ReadabilityUseClasses = tt.readabilityUseClasses 41 | 42 | data, err := os.ReadFile("testdata/test.html") 43 | must(t, err) 44 | got, _, err := docconv.ConvertHTML(bytes.NewReader(data), true) 45 | must(t, err) 46 | 47 | diff := cmp.Diff(tt.want, got) 48 | if diff != "" { 49 | t.Errorf("result mismatch (-want +got):\n%v", diff) 50 | } 51 | }) 52 | } 53 | } 54 | 55 | func must(t *testing.T, err error) { 56 | t.Helper() 57 | if err != nil { 58 | t.Fatal(err) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /html_test/testdata/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 12 | 13 |

14 | 1

15 |

word

16 |

This is a full sentence.

17 | 18 | 19 | -------------------------------------------------------------------------------- /iWork/generate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Generate code from protos stored in pb-schema. 3 | 4 | set -euo pipefail 5 | 6 | cd "$(dirname "$0")" 7 | 8 | function ensure_installed() { 9 | command -v "$1" >/dev/null 2>&1 || { 10 | echo >&2 "Could not find $1, please ensure it is installed." 11 | echo "Try running: $2" 12 | exit 1 13 | } 14 | } 15 | 16 | ensure_installed protoc "brew install protobuf" 17 | ensure_installed protoc-gen-go "go install google.golang.org/protobuf/cmd/protoc-gen-go@latest" 18 | 19 | protoc -I=./pb-schema --go_out=paths=source_relative:. TSPArchiveMessages.proto 20 | protoc -I=./pb-schema --go_out=paths=source_relative:. TSPDatabaseMessages.proto 21 | protoc -I=./pb-schema --go_out=paths=source_relative:. TSPMessages.proto 22 | -------------------------------------------------------------------------------- /iWork/pb-schema/README.md: -------------------------------------------------------------------------------- 1 | These .proto files were created by running [proto-dump](https://github.com/obriensp/proto-dump) on Keynote 6.0, Pages 5.0 and Numbers 3.0. 
-------------------------------------------------------------------------------- /iWork/pb-schema/TNArchives.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "TSPMessages.proto"; 4 | import "TSKArchives.proto"; 5 | import "TSCHArchives.proto"; 6 | import "TSCEArchives.proto"; 7 | import "TSSArchives.proto"; 8 | import "TSDArchives.proto"; 9 | import "TSWPArchives.proto"; 10 | import "TSAArchives.proto"; 11 | import "TSTArchives.proto"; 12 | package TN; 13 | 14 | enum SheetPageOrder { 15 | SheetPageOrderTopToBottom = 0; 16 | SheetPageOrderLeftToRight = 1; 17 | } 18 | 19 | message SheetUIStateArchive { 20 | required float view_scale = 1; 21 | required .TSP.Point scroll_position = 2; 22 | optional float previous_view_scale = 3; 23 | optional bool scroll_position_is_unscaled = 4; 24 | optional .TSP.Point previous_scroll_position = 5; 25 | optional bool scroll_position_valid = 6; 26 | optional bool previous_scroll_position_valid = 7; 27 | optional .TSP.Size visible_size = 8; 28 | optional .TSP.Size previous_visible_size = 9; 29 | optional uint32 device_idiom = 10; 30 | optional uint32 form_focused_record_index = 11; 31 | optional uint32 form_focused_field_index = 12; 32 | } 33 | 34 | message SheetUIStateDictionaryEntryArchive { 35 | required .TSP.Reference sheet = 1; 36 | required .TN.SheetUIStateArchive sheet_uistate = 2; 37 | } 38 | 39 | message UIStateArchive { 40 | enum InspectorPaneViewMode { 41 | kInspectorPaneViewModeFormat = 0; 42 | kInspectorPaneViewModeFilter = 1; 43 | } 44 | required uint32 active_sheet_index = 1 [deprecated = true]; 45 | repeated .TSP.Reference selected_info = 2; 46 | repeated .TN.SheetUIStateDictionaryEntryArchive sheet_uistate_dictionary_entry = 3; 47 | optional .TST.SelectionArchive table_selection = 4; 48 | optional uint32 editing_sheet_index = 5 [deprecated = true]; 49 | optional int32 document_mode = 6; 50 | repeated .TN.SheetUIStateDictionaryEntryArchive 
edit_mode_sheet_uistate_dictionary_entry = 7; 51 | optional int32 table_editing_mode = 8; 52 | optional uint32 form_focused_record_index = 9 [deprecated = true]; 53 | optional uint32 form_focused_field_index = 10 [deprecated = true]; 54 | optional bool in_chart_mode = 11; 55 | optional .TN.ChartSelectionArchive chart_selection = 12; 56 | optional .TSP.Reference sheet_selection = 13; 57 | optional bool inspector_pane_visible = 14 [default = true]; 58 | optional .TN.UIStateArchive.InspectorPaneViewMode inspector_pane_view_mode = 15 [default = kInspectorPaneViewModeFormat]; 59 | repeated uint32 selected_quick_calc_functions = 16; 60 | optional bool removed_all_quick_calc_functions = 17; 61 | optional bool show_canvas_guides = 18; 62 | optional bool shows_comments = 19; 63 | } 64 | 65 | message SheetSelectionArchive { 66 | optional .TSP.Reference sheet = 1; 67 | optional bool paginated = 2; 68 | } 69 | 70 | message UndoRedoStateArchive { 71 | required .TN.UIStateArchive ui_state = 1; 72 | } 73 | 74 | message DocumentArchive { 75 | repeated .TSP.Reference sheets = 1; 76 | required .TSA.DocumentArchive super = 8; 77 | optional .TSP.Reference calculation_engine = 3 [deprecated = true]; 78 | required .TSP.Reference stylesheet = 4; 79 | required .TSP.Reference sidebar_order = 5; 80 | required .TSP.Reference theme = 6; 81 | optional .TN.UIStateArchive uistate = 7; 82 | optional .TSP.Reference custom_format_list = 9; 83 | optional string printer_id = 10; 84 | optional string paper_id = 11; 85 | optional .TSP.Size page_size = 12; 86 | } 87 | 88 | message PlaceholderArchive { 89 | required .TSWP.ShapeInfoArchive super = 1; 90 | } 91 | 92 | message SheetArchive { 93 | required string name = 1; 94 | repeated .TSP.Reference drawable_infos = 2; 95 | optional bool in_portrait_page_orientation = 3; 96 | optional bool show_repeating_headers = 4 [deprecated = true]; 97 | optional bool show_page_numbers = 5; 98 | optional bool is_autofit_on = 6; 99 | optional float content_scale = 7; 
100 | optional .TN.SheetPageOrder page_order = 8; 101 | optional .TSD.EdgeInsetsArchive print_margins = 10; 102 | optional bool using_start_page_number = 11; 103 | optional int32 start_page_number = 12; 104 | optional float page_header_inset = 13; 105 | optional float page_footer_inset = 14; 106 | optional .TSP.Reference header_storage = 15; 107 | optional .TSP.Reference footer_storage = 16; 108 | optional .TSP.Reference userDefinedGuideStorage = 17; 109 | } 110 | 111 | message FormBasedSheetArchive { 112 | required .TN.SheetArchive super = 1; 113 | optional .TSCE.CFUUIDArchive table_id = 2; 114 | } 115 | 116 | message ThemeArchive { 117 | required .TSS.ThemeArchive super = 1; 118 | repeated .TSP.Reference prototypes = 2; 119 | } 120 | 121 | message ChartMediatorFormulaStorage { 122 | repeated .TSCE.FormulaArchive data_formulae = 1; 123 | repeated .TSCE.FormulaArchive row_label_formulae = 3; 124 | repeated .TSCE.FormulaArchive col_label_formulae = 4; 125 | optional int32 direction = 5; 126 | repeated .TSCE.FormulaArchive error_custom_pos_formulae = 6; 127 | repeated .TSCE.FormulaArchive error_custom_neg_formulae = 7; 128 | repeated .TSCE.FormulaArchive error_custom_pos_scatterX_formulae = 8; 129 | repeated .TSCE.FormulaArchive error_custom_neg_scatterX_formulae = 9; 130 | } 131 | 132 | message ChartMediatorArchive { 133 | required .TSCH.ChartMediatorArchive super = 1; 134 | required string entity_id = 2; 135 | optional .TN.ChartMediatorFormulaStorage formulas = 3; 136 | optional bool columns_are_series = 4; 137 | optional bool is_registered_with_calc_engine = 5 [deprecated = true]; 138 | } 139 | 140 | message ChartSelectionArchive { 141 | optional .TSCE.RangeReferenceArchive reference = 1; 142 | optional .TSCH.ChartSelectionArchive super = 2; 143 | } 144 | 145 | -------------------------------------------------------------------------------- /iWork/pb-schema/TNCommandArchives.proto: -------------------------------------------------------------------------------- 1 
| syntax = "proto2"; 2 | 3 | import "TSPMessages.proto"; 4 | import "TSKArchives.proto"; 5 | import "TSCHArchives.proto"; 6 | import "TSCHCommandArchives.proto"; 7 | import "TSCEArchives.proto"; 8 | import "TSSArchives.proto"; 9 | import "TSDArchives.proto"; 10 | import "TSWPArchives.proto"; 11 | import "TSAArchives.proto"; 12 | import "TSTArchives.proto"; 13 | import "TNArchives.proto"; 14 | package TN; 15 | 16 | message SheetCommandSelectionBehaviorArchive { 17 | optional .TSP.Reference sheet_selection = 1; 18 | } 19 | 20 | message CommandFormChooseTargetTableArchive { 21 | required .TSK.CommandArchive super = 1; 22 | optional .TSCE.CFUUIDArchive table_id = 2; 23 | optional .TSCE.CFUUIDArchive previous_table_id = 3; 24 | optional string sheet_name = 4; 25 | optional string previous_sheet_name = 5; 26 | required .TSP.Reference sheet = 6; 27 | } 28 | 29 | message CommandSheetInsertDrawablesArchive { 30 | required .TSP.Reference sheet = 1; 31 | repeated .TSP.Reference drawables = 2; 32 | optional .TSP.Reference provider_undo = 3; 33 | required .TSK.CommandArchive super = 4; 34 | optional bool forDrag = 5; 35 | optional bool forPaste = 7; 36 | optional int32 atIndex = 6; 37 | } 38 | 39 | message CommandSheetRemoveDrawablesArchive { 40 | required .TSK.CommandArchive super = 1; 41 | required .TSP.Reference sheet = 2; 42 | repeated .TSP.Reference sortedDrawables = 3; 43 | required .TSP.IndexSet drawableIndices = 4; 44 | repeated .TSP.Reference old_sheet_sidebar_order = 5; 45 | optional .TSP.Reference formula_rewrite_command_for_undo = 6; 46 | } 47 | 48 | message CommandSheetMoveDrawableZOrderArchive { 49 | required .TSK.CommandArchive super = 1; 50 | required .TSP.Reference sheet = 2; 51 | repeated .TSP.Reference drawable_infos = 3; 52 | required .TSP.IndexSet indexes = 4; 53 | } 54 | 55 | message CommandDocumentInsertSheetArchive { 56 | required .TSP.Reference document = 1; 57 | required .TSP.Reference sheet = 2; 58 | required .TSK.CommandArchive super = 3; 59 | } 60 | 
61 | message CommandDocumentRemoveSheetArchive { 62 | required .TSP.Reference document = 1; 63 | required .TSP.Reference sheet = 2; 64 | required uint32 index = 3; 65 | repeated .TSP.Reference old_sheet_sidebar_order = 4; 66 | required .TSK.CommandArchive super = 5; 67 | optional .TSP.Reference formula_rewrite_command_for_undo = 6; 68 | } 69 | 70 | message CommandDocumentReplaceLastSheetArchive { 71 | required .TSP.Reference document = 1; 72 | required .TSP.Reference last_sheet = 2; 73 | required .TSP.Reference new_sheet = 3; 74 | required .TSK.CommandArchive super = 4; 75 | optional uint32 index = 5; 76 | } 77 | 78 | message CommandDocumentReorderSheetArchive { 79 | required .TSP.Reference document = 1; 80 | required .TSP.Reference sheet = 2; 81 | required uint32 source_index = 3; 82 | required uint32 dest_index = 4; 83 | required .TSK.CommandArchive super = 5; 84 | } 85 | 86 | message CommandSetSheetNameArchive { 87 | required string newname = 1; 88 | required string oldname = 2; 89 | required .TSP.Reference sheet = 3; 90 | required .TSK.CommandArchive super = 4; 91 | } 92 | 93 | message CommandSetPageOrientationArchive { 94 | required bool in_portrait_page_orientation = 1; 95 | required .TSP.Reference sheet = 2; 96 | required .TSK.CommandArchive super = 3; 97 | } 98 | 99 | message CommandSetShowPageNumbersValueArchive { 100 | required bool show_page_numbers = 1; 101 | required .TSP.Reference sheet = 2; 102 | required .TSK.CommandArchive super = 3; 103 | } 104 | 105 | message CommandSetRepeatingHeadersValueArchive { 106 | required bool show_repeating_headers = 1; 107 | required .TSP.Reference sheet = 2; 108 | required .TSK.CommandArchive super = 3; 109 | } 110 | 111 | message CommandSetContentScaleArchive { 112 | required bool is_autofit_on = 1; 113 | required float old_content_scale = 2; 114 | optional float new_content_scale = 3; 115 | required .TSP.Reference sheet = 4; 116 | required .TSK.CommandArchive super = 5; 117 | } 118 | 119 | message 
CommandSetAutofitValueArchive { 120 | required float old_content_scale = 1; 121 | required .TSP.Reference sheet = 2; 122 | required .TSK.CommandArchive super = 3; 123 | } 124 | 125 | message CommandSetDocumentPrinterOptions { 126 | required string printer_id = 1; 127 | required string paper_id = 2; 128 | required .TSP.Size page_size = 3; 129 | required .TSK.CommandArchive super = 4; 130 | } 131 | 132 | message CommandEnterPrintPreviewModeArchive { 133 | required .TSK.CommandArchive super = 1; 134 | } 135 | 136 | message CommandExitPrintPreviewModeArchive { 137 | required .TSK.CommandArchive super = 1; 138 | } 139 | 140 | message CommandPasteDrawablesArchive { 141 | required .TSK.CommandArchive super = 1; 142 | required .TSP.Reference sheet = 2; 143 | repeated .TSP.Reference drawables = 3; 144 | } 145 | 146 | message CommandPasteSheetArchive { 147 | required .TSK.CommandArchive super = 1; 148 | required .TSP.Reference document = 2; 149 | required .TSP.Reference sheet = 3; 150 | optional .TN.SheetUIStateArchive sheet_uistate = 4; 151 | required uint32 sheetIndex = 5; 152 | } 153 | 154 | message CommandReorderSidebarItemChildrenAchive { 155 | required .TSK.CommandArchive super = 1; 156 | required .TSP.Reference sheet = 2; 157 | repeated .TSP.Reference old_children = 3; 158 | repeated .TSP.Reference new_children = 4; 159 | } 160 | 161 | message CommandChartMediatorSetEditingState { 162 | required .TSCH.ChartCommandArchive super = 1; 163 | optional .TN.ChartMediatorFormulaStorage old_formulas = 3; 164 | optional .TN.ChartMediatorFormulaStorage new_formulas = 4; 165 | optional int32 old_direction = 5; 166 | optional int32 new_direction = 6; 167 | optional int32 old_scatter_format = 7; 168 | optional int32 new_scatter_format = 8; 169 | } 170 | 171 | message CommandChartMediatorUpdateForEntityDelete { 172 | required .TSCH.ChartCommandArchive super = 1; 173 | optional .TSP.Reference cmd = 3; 174 | } 175 | 176 | message ChartCommandSetSeriesNameArchive { 177 | required 
.TSCH.ChartCommandArchive super = 1; 178 | required .TSP.Reference mediator = 2; 179 | required uint32 seriesindex = 3; 180 | optional .TSCE.FormulaArchive old_formula = 4; 181 | optional .TSCE.FormulaArchive new_formula = 5; 182 | } 183 | 184 | message ChartCommandSelectionBehaviorArchive { 185 | required .TSP.Reference drawable_info = 1; 186 | optional .TN.ChartSelectionArchive begin_selection = 2; 187 | optional .TN.ChartSelectionArchive end_selection = 3; 188 | } 189 | 190 | -------------------------------------------------------------------------------- /iWork/pb-schema/TPArchives.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "TSDArchives.proto"; 4 | import "TSKArchives.proto"; 5 | import "TSPMessages.proto"; 6 | import "TSWPArchives.proto"; 7 | import "TSSArchives.proto"; 8 | import "TSAArchives.proto"; 9 | import "TSCHArchives.proto"; 10 | package TP; 11 | 12 | enum ViewScaleMode { 13 | ViewScaleMode_UserDefined = 0; 14 | ViewScaleMode_FitWidth = 1; 15 | ViewScaleMode_FirPage = 2; 16 | } 17 | 18 | message DocumentArchive { 19 | required .TSA.DocumentArchive super = 15; 20 | optional .TSP.Reference stylesheet = 2; 21 | optional .TSP.Reference floating_drawables = 3; 22 | optional .TSP.Reference body_storage = 4; 23 | optional .TSP.Reference section = 5; 24 | optional .TSP.Reference theme = 6; 25 | optional .TSP.Reference settings = 7; 26 | optional .TSP.Reference deprecated_layout_state = 11; 27 | optional .TSP.Reference deprecated_view_state = 12; 28 | repeated .TSP.Reference citation_records = 13; 29 | repeated .TSP.Reference toc_styles = 14; 30 | repeated .TSP.Reference change_sessions = 16; 31 | optional .TSP.Reference drawables_zorder = 20; 32 | optional bool uses_single_header_footer = 21; 33 | optional float page_width = 30; 34 | optional float page_height = 31; 35 | optional float left_margin = 32; 36 | optional float right_margin = 33; 37 | optional float top_margin = 34; 
38 | optional float bottom_margin = 35; 39 | optional float header_margin = 36; 40 | optional float footer_margin = 37; 41 | optional float page_scale = 38; 42 | optional bool layout_body_vertically = 39; 43 | optional bool change_tracking_enabled = 40; 44 | optional .TSP.Reference tables_custom_format_list = 41; 45 | optional uint32 orientation = 42 [default = 0]; 46 | optional string printer_id = 43; 47 | optional string paper_id = 44; 48 | optional bool change_tracking_paused = 45; 49 | } 50 | 51 | message ThemeArchive { 52 | required .TSS.ThemeArchive super = 1; 53 | } 54 | 55 | message SettingsArchive { 56 | enum FootnoteKind { 57 | kFootnoteKindFootnotes = 0; 58 | kFootnoteKindDocumentEndnotes = 1; 59 | kFootnoteKindSectionEndnotes = 2; 60 | } 61 | enum FootnoteFormat { 62 | kFootnoteFormatNumeric = 0; 63 | kFootnoteFormatRoman = 1; 64 | kFootnoteFormatSymbolic = 2; 65 | kFootnoteFormatJapaneseNumeric = 3; 66 | kFootnoteFormatJapaneseIdeographic = 4; 67 | } 68 | enum FootnoteNumbering { 69 | kFootnoteNumberingContinuous = 0; 70 | kFootnoteNumberingRestartEachPage = 1; 71 | kFootnoteNumberingRestartEachSection = 2; 72 | } 73 | optional bool body = 1 [default = true]; 74 | optional bool headers = 2 [default = true]; 75 | optional bool footers = 3 [default = true]; 76 | optional bool preview = 4 [default = true]; 77 | optional bool copy_movies = 5 [default = true]; 78 | optional bool copy_assets = 6 [default = true]; 79 | optional bool placeholder_authoring = 7 [default = false]; 80 | optional bool links_enabled = 8 [default = true]; 81 | optional bool hyphenation = 9 [default = false]; 82 | optional bool use_ligatures = 10 [default = false]; 83 | optional bool toc_links_enabled = 11 [default = false]; 84 | optional bool show_ct_markup = 12 [default = true]; 85 | optional bool show_ct_deletions = 13 [default = true]; 86 | optional int32 ct_bubbles_visibility = 14; 87 | optional bool change_bars_visible = 15 [default = true]; 88 | optional bool 
format_changes_visible = 16 [default = true]; 89 | optional bool annotations_visible = 17 [default = true]; 90 | optional bool document_is_rtl = 18 [default = false]; 91 | optional string decimal_tab = 20; 92 | optional string language = 21; 93 | optional string hyphenation_language = 22; 94 | optional string creation_locale = 23; 95 | optional string last_locale = 24; 96 | optional string orig_template = 25; 97 | optional string creation_date = 26; 98 | optional string bibliography_format = 27; 99 | optional .TP.SettingsArchive.FootnoteKind footnote_kind = 30; 100 | optional .TP.SettingsArchive.FootnoteFormat footnote_format = 31; 101 | optional .TP.SettingsArchive.FootnoteNumbering footnote_numbering = 32; 102 | optional int32 footnote_gap = 33; 103 | optional bool section_authoring = 40 [default = false]; 104 | } 105 | 106 | message PlaceholderArchive { 107 | required .TSWP.ShapeInfoArchive super = 1; 108 | } 109 | 110 | message FloatingDrawablesArchive { 111 | message DrawableEntry { 112 | optional .TSP.Reference drawable = 1; 113 | } 114 | message PageGroup { 115 | required uint32 page_index = 1; 116 | repeated .TP.FloatingDrawablesArchive.DrawableEntry background_drawables = 2; 117 | repeated .TP.FloatingDrawablesArchive.DrawableEntry foreground_drawables = 3; 118 | repeated .TP.FloatingDrawablesArchive.DrawableEntry drawables = 4; 119 | } 120 | repeated .TP.FloatingDrawablesArchive.PageGroup page_groups = 1; 121 | } 122 | 123 | message DrawablesZOrderArchive { 124 | repeated .TSP.Reference drawables = 1; 125 | } 126 | 127 | message PageMasterArchive { 128 | repeated .TSP.Reference headers = 1; 129 | repeated .TSP.Reference footers = 2; 130 | repeated .TSP.Reference master_drawables = 3; 131 | } 132 | 133 | message SectionArchive { 134 | optional bool OBSOLETE_shows_header = 1; 135 | optional bool OBSOLETE_shows_footer = 2; 136 | repeated .TSP.Reference OBSOLETE_headers = 3; 137 | repeated .TSP.Reference OBSOLETE_footers = 4; 138 | optional float 
OBSOLETE_left_margin = 5; 139 | optional float OBSOLETE_right_margin = 6; 140 | optional float OBSOLETE_top_margin = 7; 141 | optional float OBSOLETE_bottom_margin = 8; 142 | optional float OBSOLETE_header_padding = 9; 143 | optional float OBSOLETE_footer_padding = 10; 144 | optional float OBSOLETE_paper_width = 11; 145 | optional float OBSOLETE_paper_height = 12; 146 | optional bool OBSOLETE_landscape_mode = 13; 147 | repeated .TSP.Reference OBSOLETE_master_drawables = 14; 148 | optional float OBSOLETE_header_margin = 15; 149 | optional float OBSOLETE_footer_margin = 16; 150 | optional bool inherit_previous_header_footer = 17; 151 | optional bool page_master_first_page_different = 18; 152 | optional bool page_master_even_odd_pages_different = 19; 153 | optional uint32 section_start_kind = 20; 154 | optional uint32 section_page_number_kind = 21; 155 | optional uint32 section_page_number_start = 22; 156 | optional .TSP.Reference first_page_master = 23; 157 | optional .TSP.Reference even_page_master = 24; 158 | optional .TSP.Reference odd_page_master = 25; 159 | optional string name = 26; 160 | optional bool page_master_first_page_hides_header_footer = 28; 161 | } 162 | 163 | message AnchorPosArchive { 164 | optional .TSP.Reference attachment = 1; 165 | optional .TSP.Point position = 2; 166 | } 167 | 168 | message TargetHintArchive { 169 | optional .TSP.Point frame_origin = 1; 170 | optional .TSP.Size frame_size = 2; 171 | optional .TSP.Range range = 3; 172 | optional int32 next_widow_pulls_down_from_char_index = 4; 173 | optional .TSP.Range anchored_range = 5; 174 | optional int32 column_count = 6; 175 | } 176 | 177 | message PageHintArchive { 178 | enum PageKind { 179 | kPageKindNone = 0; 180 | kPageKindText = 1; 181 | kPageKindFiller = 2; 182 | kPageKindOrphan = 3; 183 | kPageKindEndnote = 4; 184 | kPageKindDirty = 5; 185 | kTPPageKindPageLayout = 6; 186 | } 187 | optional .TP.PageHintArchive.PageKind page_kind = 1; 188 | repeated .TP.TargetHintArchive 
target_hints = 2; 189 | optional .TSP.Range footnote_auto_number_range = 3; 190 | optional .TSP.Range footnote_layout_range = 4; 191 | optional .TSP.Reference first_child_hint = 6; 192 | optional .TSP.Reference last_child_hint = 7; 193 | repeated .TP.AnchorPosArchive anchored_attachments_map = 8; 194 | optional .TP.TopicNumberHintsArchive topic_numbers = 9; 195 | optional uint32 version_number = 10; 196 | optional uint32 platform_id = 11; 197 | } 198 | 199 | message SectionHintArchive { 200 | repeated .TP.PageHintArchive page_hints = 1; 201 | optional uint32 start_page_index = 2; 202 | } 203 | 204 | message TextboxHintArchive { 205 | required .TSP.Range range = 1; 206 | required .TSP.Size size = 2; 207 | } 208 | 209 | message TopicNumberEntryArchive { 210 | required .TSP.Reference list_style = 1; 211 | repeated uint32 topic_number = 2; 212 | repeated uint32 character_index = 3; 213 | } 214 | 215 | message TopicNumberHintsArchive { 216 | optional uint32 charIndex = 1; 217 | repeated .TP.TopicNumberEntryArchive topic_numbers_map = 2; 218 | optional uint32 valid_through_char_index = 3; 219 | } 220 | 221 | message LayoutStateArchive { 222 | optional uint32 section_index = 1; 223 | optional uint32 section_page_index = 2; 224 | optional uint32 document_page_index = 3; 225 | optional uint32 last_page_count = 4; 226 | repeated .TP.SectionHintArchive section_hints = 5; 227 | } 228 | 229 | message CanvasSelectionArchive { 230 | required .TSWP.SelectionType type = 1; 231 | required .TSP.Range range = 2; 232 | required .TSWP.StyleInsertionBehavior style_insertion_behavior = 3; 233 | required .TSWP.CaretAffinity caret_affinity = 4; 234 | repeated .TSP.Reference infos = 5; 235 | repeated .TSP.Reference excluded_infos = 6; 236 | repeated .TSP.Reference additional_infos = 7; 237 | optional .TSP.Reference container = 8; 238 | } 239 | 240 | message ViewStateArchive { 241 | optional float OBSOLETE_view_scale = 1; 242 | optional .TSP.Point visible_rect_origin = 2; 243 | optional bool 
OBSOLETE_landscape = 3; 244 | optional .TSP.Reference selection = 4; 245 | optional .TSP.Reference selection_model = 5; 246 | optional bool master_drawables_selectable = 6; 247 | optional .TSP.Size visible_rect_size = 7; 248 | repeated .TSCH.ChartUIState chart_ui_state = 8; 249 | optional bool ruler_visible = 9 [default = true]; 250 | optional bool layout_borders_visible = 10 [default = false]; 251 | optional bool word_count_hud_visible = 11 [default = false]; 252 | optional bool shows_comments = 12 [default = true]; 253 | optional bool shows_page_navigator = 13 [default = false]; 254 | optional .TP.ViewScaleMode view_scale_mode = 14 [default = ViewScaleMode_UserDefined]; 255 | optional float view_scale = 15 [default = 1.25]; 256 | optional .TSP.Point window_frame_origin = 16; 257 | optional .TSP.Size window_frame_size = 17; 258 | optional string selected_inspector_switch_segment_identifier = 18; 259 | optional bool inspector_hidden = 19; 260 | } 261 | 262 | message UIStateArchive { 263 | optional .TSP.Reference layout_state = 1; 264 | optional .TSP.Reference view_state = 2; 265 | } 266 | 267 | -------------------------------------------------------------------------------- /iWork/pb-schema/TPCommandArchives.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "TSDArchives.proto"; 4 | import "TSKArchives.proto"; 5 | import "TSPMessages.proto"; 6 | import "TSWPArchives.proto"; 7 | import "TSSArchives.proto"; 8 | import "TSAArchives.proto"; 9 | import "TPArchives.proto"; 10 | import "TSWPCommandArchives.proto"; 11 | package TP; 12 | 13 | message InsertDrawablesCommandArchive { 14 | required .TSK.CommandArchive super = 1; 15 | optional uint32 page_index = 2; 16 | optional uint32 z_order = 3; 17 | repeated .TSP.Reference drawables = 4; 18 | optional bool forPaste = 5; 19 | } 20 | 21 | message PasteDrawablesCommandArchive { 22 | message DrawableAndPage { 23 | optional .TSP.Reference drawable = 1; 
24 | optional uint32 page_index = 2; 25 | } 26 | required .TSK.CommandArchive super = 1; 27 | repeated .TP.PasteDrawablesCommandArchive.DrawableAndPage drawables = 2; 28 | } 29 | 30 | message PasteAnchoredDrawablesCommandArchive { 31 | required .TSK.CommandGroupArchive deprecated_super = 1; 32 | optional .TSK.CommandArchive super = 4; 33 | optional bool select = 2; 34 | optional .TSP.Reference deprecated_undo_selection = 3; 35 | optional .TSP.Reference canvas_selection = 5; 36 | } 37 | 38 | message MoveDrawablesPageIndexCommandArchive { 39 | message Drawable { 40 | optional .TSP.Reference drawable = 1; 41 | optional uint32 z_order = 2; 42 | } 43 | required .TSK.CommandArchive super = 1; 44 | optional uint32 page_index = 2; 45 | repeated .TP.MoveDrawablesPageIndexCommandArchive.Drawable drawables = 3; 46 | } 47 | 48 | message InsertSectionBreakCommandArchive { 49 | required .TSWP.TextCommandArchive super = 1; 50 | } 51 | 52 | message DeleteSectionCommandArchive { 53 | required .TSK.CommandArchive super = 1; 54 | } 55 | 56 | message ReplaceSectionCommandArchive { 57 | required .TSWP.TextCommandArchive super = 1; 58 | } 59 | 60 | message ChangeSectionPropertyCommandArchive { 61 | required .TSK.CommandArchive super = 1; 62 | optional uint32 section_index = 2; 63 | optional string section_property = 3; 64 | optional bool section_value_bool = 4; 65 | } 66 | 67 | message SwapDrawableZOrderCommandArchive { 68 | message SwapPair { 69 | required uint32 z_order_1 = 1; 70 | required uint32 z_order_2 = 2; 71 | } 72 | required .TSK.CommandArchive super = 1; 73 | repeated .TP.SwapDrawableZOrderCommandArchive.SwapPair swap_pairs = 2; 74 | } 75 | 76 | message RemoveDrawablesCommandArchive { 77 | message DrawableInfo { 78 | optional uint32 page_index = 1; 79 | optional int32 z_order = 2; 80 | optional .TSP.Reference drawable_object = 3; 81 | } 82 | required .TSK.CommandArchive super = 1; 83 | repeated .TP.RemoveDrawablesCommandArchive.DrawableInfo drawables = 2; 84 | } 85 | 86 | 
message NudgeDrawablesCommandArchive { 87 | required .TSK.CommandArchive super = 1; 88 | repeated .TSP.Reference child_commands = 2; 89 | } 90 | 91 | message ChangeHeaderFooterVisibilityCommandArchive { 92 | enum TPHeaderFooterType { 93 | kTPHeaderType = 0; 94 | kTPFooterType = 1; 95 | } 96 | enum TPHeaderFragmentIndex { 97 | kTPHeaderLeft = 0; 98 | kTPHeaderCenter = 1; 99 | kTPHeaderRight = 2; 100 | } 101 | required .TSK.CommandArchive super = 1; 102 | optional bool visible = 2; 103 | optional .TP.ChangeHeaderFooterVisibilityCommandArchive.TPHeaderFooterType type = 3; 104 | optional .TP.ChangeHeaderFooterVisibilityCommandArchive.TPHeaderFragmentIndex index_to_select = 4; 105 | repeated .TSP.Reference storages = 5; 106 | } 107 | 108 | message ChangeSectionMarginsCommandArchive { 109 | enum SectionMargin { 110 | kSectionMarginLeft = 0; 111 | kSectionMarginRight = 1; 112 | kSectionMarginTop = 2; 113 | kSectionMarginBottom = 3; 114 | kSectionMarginHeader = 4; 115 | kSectionMarginFooter = 5; 116 | } 117 | required .TSK.CommandArchive super = 1; 118 | optional .TSP.Reference section = 2; 119 | optional .TP.ChangeSectionMarginsCommandArchive.SectionMargin margin = 3; 120 | optional float margin_value = 4; 121 | } 122 | 123 | message ChangeDocumentPrinterOptionsCommandArchive { 124 | required .TSK.CommandArchive super = 1; 125 | optional .TSP.Reference OBSOLETE_section = 2; 126 | optional float paper_width = 3; 127 | optional float paper_height = 4; 128 | optional float page_scale = 5; 129 | optional uint32 orientation = 6; 130 | optional string printer_id = 7; 131 | optional string paper_id = 8; 132 | } 133 | 134 | message InsertMasterDrawablesCommandArchive { 135 | required .TSK.CommandArchive super = 1; 136 | optional .TSP.Reference OBSOLETE_section = 2; 137 | repeated .TSP.Reference master_drawables = 3; 138 | optional uint32 drawable_index = 4; 139 | optional .TSP.Reference master_drawable_provider = 5; 140 | } 141 | 142 | message 
MoveMasterDrawableZOrderCommandArchive { 143 | required .TSK.CommandArchive super = 1; 144 | optional .TSP.Reference OBSOLETE_section = 2; 145 | repeated .TSP.Reference master_drawables = 3; 146 | repeated uint32 indexes = 4; 147 | optional .TSP.Reference master_drawable_provider = 5; 148 | } 149 | 150 | message RemoveMasterDrawablesCommandArchive { 151 | message MasterDrawable { 152 | optional .TSP.Reference drawable = 1; 153 | optional uint32 drawable_index = 2; 154 | } 155 | required .TSK.CommandArchive super = 1; 156 | optional .TSP.Reference OBSOLETE_section = 2; 157 | repeated .TP.RemoveMasterDrawablesCommandArchive.MasterDrawable master_drawables = 3; 158 | optional .TSP.Reference master_drawable_provider = 4; 159 | } 160 | 161 | message PasteMasterDrawablesCommandArchive { 162 | required .TSK.CommandArchive super = 1; 163 | optional .TSP.Reference section = 2; 164 | repeated .TSP.Reference master_drawables = 3; 165 | optional uint32 drawable_index = 4; 166 | } 167 | 168 | message MoveDrawablesAttachedCommandArchive { 169 | message FloatingUndo { 170 | optional .TSP.Reference drawable = 1; 171 | optional .TSD.GeometryArchive geometry = 2; 172 | optional .TSP.Reference attachment = 3; 173 | optional uint32 wrap_type = 4; 174 | optional uint32 page_index = 5; 175 | optional int32 z_order = 6; 176 | } 177 | required .TSK.CommandArchive super = 1; 178 | optional .TSWP.UndoTransaction undo_transaction = 2; 179 | repeated .TP.MoveDrawablesAttachedCommandArchive.FloatingUndo floating_undo = 3; 180 | optional bool select = 4; 181 | optional bool make_inline = 5; 182 | } 183 | 184 | message MoveDrawablesFloatingCommandArchive { 185 | message AttachedUndo { 186 | optional .TSP.Reference drawable = 1; 187 | optional .TSD.GeometryArchive geometry = 2; 188 | optional .TSP.Reference attachment = 3; 189 | optional uint32 page_index = 4; 190 | optional bool is_html_wrap = 5; 191 | optional uint32 type = 6; 192 | optional uint32 direction = 7; 193 | optional uint32 fit_type 
= 8; 194 | optional float margin = 9; 195 | optional float alpha_threshold = 10; 196 | optional .TSP.Reference storage = 11; 197 | optional .TSWP.UndoTransaction undo_transaction = 12; 198 | } 199 | required .TSK.CommandArchive super = 1; 200 | optional .TSWP.UndoTransaction deprecated_undo_transaction = 2; 201 | repeated .TP.MoveDrawablesFloatingCommandArchive.AttachedUndo attached_undo = 3; 202 | optional bool select = 4; 203 | } 204 | 205 | message RemoveAnchoredDrawableCommandArchive { 206 | required .TSWP.TextCommandArchive super = 1; 207 | optional uint32 char_index = 2; 208 | optional uint32 z_order = 3; 209 | } 210 | 211 | message ChangeFootnoteFormatCommandArchive { 212 | enum FootnoteFormat { 213 | kFootnoteFormatNumeric = 0; 214 | kFootnoteFormatRoman = 1; 215 | kFootnoteFormatSymbolic = 2; 216 | kFootnoteFormatJapaneseNumeric = 3; 217 | kFootnoteFormatJapaneseIdeographic = 4; 218 | } 219 | required .TSK.CommandArchive super = 1; 220 | optional .TP.ChangeFootnoteFormatCommandArchive.FootnoteFormat format = 2; 221 | } 222 | 223 | message ChangeFootnoteKindCommandArchive { 224 | enum FootnoteKind { 225 | kFootnoteKindFootnotes = 0; 226 | kFootnoteKindDocumentEndnotes = 1; 227 | kFootnoteKindSectionEndnotes = 2; 228 | } 229 | required .TSK.CommandArchive super = 1; 230 | optional .TP.ChangeFootnoteKindCommandArchive.FootnoteKind kind = 2; 231 | optional .TSWP.UndoTransaction undo_transaction = 3; 232 | } 233 | 234 | message ChangeFootnoteNumberingCommandArchive { 235 | enum FootnoteNumbering { 236 | kFootnoteNumberingContinuous = 0; 237 | kFootnoteNumberingRestartEachPage = 1; 238 | kFootnoteNumberingRestartEachSection = 2; 239 | } 240 | required .TSK.CommandArchive super = 1; 241 | optional .TP.ChangeFootnoteNumberingCommandArchive.FootnoteNumbering numbering = 2; 242 | } 243 | 244 | message ChangeFootnoteSpacingCommandArchive { 245 | required .TSK.CommandArchive super = 1; 246 | optional int32 footnote_spacing = 2; 247 | } 248 | 249 | message 
MoveInlineDrawableAnchoredCommandArchive { 250 | required .TSK.CommandArchive super = 1; 251 | optional .TSP.Reference drawable = 2; 252 | optional bool is_html_wrap = 3; 253 | optional uint32 type = 4; 254 | optional uint32 direction = 5; 255 | optional uint32 fit_type = 6; 256 | optional float margin = 7; 257 | optional float alpha_threshold = 8; 258 | } 259 | 260 | message MoveAnchoredDrawableInlineCommandArchive { 261 | required .TSK.CommandArchive super = 1; 262 | optional .TSP.Reference drawable = 2; 263 | optional bool is_html_wrap = 3; 264 | optional uint32 type = 4; 265 | optional uint32 direction = 5; 266 | optional uint32 fit_type = 6; 267 | optional float margin = 7; 268 | optional float alpha_threshold = 8; 269 | optional uint32 z_order = 9; 270 | } 271 | 272 | message InsertFootnoteCommandArchive { 273 | required .TSWP.TextCommandArchive super = 1; 274 | } 275 | 276 | message ToggleBodyLayoutDirectionCommandArchive { 277 | required .TSK.CommandArchive super = 1; 278 | optional bool new_direction_is_vertical = 2; 279 | } 280 | 281 | message ChangeCTVisibilityCommandArchive { 282 | required .TSK.CommandArchive super = 1; 283 | optional bool markup_visible = 2; 284 | optional bool deletions_visible = 3; 285 | optional uint32 selection_range_location = 4; 286 | optional uint32 selection_range_length = 5; 287 | } 288 | 289 | message TrackChangesCommandArchive { 290 | required .TSK.CommandArchive super = 1; 291 | optional bool track_changes = 2; 292 | repeated .TSP.Reference change_session_history = 3; 293 | optional bool paused = 4; 294 | } 295 | 296 | message DocumentHyphenationCommandArchive { 297 | required .TSK.CommandArchive super = 1; 298 | optional bool hyphenate_document = 2; 299 | } 300 | 301 | message DocumentLigaturesCommandArchive { 302 | required .TSK.CommandArchive super = 1; 303 | optional bool use_ligatures = 2; 304 | } 305 | 306 | message DocumentHasBodyCommandArchive { 307 | required .TSK.CommandArchive super = 1; 308 | optional bool 
has_body = 2; 309 | } 310 | 311 | message PauseChangeTrackingCommandArchive { 312 | required .TSK.CommandArchive super = 1; 313 | optional bool paused = 2; 314 | } 315 | 316 | -------------------------------------------------------------------------------- /iWork/pb-schema/TSAArchives.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "TSKArchives.proto"; 4 | import "TSPMessages.proto"; 5 | import "TSWPArchives.proto"; 6 | import "TSSArchives.proto"; 7 | package TSA; 8 | 9 | message DocumentArchive { 10 | required .TSK.DocumentArchive super = 1; 11 | repeated .TSWP.TextPresetDisplayItemArchive text_preset_display_items = 2; 12 | optional string creation_language = 3; 13 | optional .TSP.Reference calculation_engine = 4; 14 | optional .TSP.Reference view_state = 5; 15 | optional .TSP.Reference function_browser_state = 6; 16 | optional .TSP.Reference tables_custom_format_list = 7; 17 | optional bool needs_movie_compatibility_upgrade = 8; 18 | optional string template_identifier = 9; 19 | } 20 | 21 | message FunctionBrowserStateArchive { 22 | repeated uint32 recent_functions = 1; 23 | repeated uint32 back_functions = 2; 24 | repeated uint32 forward_functions = 3; 25 | optional uint32 current_function = 4; 26 | } 27 | 28 | message TestDocumentArchive { 29 | required .TSA.DocumentArchive super = 1; 30 | optional string value = 2; 31 | } 32 | 33 | message PropagatePresetCommandArchive { 34 | required .TSK.CommandArchive super = 1; 35 | } 36 | 37 | -------------------------------------------------------------------------------- /iWork/pb-schema/TSCH3DArchives.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "TSPMessages.proto"; 4 | import "TSDArchives.proto"; 5 | package TSCH; 6 | 7 | enum TextureTilingMode { 8 | textureTilingModeNone = 0; 9 | textureTilingModeTallest = 1; 10 | } 11 | 12 | enum TextureTilingFace { 13 | 
textureTilingFaceAll = 0; 14 | textureTilingFaceTopAndBottom = 1; 15 | textureTilingFaceSide = 2; 16 | } 17 | 18 | enum TextureTilingWrap { 19 | textureTilingWrapProjected = 0; 20 | textureTilingWrapFaceWrap = 1; 21 | } 22 | 23 | enum TextureTilingXPosition { 24 | textureTilingXPositionLeft = 0; 25 | textureTilingXPositionCenter = 1; 26 | textureTilingXPositionRight = 2; 27 | } 28 | 29 | enum TextureTilingYPosition { 30 | textureTilingYPositionTop = 0; 31 | textureTilingYPositionMiddle = 1; 32 | textureTilingYPositionBottom = 2; 33 | } 34 | 35 | enum TextureTilingContinuity { 36 | textureTilingContinuityNone = 0; 37 | textureTilingContinuityGlobal = 1; 38 | textureTilingContinuitySeries = 2; 39 | textureTilingContinuityJittered = 3; 40 | } 41 | 42 | enum FillPropertyType { 43 | fillPropertyTypeUndefined = 0; 44 | fillPropertyTypeArea = 1; 45 | fillPropertyTypeBar = 2; 46 | fillPropertyTypeColumn = 3; 47 | fillPropertyTypeLine = 4; 48 | fillPropertyTypePie = 5; 49 | } 50 | 51 | message Chart3DEnvironmentPackageArchive { 52 | repeated .TSCH.Chart3DEnvironmentMaterialArchive materials = 1; 53 | } 54 | 55 | message Chart3DFillArchive { 56 | optional .TSCH.Chart3DLightingModelArchive lightingmodel = 1; 57 | optional string textureset_id = 2; 58 | optional .TSCH.FillPropertyType fill_type = 3; 59 | optional uint32 series_index = 4; 60 | } 61 | 62 | message Chart3DPointLightArchive { 63 | required .TSCH.Chart3DVectorArchive position = 1; 64 | } 65 | 66 | message Chart3DDirectionalLightArchive { 67 | required .TSCH.Chart3DVectorArchive direction = 1; 68 | } 69 | 70 | message Chart3DSpotLightArchive { 71 | required .TSCH.Chart3DVectorArchive position = 1; 72 | required .TSCH.Chart3DVectorArchive direction = 2; 73 | required float cutoff = 3; 74 | required float dropoff = 4; 75 | } 76 | 77 | message Chart3DLightArchive { 78 | required string name = 1; 79 | required .TSCH.Chart3DVectorArchive ambient_color = 2; 80 | required .TSCH.Chart3DVectorArchive diffuse_color = 3; 81 | 
required .TSCH.Chart3DVectorArchive specular_color = 4; 82 | required float intensity = 5; 83 | required .TSCH.Chart3DVectorArchive attenuation = 6; 84 | required uint32 coordinate_space = 7; 85 | required bool enabled = 8; 86 | optional .TSCH.Chart3DPointLightArchive point_light = 9; 87 | optional .TSCH.Chart3DDirectionalLightArchive directional_light = 10; 88 | optional .TSCH.Chart3DSpotLightArchive spot_light = 11; 89 | } 90 | 91 | message Chart3DLightingModelArchive { 92 | optional .TSCH.Chart3DPhongLightingModelArchive phong = 1; 93 | optional .TSCH.Chart3DFixedFunctionLightingModelArchive fixed_function = 2; 94 | optional .TSCH.Chart3DEnvironmentPackageArchive environment = 3; 95 | } 96 | 97 | message Chart3DLightingPackageArchive { 98 | required string name = 1; 99 | repeated .TSCH.Chart3DLightArchive lights = 2; 100 | } 101 | 102 | message Chart3DTexturesMaterialArchive { 103 | required .TSCH.Chart3DVectorArchive color = 1; 104 | repeated .TSCH.Chart3DTSPImageDataTextureArchive textures = 2; 105 | } 106 | 107 | message Chart3DEmissiveMaterialArchive { 108 | required .TSCH.Chart3DTexturesMaterialArchive super = 1; 109 | repeated .TSCH.Chart3DImageTextureTilingArchive tilings = 2; 110 | } 111 | 112 | message Chart3DDiffuseMaterialArchive { 113 | required .TSCH.Chart3DTexturesMaterialArchive super = 1; 114 | repeated .TSCH.Chart3DImageTextureTilingArchive tilings = 2; 115 | } 116 | 117 | message Chart3DModulateMaterialArchive { 118 | required .TSCH.Chart3DTexturesMaterialArchive super = 1; 119 | repeated .TSCH.Chart3DImageTextureTilingArchive tilings = 2; 120 | } 121 | 122 | message Chart3DSpecularMaterialArchive { 123 | required .TSCH.Chart3DTexturesMaterialArchive super = 1; 124 | repeated .TSCH.Chart3DImageTextureTilingArchive tilings = 2; 125 | } 126 | 127 | message Chart3DShininessMaterialArchive { 128 | required .TSCH.Chart3DTexturesMaterialArchive super = 1; 129 | repeated .TSCH.Chart3DImageTextureTilingArchive tilings = 2; 130 | } 131 | 132 | message 
Chart3DEnvironmentMaterialArchive { 133 | required .TSCH.Chart3DTexturesMaterialArchive super = 1; 134 | repeated .TSCH.Chart3DBaseImageTextureTilingArchive OBSOLETE_tilings = 2; 135 | optional bool decalMode = 3; 136 | repeated .TSCH.Chart3DImageTextureTilingArchive tilings = 4; 137 | } 138 | 139 | message Chart3DFixedFunctionLightingModelArchive { 140 | required .TSCH.Chart3DPhongMaterialPackageArchive materials = 1; 141 | } 142 | 143 | message Chart3DPhongLightingModelArchive { 144 | required .TSCH.Chart3DPhongMaterialPackageArchive materials = 1; 145 | } 146 | 147 | message Chart3DPhongMaterialPackageArchive { 148 | optional .TSCH.Chart3DEmissiveMaterialArchive emissive = 1; 149 | optional .TSCH.Chart3DDiffuseMaterialArchive diffuse = 2; 150 | optional .TSCH.Chart3DModulateMaterialArchive modulate = 3; 151 | optional .TSCH.Chart3DSpecularMaterialArchive specular = 4; 152 | optional .TSCH.Chart3DShininessMaterialArchive shininess = 5; 153 | } 154 | 155 | message Chart3DTSPImageDataTextureArchive { 156 | optional .TSP.DataReference data = 3; 157 | optional .TSP.DataReference mipmapdata = 4; 158 | optional .TSP.Reference database_data = 1; 159 | optional .TSP.Reference database_mipmapdata = 2; 160 | } 161 | 162 | message Chart3DBaseImageTextureTilingArchive { 163 | optional .TSCH.Chart3DVectorArchive scale = 1; 164 | optional float rotation = 2; 165 | } 166 | 167 | message Chart3DImageTextureTilingArchive { 168 | required .TSCH.Chart3DBaseImageTextureTilingArchive super = 1; 169 | optional .TSCH.TextureTilingMode mode = 2; 170 | optional .TSCH.TextureTilingWrap wrap = 3; 171 | optional .TSCH.TextureTilingFace face = 4; 172 | optional .TSCH.TextureTilingXPosition xposition = 5; 173 | optional .TSCH.TextureTilingYPosition yposition = 6; 174 | optional .TSCH.TextureTilingContinuity scontinuity = 7; 175 | optional .TSCH.TextureTilingContinuity tcontinuity = 8; 176 | optional bool reveal = 9; 177 | } 178 | 179 | message Chart3DVectorArchive { 180 | required float x = 
1; 181 | required float y = 2; 182 | required float z = 3; 183 | required float w = 4; 184 | } 185 | 186 | extend .TSD.FillArchive { 187 | optional .TSCH.Chart3DFillArchive fill3d = 100; 188 | } 189 | 190 | -------------------------------------------------------------------------------- /iWork/pb-schema/TSCHArchives.Common.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "TSPMessages.proto"; 4 | import "TSKArchives.proto"; 5 | import "TSDArchives.proto"; 6 | import "TSSArchives.proto"; 7 | import "TSCH3DArchives.proto"; 8 | package TSCH; 9 | 10 | enum ChartType { 11 | undefinedChartType = 0; 12 | columnChartType2D = 1; 13 | barChartType2D = 2; 14 | lineChartType2D = 3; 15 | areaChartType2D = 4; 16 | pieChartType2D = 5; 17 | stackedColumnChartType2D = 6; 18 | stackedBarChartType2D = 7; 19 | stackedAreaChartType2D = 8; 20 | scatterChartType2D = 9; 21 | mixedChartType2D = 10; 22 | twoAxisChartType2D = 11; 23 | columnChartType3D = 12; 24 | barChartType3D = 13; 25 | lineChartType3D = 14; 26 | areaChartType3D = 15; 27 | pieChartType3D = 16; 28 | stackedColumnChartType3D = 17; 29 | stackedBarChartType3D = 18; 30 | stackedAreaChartType3D = 19; 31 | multiDataColumnChartType2D = 20; 32 | multiDataBarChartType2D = 21; 33 | bubbleChartType2D = 22; 34 | multiDataScatterChartType2D = 23; 35 | multiDataBubbleChartType2D = 24; 36 | } 37 | 38 | enum AxisType { 39 | axis_type_unknown = 0; 40 | axis_type_x = 1; 41 | axis_type_y = 2; 42 | axis_type_pie = 3; 43 | axis_type_size = 4; 44 | } 45 | 46 | enum ScatterFormat { 47 | scatter_format_unknown = 0; 48 | scatter_format_separate_x = 1; 49 | scatter_format_shared_x = 2; 50 | } 51 | 52 | enum SeriesDirection { 53 | series_direction_unknown = 0; 54 | series_direction_by_row = 1; 55 | series_direction_by_column = 2; 56 | } 57 | 58 | enum NumberValueType { 59 | numberValueTypeDecimal = 0; 60 | numberValueTypeCurrency = 1; 61 | numberValueTypePercentage = 2; 62 | 
numberValueTypeScientific = 3; 63 | numberValueTypeFraction = 4; 64 | numberValueTypeBase = 5; 65 | numberValueTypeUnknown = -999; 66 | } 67 | 68 | enum NegativeNumberStyle { 69 | negativeNumberStyleMinus = 0; 70 | negativeNumberStyleRed = 1; 71 | negativeNumberStyleParentheses = 2; 72 | negativeNumberStyleRedAndParentheses = 3; 73 | negativeNumberStyleNone = 4; 74 | } 75 | 76 | enum FractionAccuracy { 77 | fractionAccuracyConflicting = 0; 78 | fractionAccuracyUpToOneDigit = -1; 79 | fractionAccuracyUpToTwoDigits = -2; 80 | fractionAccuracyUpToThreeDigits = -3; 81 | fractionAccuracyHalves = 2; 82 | fractionAccuracyQuarters = 4; 83 | fractionAccuracyEighths = 8; 84 | fractionAccuracySixteenths = 16; 85 | fractionAccuracyTenths = 10; 86 | fractionAccuracyHundredths = 100; 87 | } 88 | 89 | message SparseReferenceArrayArchive { 90 | message Entry { 91 | required uint32 index = 1; 92 | required .TSP.Reference value = 2; 93 | } 94 | required uint32 num_entries = 1; 95 | repeated .TSCH.SparseReferenceArrayArchive.Entry entries = 2; 96 | } 97 | 98 | message RectArchive { 99 | required .TSP.Point origin = 1; 100 | required .TSP.Size size = 2; 101 | } 102 | 103 | message ChartsNSNumberDoubleArchive { 104 | optional double number_archive = 1; 105 | } 106 | 107 | message ChartsNSArrayOfNSNumberDoubleArchive { 108 | repeated double numbers = 1; 109 | } 110 | 111 | message DEPRECATEDChart3DFillArchive { 112 | optional .TSD.FillArchive fill = 1; 113 | optional .TSCH.Chart3DLightingModelArchive lightingmodel = 2; 114 | optional string textureset_id = 3; 115 | optional .TSCH.FillPropertyType fill_type = 4; 116 | optional uint32 series_index = 5; 117 | } 118 | 119 | message ChartStyleArchive { 120 | optional .TSS.StyleArchive super = 1; 121 | extensions 10000 to 536870911; 122 | } 123 | 124 | message ChartNonStyleArchive { 125 | optional .TSS.StyleArchive super = 1; 126 | extensions 10000 to 536870911; 127 | } 128 | 129 | message LegendStyleArchive { 130 | optional .TSS.StyleArchive 
super = 1; 131 | extensions 10000 to 536870911; 132 | } 133 | 134 | message LegendNonStyleArchive { 135 | optional .TSS.StyleArchive super = 1; 136 | extensions 10000 to 536870911; 137 | } 138 | 139 | message ChartAxisStyleArchive { 140 | optional .TSS.StyleArchive super = 1; 141 | extensions 10000 to 536870911; 142 | } 143 | 144 | message ChartAxisNonStyleArchive { 145 | optional .TSS.StyleArchive super = 1; 146 | extensions 10000 to 536870911; 147 | } 148 | 149 | message ChartSeriesStyleArchive { 150 | optional .TSS.StyleArchive super = 1; 151 | extensions 10000 to 536870911; 152 | } 153 | 154 | message ChartSeriesNonStyleArchive { 155 | optional .TSS.StyleArchive super = 1; 156 | extensions 10000 to 536870911; 157 | } 158 | 159 | -------------------------------------------------------------------------------- /iWork/pb-schema/TSCHArchives.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "TSPMessages.proto"; 4 | import "TSKArchives.proto"; 5 | import "TSDArchives.proto"; 6 | import "TSSArchives.proto"; 7 | import "TSCHArchives.Common.proto"; 8 | import "TSCHArchives.GEN.proto"; 9 | import "TSCH3DArchives.proto"; 10 | import "TSCHPreUFFArchives.proto"; 11 | package TSCH; 12 | 13 | message ChartDrawableArchive { 14 | optional .TSD.DrawableArchive super = 1; 15 | extensions 10000 to 536870911; 16 | } 17 | 18 | message ChartArchive { 19 | optional .TSCH.ChartType chart_type = 1; 20 | optional .TSCH.ScatterFormat scatter_format = 2; 21 | optional .TSCH.RectArchive legend_frame = 3; 22 | optional .TSP.Reference preset = 4; 23 | optional .TSCH.SeriesDirection series_direction = 5; 24 | optional bool contains_default_data = 6; 25 | optional .TSCH.ChartGridArchive grid = 7; 26 | optional .TSP.Reference mediator = 8; 27 | optional .TSP.Reference chart_style = 9; 28 | optional .TSP.Reference chart_non_style = 10; 29 | optional .TSP.Reference legend_style = 11; 30 | optional .TSP.Reference 
legend_non_style = 12; 31 | repeated .TSP.Reference value_axis_styles = 13; 32 | repeated .TSP.Reference value_axis_nonstyles = 14; 33 | repeated .TSP.Reference category_axis_styles = 15; 34 | repeated .TSP.Reference category_axis_nonstyles = 16; 35 | repeated .TSP.Reference series_theme_styles = 17; 36 | optional .TSCH.SparseReferenceArrayArchive series_private_styles = 18; 37 | optional .TSCH.SparseReferenceArrayArchive series_non_styles = 19; 38 | repeated .TSP.Reference paragraph_styles = 20; 39 | optional uint32 multidataset_index = 21; 40 | optional bool needs_calc_engine_deferred_import_action = 22; 41 | extensions 10000 to 536870911; 42 | extend .TSCH.ChartDrawableArchive { 43 | optional .TSCH.ChartArchive unity = 10000; 44 | } 45 | } 46 | 47 | message ChartPasteboardAdditionsArchive { 48 | extend .TSCH.ChartArchive { 49 | optional uint32 preset_index_for_pasteboard = 10000; 50 | optional bytes preset_uuid_for_pasteboard = 10001; 51 | } 52 | } 53 | 54 | message ChartGridArchive { 55 | message GridRow { 56 | message GridValue { 57 | optional double numeric_value = 1; 58 | optional double date_value = 2; 59 | } 60 | repeated .TSCH.ChartGridArchive.GridRow.GridValue value = 1; 61 | } 62 | repeated string row_name = 1; 63 | repeated string column_name = 2; 64 | repeated .TSCH.ChartGridArchive.GridRow grid_row = 3; 65 | } 66 | 67 | message ChartMediatorArchive { 68 | optional .TSP.Reference info = 1; 69 | repeated uint32 local_series_indexes = 2; 70 | repeated uint32 remote_series_indexes = 3; 71 | } 72 | 73 | message ChartStylePreset { 74 | optional .TSP.Reference chart_style = 1; 75 | optional .TSP.Reference legend_style = 2; 76 | repeated .TSP.Reference value_axis_styles = 3; 77 | repeated .TSP.Reference category_axis_styles = 4; 78 | repeated .TSP.Reference series_styles = 5; 79 | repeated .TSP.Reference paragraph_styles = 6; 80 | optional bytes uuid = 7; 81 | } 82 | 83 | message ChartPresetsArchive { 84 | repeated .TSP.Reference chart_presets = 1; 85 | 
extend .TSS.ThemeArchive { 86 | required .TSCH.ChartPresetsArchive extension = 120; 87 | } 88 | } 89 | 90 | message PropertyValueStorageContainerArchive { 91 | optional .TSP.Reference chart_style = 1; 92 | optional .TSP.Reference chart_nonstyle = 2; 93 | optional .TSP.Reference legend_style = 3; 94 | optional .TSP.Reference legend_nonstyle = 4; 95 | optional .TSCH.SparseReferenceArrayArchive value_axis_styles = 5; 96 | optional .TSCH.SparseReferenceArrayArchive value_axis_nonstyles = 6; 97 | optional .TSCH.SparseReferenceArrayArchive category_axis_styles = 7; 98 | optional .TSCH.SparseReferenceArrayArchive category_axis_nonstyles = 8; 99 | optional .TSCH.SparseReferenceArrayArchive series_theme_styles = 9; 100 | optional .TSCH.SparseReferenceArrayArchive series_private_styles = 10; 101 | optional .TSCH.SparseReferenceArrayArchive series_nonstyles = 11; 102 | optional .TSCH.SparseReferenceArrayArchive paragraph_styles = 12; 103 | } 104 | 105 | message StylePasteboardDataArchive { 106 | optional .TSS.StyleArchive super = 1; 107 | optional .TSCH.PropertyValueStorageContainerArchive style_network = 2; 108 | optional bool copied_from_entire_chart = 3; 109 | } 110 | 111 | message ChartSelectionPathTypeArchive { 112 | optional string path_type = 1; 113 | optional string path_name = 2; 114 | } 115 | 116 | message ChartAxisIDArchive { 117 | optional .TSCH.AxisType axis_type = 1; 118 | optional uint32 ordinal = 2; 119 | } 120 | 121 | message ChartSelectionPathArgumentArchive { 122 | optional uint32 number = 1; 123 | optional .TSCH.ChartAxisIDArchive axis_id = 2; 124 | } 125 | 126 | message ChartSelectionPathArchive { 127 | optional .TSCH.ChartSelectionPathTypeArchive path_type = 1; 128 | optional .TSCH.ChartSelectionPathArchive sub_selection = 2; 129 | repeated .TSCH.ChartSelectionPathArgumentArchive arguments = 3; 130 | } 131 | 132 | message ChartSelectionArchive { 133 | optional .TSP.Reference chart = 1; 134 | repeated .TSCH.ChartSelectionPathArchive paths = 2; 135 | } 136 
| 137 | message ChartUIState { 138 | optional .TSP.Reference chart = 1; 139 | optional int32 cde_last_row_selected = 2; 140 | optional int32 cde_last_col_selected = 3; 141 | optional int32 cde_last_row_count = 4; 142 | optional int32 cde_last_col_count = 5; 143 | } 144 | 145 | message ChartFormatStructExtensions { 146 | extend .TSK.FormatStructArchive { 147 | optional string prefix = 10000; 148 | optional string suffix = 10001; 149 | } 150 | } 151 | 152 | extend .TSCH.ChartArchive { 153 | optional bool scene3d_settings_constant_depth = 10002; 154 | } 155 | 156 | -------------------------------------------------------------------------------- /iWork/pb-schema/TSCHCommandArchives.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "TSPMessages.proto"; 4 | import "TSKArchives.proto"; 5 | import "TSSArchives.proto"; 6 | import "TSDArchives.proto"; 7 | import "TSCHArchives.Common.proto"; 8 | import "TSCHArchives.GEN.proto"; 9 | import "TSCH3DArchives.proto"; 10 | import "TSCHArchives.proto"; 11 | package TSCH; 12 | 13 | enum StyleOwnerType { 14 | chart_info = 1; 15 | legend_model = 2; 16 | chart_axis = 3; 17 | chart_series = 4; 18 | } 19 | 20 | enum StyleSwapType { 21 | chart_style = 1; 22 | chart_non_style = 2; 23 | legend_style = 3; 24 | legend_non_style = 4; 25 | value_axis_style = 5; 26 | value_axis_non_style = 6; 27 | category_axis_style = 7; 28 | category_axis_non_style = 8; 29 | series_theme_style = 9; 30 | series_private_style = 10; 31 | series_non_style = 11; 32 | paragraph_style = 12; 33 | } 34 | 35 | enum ApplyPresetBehavior { 36 | remove_overrides = 1; 37 | preserve_overrides = 2; 38 | preserve_appearance = 3; 39 | } 40 | 41 | message CommandSetChartTypeArchive { 42 | required .TSCH.ChartCommandArchive super = 1; 43 | required .TSCH.ChartType oldChartType = 2; 44 | required .TSCH.ChartType newChartType = 3; 45 | optional .TSP.Reference info_geometry_command = 5; 46 | optional 
.TSP.Reference anchor_attachment_command = 6; 47 | optional .TSCH.RectArchive original_legend_rect = 7; 48 | } 49 | 50 | message PropertyMutationUndoTupleArchive { 51 | optional .TSP.Reference chart_info = 1; 52 | optional .TSCH.StyleOwnerType style_owner_type = 2; 53 | optional uint32 index = 3; 54 | optional .TSP.Reference old_style = 4; 55 | optional .TSP.Reference new_style = 5; 56 | optional .TSP.Reference old_non_style = 6; 57 | optional .TSP.Reference new_non_style = 7; 58 | } 59 | 60 | message StyleSwapUndoTupleArchive { 61 | optional .TSP.Reference chart_info = 1; 62 | optional .TSCH.StyleSwapType swap_type = 2; 63 | optional uint32 index = 3; 64 | optional .TSP.Reference old_value = 4; 65 | optional .TSP.Reference new_value = 5; 66 | } 67 | 68 | message CommandStyleSwapArchive { 69 | required .TSCH.ChartCommandArchive super = 1; 70 | repeated .TSCH.StyleSwapUndoTupleArchive undo_tuples = 2; 71 | } 72 | 73 | message CommandSetSeriesNameArchive { 74 | required .TSCH.ChartCommandArchive super = 1; 75 | required uint32 seriesIndex = 2; 76 | required string oldName = 3; 77 | required string newName = 4; 78 | } 79 | 80 | message CommandSetCategoryNameArchive { 81 | required .TSCH.ChartCommandArchive super = 1; 82 | required uint32 categoryIndex = 2; 83 | required string oldName = 3; 84 | required string newName = 4; 85 | optional bool isMultiDataIndex = 5; 86 | } 87 | 88 | message CommandAddGridRowsArchive { 89 | required .TSCH.ChartCommandArchive super = 1; 90 | required uint32 location = 2; 91 | optional uint32 length = 3; 92 | repeated string name = 4; 93 | } 94 | 95 | message CommandAddGridColumnsArchive { 96 | required .TSCH.ChartCommandArchive super = 1; 97 | required uint32 location = 2; 98 | optional uint32 length = 3; 99 | repeated string name = 4; 100 | } 101 | 102 | message CommandMoveGridRowsArchive { 103 | required .TSCH.ChartCommandArchive super = 1; 104 | required uint32 location = 2; 105 | required uint32 length = 3; 106 | required int32 
afterrow = 4; 107 | } 108 | 109 | message CommandMoveGridColumnsArchive { 110 | required .TSCH.ChartCommandArchive super = 1; 111 | required uint32 location = 2; 112 | required uint32 length = 3; 113 | required int32 aftercolumn = 4; 114 | } 115 | 116 | message CommandDeleteGridRowsArchive { 117 | message ValueRow { 118 | repeated double value = 5; 119 | } 120 | required .TSCH.ChartCommandArchive super = 1; 121 | required uint32 location = 2; 122 | required uint32 length = 3; 123 | repeated string names = 4; 124 | repeated .TSCH.CommandDeleteGridRowsArchive.ValueRow value_row = 6; 125 | optional bool removed_all_rows = 7; 126 | repeated string column_names = 8; 127 | } 128 | 129 | message CommandDeleteGridColumnsArchive { 130 | message ValueRow { 131 | repeated double value = 5; 132 | } 133 | required .TSCH.ChartCommandArchive super = 1; 134 | required uint32 location = 2; 135 | required uint32 length = 3; 136 | repeated string names = 4; 137 | repeated .TSCH.CommandDeleteGridColumnsArchive.ValueRow value_row = 6; 138 | optional bool removed_all_columns = 7; 139 | repeated string row_names = 8; 140 | } 141 | 142 | message CommandSetPreviewLocArchive { 143 | required .TSCH.ChartCommandArchive super = 1; 144 | required .TSP.Point old_loc = 2; 145 | required .TSP.Point new_loc = 3; 146 | } 147 | 148 | message CommandSetGridValueArchive { 149 | required .TSCH.ChartCommandArchive super = 1; 150 | required uint32 rowindex = 2; 151 | required uint32 columnindex = 3; 152 | required double oldvalue = 4; 153 | required double newvalue = 5; 154 | } 155 | 156 | message CommandSetGridDirectionArchive { 157 | required .TSCH.ChartCommandArchive super = 1; 158 | required uint32 old_direction = 2; 159 | required uint32 new_direction = 3; 160 | } 161 | 162 | message CommandSetSeriesTypeArchive { 163 | required .TSCH.ChartCommandArchive super = 1; 164 | required uint32 seriesindex = 2; 165 | required uint32 oldtype = 3; 166 | required uint32 newtype = 4; 167 | } 168 | 169 | message 
CommandSetScatterFormatArchive { 170 | required .TSCH.ChartCommandArchive super = 1; 171 | required uint32 oldformat = 3; 172 | required uint32 newformat = 4; 173 | } 174 | 175 | message CommandSetMultiDataSetIndexArchive { 176 | required .TSCH.ChartCommandArchive super = 1; 177 | required uint32 oldindex = 2; 178 | required uint32 newindex = 3; 179 | } 180 | 181 | message CommandSetLegendFrameArchive { 182 | required .TSCH.ChartCommandArchive super = 1; 183 | required .TSCH.RectArchive old_legend_frame = 2; 184 | required .TSCH.RectArchive new_legend_frame = 3; 185 | } 186 | 187 | message CommandSetPieWedgeExplosion { 188 | required .TSCH.ChartCommandArchive super = 1; 189 | optional .TSP.Reference info_geometry_command = 2; 190 | optional .TSP.Reference style_mutation_command = 3; 191 | optional .TSP.Reference anchor_attachment_command = 4; 192 | } 193 | 194 | message SynchronousCommandArchive { 195 | required .TSK.CommandArchive super = 1; 196 | required .TSP.Reference command = 2; 197 | } 198 | 199 | message CommandReplaceAllArchive { 200 | required .TSK.ReplaceAllChildCommandArchive super = 1; 201 | required .TSP.Reference info = 2; 202 | repeated .TSP.Reference commands = 3; 203 | } 204 | 205 | message CommandChartApplyTheme { 206 | required .TSS.ApplyThemeChildCommandArchive super = 1; 207 | required .TSP.Reference info = 2; 208 | optional .TSP.Reference swap_command = 3; 209 | optional .TSP.Reference preset = 4; 210 | } 211 | 212 | message CommandChartApplyPreset { 213 | required .TSCH.ChartCommandArchive super = 1; 214 | optional .TSP.Reference swap_command = 2; 215 | optional .TSP.Reference preset = 3; 216 | optional .TSCH.ApplyPresetBehavior behavior = 4 [default = remove_overrides]; 217 | } 218 | 219 | message CommandCleanupGeometryArchive { 220 | required .TSCH.ChartCommandArchive super = 1; 221 | optional .TSP.Reference info_geometry_command = 2; 222 | optional .TSP.Reference anchor_attachment_command = 3; 223 | optional .TSCH.RectArchive 
original_legend_rect = 4; 224 | } 225 | 226 | message ChartCommandArchive { 227 | required .TSK.CommandArchive super = 1; 228 | required .TSP.Reference info = 2; 229 | optional bool original_dirty_state = 3; 230 | } 231 | 232 | message CommandReplaceGridValuesArchive { 233 | message ValueRow { 234 | repeated double value = 1; 235 | } 236 | required .TSCH.ChartCommandArchive super = 1; 237 | required uint32 rowindex = 2; 238 | required uint32 columnindex = 3; 239 | repeated .TSCH.CommandReplaceGridValuesArchive.ValueRow old_data_rows = 4; 240 | repeated .TSCH.CommandReplaceGridValuesArchive.ValueRow new_data_rows = 5; 241 | repeated string old_row_names = 6; 242 | repeated string new_row_names = 7; 243 | repeated string old_col_names = 8; 244 | repeated string new_col_names = 9; 245 | optional uint32 rows_to_add = 10; 246 | optional uint32 cols_to_add = 11; 247 | } 248 | 249 | message CommandReplaceThemePresetArchive { 250 | message StyleAndIdentifierPair { 251 | required .TSP.Reference style = 1; 252 | required string identifier = 2; 253 | } 254 | required .TSK.CommandArchive super = 1; 255 | required .TSP.Reference theme = 2; 256 | optional .TSP.Reference old_preset = 3; 257 | optional .TSP.Reference new_preset = 4; 258 | repeated .TSCH.CommandReplaceThemePresetArchive.StyleAndIdentifierPair identifiers = 5; 259 | optional uint64 old_preset_index = 6; 260 | } 261 | 262 | message CommandInvalidateWPCaches { 263 | required .TSK.CommandArchive super = 1; 264 | } 265 | 266 | -------------------------------------------------------------------------------- /iWork/pb-schema/TSDCommandArchives.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "TSPMessages.proto"; 4 | import "TSKArchives.proto"; 5 | import "TSSArchives.proto"; 6 | import "TSDArchives.proto"; 7 | package TSD; 8 | 9 | message ConnectionLineConnectCommandArchive { 10 | required .TSK.CommandArchive super = 1; 11 | required 
.TSP.Reference connection_line = 2; 12 | optional .TSP.Reference connect_to = 3; 13 | optional .TSP.Reference connect_from = 4; 14 | optional .TSP.Reference old_connect_to = 5; 15 | optional .TSP.Reference old_connect_from = 6; 16 | } 17 | 18 | message GroupDrawablesCommandArchive { 19 | required .TSK.CommandArchive super = 1; 20 | repeated .TSP.Reference drawables = 2; 21 | optional .TSP.Reference group = 3; 22 | } 23 | 24 | message UngroupGroupCommandArchive { 25 | required .TSK.CommandArchive super = 1; 26 | repeated .TSP.Reference drawables = 2; 27 | optional .TSP.Reference group = 3; 28 | } 29 | 30 | message ContainerRemoveChildrenCommandArchive { 31 | required .TSK.CommandArchive super = 1; 32 | optional .TSP.Reference container = 2; 33 | repeated .TSP.Reference children = 3; 34 | optional .TSP.IndexSet children_indices = 4; 35 | } 36 | 37 | message ContainerInsertChildrenCommandArchive { 38 | required .TSK.CommandArchive super = 1; 39 | optional .TSP.Reference container = 2; 40 | repeated .TSP.Reference children = 3; 41 | optional uint64 index = 4; 42 | } 43 | 44 | message ContainerReorderChildrenCommandArchive { 45 | required .TSK.CommandArchive super = 1; 46 | optional .TSP.Reference container = 2; 47 | repeated .TSP.Reference children = 3; 48 | optional .TSP.IndexSet children_indices = 4; 49 | } 50 | 51 | message InfoGeometryCommandArchive { 52 | required .TSK.CommandArchive super = 1; 53 | optional .TSP.Reference info = 2; 54 | optional .TSD.GeometryArchive newGeometry = 3; 55 | optional .TSD.GeometryArchive oldGeometry = 4; 56 | optional bool shouldClearObjectPlaceholderFlag = 5; 57 | optional bool didMatchObjectPlaceholderGeometry = 6; 58 | } 59 | 60 | message DrawablePathSourceCommandArchive { 61 | required .TSK.CommandArchive super = 1; 62 | optional .TSP.Reference info = 2; 63 | optional .TSD.PathSourceArchive oldpathsource = 3; 64 | optional .TSD.PathSourceArchive newpathsource = 4; 65 | } 66 | 67 | message InstantAlphaCommandArchive { 68 | 
required .TSK.CommandArchive super = 1; 69 | optional .TSP.Reference image = 2; 70 | optional .TSP.Path oldpath = 3; 71 | optional .TSP.Path newpath = 4; 72 | } 73 | 74 | message DrawableShadowCommandArchive { 75 | required .TSK.CommandArchive super = 1; 76 | optional .TSP.Reference info = 2; 77 | optional .TSD.ShadowArchive oldshadow = 3; 78 | optional .TSD.ShadowArchive newshadow = 4; 79 | } 80 | 81 | message DrawableApplyThemeCommandArchive { 82 | required .TSS.ApplyThemeChildCommandArchive super = 1; 83 | optional .TSP.Reference info = 2; 84 | optional .TSP.Reference style = 3; 85 | } 86 | 87 | message StyledInfoSetStyleCommandArchive { 88 | required .TSK.CommandArchive super = 1; 89 | optional .TSP.Reference info = 2; 90 | optional .TSP.Reference style = 3; 91 | } 92 | 93 | message ShapePathSourceFlipCommandArchive { 94 | required .TSK.CommandArchive super = 1; 95 | optional .TSP.Reference info = 2; 96 | optional bool newHorizontalFlip = 3; 97 | optional bool newVerticalFlip = 4; 98 | optional bool oldHorizontalFlip = 5; 99 | optional bool oldVerticalFlip = 6; 100 | } 101 | 102 | message ShapeStyleSetValueCommandArchive { 103 | required .TSD.StyledInfoSetStyleCommandArchive super = 1; 104 | optional .TSD.ShapeStylePropertiesArchive shape_properties = 4; 105 | } 106 | 107 | message ShapeApplyPresetCommandArchive { 108 | required .TSD.StyledInfoSetStyleCommandArchive super = 1; 109 | optional .TSP.Reference stylepreset = 10; 110 | optional .TSP.Color colorpreset = 11; 111 | optional .TSD.FillArchive fillpreset = 12; 112 | optional .TSD.ShadowArchive shadowpreset = 13; 113 | } 114 | 115 | message ShapeSetLineEndCommandArchive { 116 | required .TSK.CommandArchive super = 1; 117 | optional .TSP.Reference shape = 2; 118 | optional .TSD.LineEndArchive line_end = 3; 119 | optional bool is_head_line_end = 4; 120 | } 121 | 122 | message MovieSetValueCommandArchive { 123 | message PropertyValue { 124 | optional double starttime = 1; 125 | optional double endtime = 2; 126 
| optional double postertime = 3; 127 | optional .TSP.DataReference posterimagedata = 4; 128 | optional bool autoplay = 5; 129 | optional .TSD.MovieSetValueCommandArchive.LoopOption loopOption = 6; 130 | optional float volume = 7; 131 | optional .TSP.DataReference media = 8; 132 | optional .TSP.DataReference importedauxiliarymedia = 9; 133 | } 134 | enum LoopOption { 135 | None = 0; 136 | Repeat = 1; 137 | BackAndForth = 2; 138 | } 139 | required .TSK.CommandArchive super = 1; 140 | optional .TSP.Reference movie = 2; 141 | optional string deprecated_property = 3; 142 | optional int32 property = 4; 143 | optional .TSD.MovieSetValueCommandArchive.PropertyValue value = 5; 144 | } 145 | 146 | message MediaStyleSetValueCommandArchive { 147 | required .TSD.StyledInfoSetStyleCommandArchive super = 1; 148 | optional .TSD.MediaStylePropertiesArchive media_properties = 4; 149 | } 150 | 151 | message ImageMediaCommandArchive { 152 | required .TSK.CommandArchive super = 1; 153 | optional .TSP.Reference info = 2; 154 | optional .TSP.DataReference newImageData = 8; 155 | optional .TSP.DataReference oldImageData = 9; 156 | optional .TSP.DataReference oldOriginalImageData = 10; 157 | optional .TSP.Size oldNaturalSize = 6; 158 | optional .TSP.DataReference newOriginalImageData = 11; 159 | optional .TSD.ImageAdjustmentsArchive oldImageAdjustments = 12; 160 | optional .TSP.DataReference oldAdjustedImageData = 13; 161 | optional .TSP.DataReference oldEnhancedImageData = 14; 162 | optional .TSP.Reference database_newImageData = 3; 163 | optional .TSP.Reference database_oldImageData = 4; 164 | optional .TSP.Reference database_oldOriginalImageData = 5; 165 | optional .TSP.Reference database_newOriginalImageData = 7; 166 | } 167 | 168 | message MediaOriginalSizeCommandArchive { 169 | required .TSK.CommandArchive super = 1; 170 | optional .TSP.Reference info = 2; 171 | optional .TSP.Size newOriginalSize = 3; 172 | optional .TSP.Size oldOriginalSize = 4; 173 | } 174 | 175 | message 
ImageMaskCommandArchive { 176 | required .TSK.CommandArchive super = 1; 177 | optional .TSP.Reference info = 2; 178 | optional .TSP.Reference newMaskInfo = 3; 179 | optional .TSP.Reference oldMaskInfo = 4; 180 | } 181 | 182 | message MediaApplyPresetCommandArchive { 183 | required .TSD.StyledInfoSetStyleCommandArchive super = 1; 184 | optional .TSP.Reference stylepreset = 10; 185 | optional .TSD.ShadowArchive shadowpreset = 11; 186 | } 187 | 188 | message ImageAdjustmentsCommandArchive { 189 | required .TSK.CommandArchive super = 1; 190 | optional .TSD.ImageAdjustmentsArchive old_image_adjustments = 2; 191 | optional .TSD.ImageAdjustmentsArchive new_image_adjustments = 3; 192 | optional .TSP.DataReference adjustedImageData = 4; 193 | optional .TSP.DataReference replacedAdjustedImageData = 5; 194 | optional .TSP.DataReference enhancedImageData = 6; 195 | optional .TSP.DataReference replacedEnhancedImageData = 7; 196 | } 197 | 198 | message MediaFlagsCommandArchive { 199 | required .TSK.CommandArchive super = 1; 200 | optional .TSP.Reference info = 2; 201 | optional uint32 flags = 3; 202 | } 203 | 204 | message DrawablesCommandGroupArchive { 205 | enum DrawablesCommandGroupType { 206 | Constructive = 1; 207 | Destructive = 2; 208 | InPlace = 3; 209 | } 210 | required .TSK.CommandGroupArchive super = 1; 211 | optional .TSD.CanvasSelectionArchive obsolete_selection = 2; 212 | optional .TSP.Reference modelforselection = 3; 213 | optional .TSD.DrawablesCommandGroupArchive.DrawablesCommandGroupType type = 4; 214 | optional bool forDrag = 5; 215 | optional .TSP.Reference archivedselection = 6; 216 | } 217 | 218 | message ExteriorTextWrapCommandArchive { 219 | message InfoAndWrap { 220 | optional .TSP.Reference info = 1; 221 | optional .TSD.ExteriorTextWrapArchive exterior_text_wrap = 2; 222 | } 223 | required .TSK.CommandArchive super = 1; 224 | repeated .TSD.ExteriorTextWrapCommandArchive.InfoAndWrap info_and_wrap = 2; 225 | } 226 | 227 | message 
DrawableHyperlinkCommandArchive { 228 | required .TSK.CommandArchive super = 1; 229 | optional .TSP.Reference info = 2; 230 | optional string oldhyperlink_url = 3; 231 | optional string newhyperlink_url = 4; 232 | } 233 | 234 | message CommandSelectionBehaviorArchive { 235 | enum CommandSelectionBehaviorType { 236 | Constructive = 1; 237 | Destructive = 2; 238 | InPlace = 3; 239 | Replace = 4; 240 | } 241 | optional .TSP.Reference model_for_selection = 2; 242 | optional .TSD.CommandSelectionBehaviorArchive.CommandSelectionBehaviorType type = 3; 243 | optional .TSP.Reference archived_selection = 4; 244 | optional .TSP.Reference archived_new_selection = 5; 245 | } 246 | 247 | message ImageReplaceCommandArchive { 248 | required .TSK.CommandArchive super = 1; 249 | } 250 | 251 | message DrawableLockCommandArchive { 252 | required .TSK.CommandArchive super = 1; 253 | optional .TSP.Reference info = 2; 254 | optional bool lock = 3; 255 | } 256 | 257 | message DrawableInfoCommentCommandArchive { 258 | required .TSK.CommandArchive super = 1; 259 | optional .TSP.Reference info = 2; 260 | optional .TSP.Reference old_comment = 3; 261 | optional .TSP.Reference new_comment = 4; 262 | } 263 | 264 | message CommentStorageApplyCommandArchive { 265 | required .TSK.CommandArchive super = 1; 266 | required .TSP.Reference comment_storage = 2; 267 | required string comment_string = 3; 268 | required string old_comment_string = 4; 269 | } 270 | 271 | message GuideCommandArchive { 272 | enum GuideCommandMode { 273 | Add = 1; 274 | Replace = 2; 275 | Delete = 3; 276 | } 277 | required .TSK.CommandArchive super = 1; 278 | optional .TSP.Reference storage = 2; 279 | optional .TSD.UserDefinedGuideArchive old_guide = 3; 280 | optional .TSD.UserDefinedGuideArchive new_guide = 4; 281 | optional .TSD.GuideCommandArchive.GuideCommandMode mode = 5; 282 | } 283 | 284 | message DrawableAspectRatioLockedCommandArchive { 285 | required .TSK.CommandArchive super = 1; 286 | optional .TSP.Reference info = 
2; 287 | optional bool aspect_ratio_locked = 3; 288 | } 289 | /* Command archive (TSK.CommandArchive held in `super`) with an optional `info` reference and an optional accessibility_description string. */ 290 | message DrawableAccessibilityDescriptionCommandArchive { 291 | required .TSK.CommandArchive super = 1; 292 | optional .TSP.Reference info = 2; 293 | optional string accessibility_description = 3; 294 | } 295 | /* Command archive with a required dest_info reference and an optional shape_style reference. */ 296 | message PasteStyleCommandArchive { 297 | required .TSK.CommandArchive super = 1; 298 | required .TSP.Reference dest_info = 2; 299 | optional .TSP.Reference shape_style = 3; 300 | } 301 | 302 | -------------------------------------------------------------------------------- /iWork/pb-schema/TSKArchives.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "TSPMessages.proto"; 4 | package TSK; 5 | /* Named node: optional name, a list of child references and one optional `object` reference. */ 6 | message TreeNode { 7 | optional string name = 1; 8 | repeated .TSP.Reference children = 2; 9 | optional .TSP.Reference object = 3; 10 | } 11 | /* Required undo_count plus lists of command references and marked_redo_commands; field numbers 5-9 are not declared. */ 12 | message CommandHistory { 13 | required uint32 undo_count = 1; 14 | repeated .TSP.Reference commands = 2; 15 | repeated .TSP.Reference marked_redo_commands = 3; 16 | optional .TSP.Reference pending_preflight_command = 4; 17 | optional bool fixed_radar_13365177 = 10; 18 | } 19 | /* Only two fields are declared: locale_identifier (4) and annotation_author_storage (7). */ 20 | message DocumentArchive { 21 | optional string locale_identifier = 4; 22 | optional .TSP.Reference annotation_author_storage = 7; 23 | } 24 | /* Optional references to command_history, command_selection_behavior_history and web_state, plus undo/redo counts and action strings; field number 3 is not declared. */ 25 | message DocumentSupportArchive { 26 | optional .TSP.Reference command_history = 1; 27 | optional .TSP.Reference command_selection_behavior_history = 2; 28 | optional uint32 undo_count = 4; 29 | optional uint32 redo_count = 5; 30 | optional string undo_action_string = 6; 31 | optional string redo_action_string = 7; 32 | optional .TSP.Reference web_state = 8; 33 | } 34 | /* Single required reference, view_state_root. */ 35 | message ViewStateArchive { 36 | required .TSP.Reference view_state_root = 1; 37 | } 38 | /* Base command message: the other *CommandArchive messages in these schemas embed it as their required `super` field. */ 39 | message CommandArchive { 40 | optional .TSP.Reference undoRedoState = 1; 41 | optional .TSP.Reference undoCollection = 2; 42 | } 43 | /* CommandArchive (as `super`) plus a list of command references and an optional process_results IndexSet. */ 44 | message CommandGroupArchive { 45 | required
.TSK.CommandArchive super = 1; 46 | repeated .TSP.Reference commands = 2; 47 | optional .TSP.IndexSet process_results = 3; 48 | } 49 | /* Holds only a repeated list of command references. */ 50 | message CommandContainerArchive { 51 | repeated .TSP.Reference commands = 1; 52 | } 53 | /* CommandArchive held in `super`; no further fields are declared. */ 54 | message ReplaceAllChildCommandArchive { 55 | required .TSK.CommandArchive super = 1; 56 | } 57 | /* Command archive with a list of command references, required find_string and replace_string, and a required uint32 `options`. */ 58 | message ReplaceAllCommandArchive { 59 | required .TSK.CommandArchive super = 1; 60 | repeated .TSP.Reference commands = 2; 61 | required string find_string = 3; 62 | required string replace_string = 4; 63 | required uint32 options = 5; 64 | } 65 | /* Required start_index/end_index and a list of from/to Entry pairs; the optional fields all declare defaults (is_vertical = true, the rest false or 0). */ 66 | message ShuffleMappingArchive { 67 | message Entry { 68 | required uint32 from = 1; 69 | required uint32 to = 2; 70 | } 71 | required uint32 start_index = 1; 72 | required uint32 end_index = 2; 73 | repeated .TSK.ShuffleMappingArchive.Entry entries = 3; 74 | optional bool is_vertical = 4 [default = true]; 75 | optional bool is_move_operation = 5 [default = false]; 76 | optional uint32 first_moved_index = 6 [default = 0]; 77 | optional uint32 destination_index_for_move = 7 [default = 0]; 78 | optional uint32 number_of_indices_moved = 8 [default = 0]; 79 | } 80 | /* CommandGroupArchive held in `super`; no further fields are declared. */ 81 | message ProgressiveCommandGroupArchive { 82 | required .TSK.CommandGroupArchive super = 1; 83 | } 84 | /* List of entries, each pairing a required command reference with a required command_selection_behavior reference. */ 85 | message CommandSelectionBehaviorHistoryArchive { 86 | message Entry { 87 | required .TSP.Reference command = 1; 88 | required .TSP.Reference command_selection_behavior = 2; 89 | } 90 | repeated .TSK.CommandSelectionBehaviorHistoryArchive.Entry entries = 1; 91 | } 92 | /* One optional reference, undo_redo_state, at field number 2 (field 1 is not declared). */ 93 | message UndoRedoStateCommandSelectionBehaviorArchive { 94 | optional .TSP.Reference undo_redo_state = 2; 95 | } 96 | /* Format settings record: format_type is the only required field; every other field is optional or repeated. */ 97 | message FormatStructArchive { 98 | required uint32 format_type = 1; 99 | optional uint32 decimal_places = 2; 100 | optional string currency_code = 3; 101 | optional uint32 negative_style = 4; 102 | optional bool show_thousands_separator = 5; 103 | optional bool use_accounting_style = 6; 104 | optional uint32 duration_style = 7; 105
| optional uint32 base = 8; 106 | optional uint32 base_places = 9; 107 | optional bool base_use_minus_sign = 10; 108 | optional uint32 fraction_accuracy = 11; 109 | optional bool suppress_date_format = 12; 110 | optional bool suppress_time_format = 13; 111 | optional string date_time_format = 14; 112 | optional uint32 duration_unit_largest = 15; 113 | optional uint32 duration_unit_smallest = 16; 114 | optional uint32 custom_id = 17; 115 | optional string custom_format_string = 18; 116 | optional double scale_factor = 19; 117 | optional bool requires_fraction_replacement = 20; 118 | optional double control_minimum = 21; 119 | optional double control_maximum = 22; 120 | optional double control_increment = 23; 121 | optional uint32 control_format_type = 24; 122 | optional uint32 slider_orientation = 25; 123 | optional uint32 slider_position = 26; 124 | optional uint32 decimal_width = 27; 125 | optional uint32 min_integer_width = 28; 126 | optional uint32 num_nonspace_integer_digits = 29; 127 | optional uint32 num_nonspace_decimal_digits = 30; 128 | optional uint32 index_from_right_last_integer = 31; 129 | repeated string interstitial_strings = 32; 130 | optional .TSP.IndexSet inters_str_insertion_indexes = 33; 131 | optional uint32 num_hash_decimal_digits = 34; 132 | optional uint32 total_num_decimal_digits = 35; 133 | optional bool is_complex = 36; 134 | optional bool contains_integer_token = 37; 135 | optional uint32 multiple_choice_list_initial_value = 38; 136 | optional uint32 multiple_choice_list_id = 39; 137 | optional bool use_automatic_duration_units = 40; 138 | extensions 10000 to 19999; /* extension range; TSCH.ChartFormatStructExtensions declares prefix (10000) and suffix (10001) in it */ 139 | } 140 | /* Named custom format: required name, format_type and default_format, plus a list of Conditions. Each Condition carries its own required condition_format and two optional value fields (float and double). */ 141 | message CustomFormatArchive { 142 | message Condition { 143 | required uint32 condition_type = 1; 144 | optional float condition_value = 2; 145 | required .TSK.FormatStructArchive condition_format = 3; 146 | optional double condition_value_dbl = 4; 147 | } 148 | required string name = 1; 149 | required uint32 format_type = 2; 150 | required
.TSK.FormatStructArchive default_format = 3; 151 | repeated .TSK.CustomFormatArchive.Condition conditions = 4; 152 | } 153 | /* Optional name and optional color. */ 154 | message AnnotationAuthorArchive { 155 | optional string name = 1; 156 | optional .TSP.Color color = 2; 157 | } 158 | /* Same field layout as AnnotationAuthorArchive, with the color field named change_color. */ 159 | message DeprecatedChangeAuthorArchive { 160 | optional string name = 1; 161 | optional .TSP.Color change_color = 2; 162 | } 163 | /* Holds only a repeated list of references, annotation_author. */ 164 | message AnnotationAuthorStorageArchive { 165 | repeated .TSP.Reference annotation_author = 1; 166 | } 167 | /* Command archive with optional document_root and annotation_author references. */ 168 | message AddAnnotationAuthorCommandArchive { 169 | required .TSK.CommandArchive super = 1; 170 | optional .TSP.Reference document_root = 2; 171 | optional .TSP.Reference annotation_author = 3; 172 | } 173 | /* Command archive with an optional annotation_author reference and an optional color. */ 174 | message SetAnnotationAuthorColorCommandArchive { 175 | required .TSK.CommandArchive super = 1; 176 | optional .TSP.Reference annotation_author = 2; 177 | optional .TSP.Color color = 3; 178 | } 179 | 180 | -------------------------------------------------------------------------------- /iWork/pb-schema/TSPArchiveMessages.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | option go_package = "code.sajari.com/docconv/v2/tsp"; 4 | 5 | package TSP; 6 | /* Optional uint64 identifier plus a list of MessageInfo entries. */ 7 | message ArchiveInfo { 8 | optional uint64 identifier = 1; 9 | repeated .TSP.MessageInfo message_infos = 2; 10 | } 11 | /* Required type and length, a packed version list, a list of FieldInfo, and packed uint64 object_references and data_references. */ 12 | message MessageInfo { 13 | required uint32 type = 1; 14 | repeated uint32 version = 2 [packed = true]; 15 | required uint32 length = 3; 16 | repeated .TSP.FieldInfo field_infos = 4; 17 | repeated uint64 object_references = 5 [packed = true]; 18 | repeated uint64 data_references = 6 [packed = true]; 19 | } 20 | /* Describes one field: a required FieldPath, a Type (default Value), a Rule (default IgnoreAndDrop) and packed reference lists. Rule.NotSupported is the only negative enum value (-1). */ 21 | message FieldInfo { 22 | enum Type { 23 | Value = 0; 24 | ObjectReference = 1; 25 | DataReference = 2; 26 | Message = 3; 27 | } 28 | enum Rule { 29 | IgnoreAndDrop = 0; 30 | IgnoreAndPreserve = 1; 31 | MustUnderstand = 2; 32 | NotSupported = -1; 33 | } 34 | required .TSP.FieldPath path = 1; 35 | optional
.TSP.FieldInfo.Type type = 2 [default = Value]; 36 | optional .TSP.FieldInfo.Rule rule = 3 [default = IgnoreAndDrop]; 37 | repeated uint64 object_references = 4 [packed = true]; 38 | repeated uint64 data_references = 5 [packed = true]; 39 | } 40 | /* A packed repeated uint32 named path. */ 41 | message FieldPath { 42 | repeated uint32 path = 1 [packed = true]; 43 | } 44 | /* Required identifier and preferred_locator, optional locator, packed read/write version lists, external and data reference lists, and three boolean flags with declared defaults. */ 45 | message ComponentInfo { 46 | required uint64 identifier = 1; 47 | required string preferred_locator = 2; 48 | optional string locator = 3; 49 | repeated uint32 read_version = 4 [packed = true]; 50 | repeated uint32 write_version = 5 [packed = true]; 51 | repeated .TSP.ComponentExternalReference external_references = 6; 52 | repeated .TSP.ComponentDataReference data_references = 7; 53 | optional bool allows_duplicates_outside_of_document_package = 8 [default = false]; 54 | optional bool dirties_document_package = 9 [default = true]; 55 | optional bool is_stored_outside_object_archive = 10 [default = false]; 56 | } 57 | /* Required component_identifier, optional object_identifier and optional is_weak flag. */ 58 | message ComponentExternalReference { 59 | required uint64 component_identifier = 1; 60 | optional uint64 object_identifier = 2; 61 | optional bool is_weak = 3; 62 | } 63 | /* Single required uint64 data_identifier. */ 64 | message ComponentDataReference { 65 | required uint64 data_identifier = 1; 66 | } 67 | /* Required last_object_identifier, lists of ComponentInfo and DataInfo, and packed read/write version lists; field number 2 is not declared. */ 68 | message PackageMetadata { 69 | required uint64 last_object_identifier = 1; 70 | repeated .TSP.ComponentInfo components = 3; 71 | repeated .TSP.DataInfo datas = 4; 72 | repeated uint32 read_version = 5 [packed = true]; 73 | repeated uint32 write_version = 6 [packed = true]; 74 | } 75 | /* Packed version list, required app_name, a list of DataInfo and an optional source_document_uuid. */ 76 | message PasteboardMetadata { 77 | repeated uint32 version = 1 [packed = true]; 78 | required string app_name = 2; 79 | repeated .TSP.DataInfo datas = 3; 80 | optional string source_document_uuid = 4; 81 | } 82 | /* Required identifier, digest bytes and preferred_file_name; the remaining fields (file_name, document_resource_locator, source_bookmark_data, pasteboard_external_file_path) are optional. */ 83 | message DataInfo { 84 | required uint64 identifier = 1; 85 | required bytes digest = 2; 86 | required string preferred_file_name = 3; 87 | optional string file_name = 4; 88 | optional string document_resource_locator = 5; 89 | optional bytes
source_bookmark_data = 6; 90 | optional string pasteboard_external_file_path = 99; 91 | } 92 | /* Packed version list, required document_version_uuid and one required ComponentInfo. */ 93 | message ViewStateMetadata { 94 | repeated uint32 version = 1 [packed = true]; 95 | required string document_version_uuid = 2; 96 | required .TSP.ComponentInfo component = 3; 97 | } 98 | 99 | -------------------------------------------------------------------------------- /iWork/pb-schema/TSPDatabaseMessages.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | option go_package = "code.sajari.com/docconv/v2/tsp"; 4 | 5 | import "TSPMessages.proto"; 6 | package TSP; 7 | /* Wraps one required DataReference. */ 8 | message DatabaseData { 9 | required .TSP.DataReference data = 1; 10 | } 11 | /* Optional data reference, app_relative_path, length and hash; display_name and sharable are required (sharable also declares default = true). */ 12 | message DatabaseDataArchive { 13 | optional .TSP.Reference data = 1; 14 | optional string app_relative_path = 2; 15 | required string display_name = 3; 16 | optional uint64 length = 4; 17 | optional uint32 hash = 5; 18 | required bool sharable = 6 [default = true]; 19 | } 20 | /* DatabaseDataArchive (as `super`) plus a required ImageType: unknown, bitmap or pdf. */ 21 | message DatabaseImageDataArchive { 22 | enum ImageType { 23 | unknown = 0; 24 | bitmap = 1; 25 | pdf = 2; 26 | } 27 | required .TSP.DatabaseDataArchive super = 1; 28 | required .TSP.DatabaseImageDataArchive.ImageType type = 2; 29 | } 30 | -------------------------------------------------------------------------------- /iWork/pb-schema/TSPMessages.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | option go_package = "code.sajari.com/docconv/v2/tsp"; 4 | 5 | package TSP; 6 | /* Reference carrying a required uint64 identifier; the two other fields are optional and named deprecated_*. */ 7 | message Reference { 8 | required uint64 identifier = 1; 9 | optional int32 deprecated_type = 2; 10 | optional bool deprecated_is_external = 3; 11 | } 12 | /* Required uint64 identifier only. */ 13 | message DataReference { 14 | required uint64 identifier = 1; 15 | } 16 | /* Two required floats, x and y. */ 17 | message Point { 18 | required float x = 1; 19 | required float y = 2; 20 | } 21 | /* Two required floats, width and height. */ 22 | message Size { 23 | required float width = 1; 24 | required float height = 2; 25 | } 26 | /* Two required uint32 fields, location and length. */ 27 | message Range { 28 |
required uint32 location = 1; 29 | required uint32 length = 2; 30 | } 31 | /* Single required double, seconds. */ 32 | message Date { 33 | required double seconds = 1; 34 | } 35 | /* A list of Range entries. */ 36 | message IndexSet { 37 | repeated .TSP.Range ranges = 1; 38 | } 39 | /* Required ColorModel (rgb, cmyk or white) plus optional float fields r, g, b, a, c, m, y, k, w; only `a` declares a default (1). Field number 2 is not declared. */ 40 | message Color { 41 | enum ColorModel { 42 | rgb = 1; 43 | cmyk = 2; 44 | white = 3; 45 | } 46 | required .TSP.Color.ColorModel model = 1; 47 | optional float r = 3; 48 | optional float g = 4; 49 | optional float b = 5; 50 | optional float a = 6 [default = 1]; 51 | optional float c = 7; 52 | optional float m = 8; 53 | optional float y = 9; 54 | optional float k = 10; 55 | optional float w = 11; 56 | } 57 | /* A list of Elements; each Element has a required ElementType (moveTo, lineTo, quadCurveTo, curveTo, closeSubpath) and a list of Points. */ 58 | message Path { 59 | message Element { 60 | required .TSP.Path.ElementType type = 1; 61 | repeated .TSP.Point points = 2; 62 | } 63 | enum ElementType { 64 | moveTo = 1; 65 | lineTo = 2; 66 | quadCurveTo = 3; 67 | curveTo = 4; 68 | closeSubpath = 5; 69 | } 70 | repeated .TSP.Path.Element elements = 1; 71 | } 72 | /* A list of entries, each pairing a required key reference with a required value reference. */ 73 | message ReferenceDictionary { 74 | message Entry { 75 | required .TSP.Reference key = 1; 76 | required .TSP.Reference value = 2; 77 | } 78 | repeated .TSP.ReferenceDictionary.Entry entries = 1; 79 | } 80 | /* Optional stylesheet, theme, wp_storage, guide_storage and app_native_object references, repeated drawables and styles, and two flags defaulting to false. Field numbers are declared out of order (9 appears between 5 and 6). */ 81 | message PasteboardObject { 82 | optional .TSP.Reference stylesheet = 1; 83 | repeated .TSP.Reference drawables = 2; 84 | repeated .TSP.Reference styles = 3; 85 | optional .TSP.Reference theme = 4; 86 | optional .TSP.Reference wp_storage = 5; 87 | optional .TSP.Reference guide_storage = 9; 88 | optional .TSP.Reference app_native_object = 6; 89 | optional bool is_text_primary = 7 [default = false]; 90 | optional bool is_smart = 8 [default = false]; 91 | } 92 | /* Optional uint32 identifier and a list of object references. */ 93 | message ObjectContainer { 94 | optional uint32 identifier = 1; 95 | repeated .TSP.Reference objects = 2; 96 | } 97 | -------------------------------------------------------------------------------- /iWork/pb-schema/TSSArchives.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import
"TSPMessages.proto"; 4 | import "TSKArchives.proto"; 5 | package TSS; 6 | /* Optional name and style_identifier, optional parent and stylesheet references; is_variation defaults to false. */ 7 | message StyleArchive { 8 | optional string name = 1; 9 | optional string style_identifier = 2; 10 | optional .TSP.Reference parent = 3; 11 | optional bool is_variation = 4 [default = false]; 12 | optional .TSP.Reference stylesheet = 5; 13 | } 14 | /* Style references, identifier/style entries, an optional parent reference, parent/children entries, and two flags (is_locked defaults to true, can_cull_styles to false). */ 15 | message StylesheetArchive { 16 | message IdentifiedStyleEntry { 17 | required string identifier = 1; 18 | required .TSP.Reference style = 2; 19 | } 20 | message StyleChildrenEntry { 21 | required .TSP.Reference parent = 1; 22 | repeated .TSP.Reference children = 2; 23 | } 24 | repeated .TSP.Reference styles = 1; 25 | repeated .TSS.StylesheetArchive.IdentifiedStyleEntry identifier_to_style_map = 2; 26 | optional .TSP.Reference parent = 3; 27 | optional bool is_locked = 4 [default = true]; 28 | repeated .TSS.StylesheetArchive.StyleChildrenEntry parent_to_children_style_map = 5; 29 | optional bool can_cull_styles = 6 [default = false]; 30 | } 31 | /* Optional stylesheet reference and theme_identifier plus a list of color presets; field numbers 100 to 536870911 are declared as an extension range. */ 32 | message ThemeArchive { 33 | optional .TSP.Reference stylesheet = 1; 34 | optional string theme_identifier = 3; 35 | repeated .TSP.Color color_presets = 10; 36 | extensions 100 to 536870911; 37 | } 38 | /* Command archive with a list of command references and optional old_theme and new_theme references. */ 39 | message ApplyThemeCommandArchive { 40 | required .TSK.CommandArchive super = 1; 41 | repeated .TSP.Reference commands = 2; 42 | optional .TSP.Reference old_theme = 3; 43 | optional .TSP.Reference new_theme = 4; 44 | } 45 | /* Command archive with an optional parent reference. */ 46 | message ApplyThemeChildCommandArchive { 47 | required .TSK.CommandArchive super = 1; 48 | optional .TSP.Reference parent = 2; 49 | } 50 | /* Command archive with an optional stylesheet reference, a list of style references and a list of identifier/style entries. */ 51 | message ReapUnusedStyleCommandArchive { 52 | message IdentifiedStyleEntry { 53 | required string identifier = 1; 54 | required .TSP.Reference style = 2; 55 | } 56 | required .TSK.CommandArchive super = 1; 57 | optional .TSP.Reference stylesheet = 2; 58 | repeated .TSP.Reference styles = 3; 59 | repeated .TSS.ReapUnusedStyleCommandArchive.IdentifiedStyleEntry identified_styles = 4; 60 | } 61 | 62 | message
StyleUpdatePropertyMapCommandArchive { /* required current_style and old/new property-map style references; field number 5 is not declared */ 63 | required .TSK.CommandArchive super = 1; 64 | required .TSP.Reference current_style = 2; 65 | required .TSP.Reference style_with_old_property_map = 3; 66 | required .TSP.Reference style_with_new_property_map = 4; 67 | optional bool notify_for_style_clients = 6 [default = true]; 68 | } 69 | /* Command archive with required theme, preset and oldPreset references and a required index. */ 70 | message ThemeReplacePresetCommandArchive { 71 | required .TSK.CommandArchive super = 1; 72 | required .TSP.Reference theme = 2; 73 | required .TSP.Reference preset = 3; 74 | required .TSP.Reference oldPreset = 4; 75 | required uint32 index = 5; 76 | } 77 | /* Command archive with a required theme reference, required color and old_color values, and a required index. */ 78 | message ThemeReplaceColorPresetCommandArchive { 79 | required .TSK.CommandArchive super = 1; 80 | required .TSP.Reference theme = 2; 81 | required .TSP.Color color = 3; 82 | required .TSP.Color old_color = 4; 83 | required uint32 index = 5; 84 | } 85 | /* Command archive with required theme and preset references and a required preset_kind string; identifier and add_preset_to_stylesheet are optional. */ 86 | message ThemeAddStylePresetCommandArchive { 87 | required .TSK.CommandArchive super = 1; 88 | required .TSP.Reference theme = 2; 89 | required .TSP.Reference preset = 3; 90 | required string preset_kind = 4; 91 | optional string identifier = 5; 92 | optional bool add_preset_to_stylesheet = 6; 93 | } 94 | /* Command archive with required theme and preset references, preset_index and preset_kind; identifier is optional. */ 95 | message ThemeRemoveStylePresetCommandArchive { 96 | required .TSK.CommandArchive super = 1; 97 | required .TSP.Reference theme = 2; 98 | required .TSP.Reference preset = 3; 99 | required uint32 preset_index = 4; 100 | required string preset_kind = 5; 101 | optional string identifier = 6; 102 | } 103 | /* Command archive with required theme and preset references plus required new_index and old_index. */ 104 | message ThemeMovePresetCommandArchive { 105 | required .TSK.CommandArchive super = 1; 106 | required .TSP.Reference theme = 2; 107 | required .TSP.Reference preset = 3; 108 | required uint32 new_index = 4; 109 | required uint32 old_index = 5; 110 | } 111 | /* Command archive with required preset and old_preset references at field numbers 3 and 4; field number 2 is not declared. */ 112 | message ThemeReplaceStylePresetCommandArchive { 113 | required .TSK.CommandArchive super = 1; 114 | required .TSP.Reference preset = 3; 115 | required .TSP.Reference old_preset = 4; 116 | } 117 | 118 |
// Deprecated_StrokePresetDataArchive bundles three
// Deprecated_TableStrokeArchive values (horizontal, vertical, exterior) with
// an integer visibility mask. All fields are required.
//
// Field numbers do not follow declaration order: horizontal is 2 and vertical
// is 1, and number 4 is not used by this message.
message Deprecated_StrokePresetDataArchive {
  required .TST.Deprecated_TableStrokeArchive deprecated_horizontal_stroke = 2;
  required .TST.Deprecated_TableStrokeArchive deprecated_vertical_stroke = 1;
  required .TST.Deprecated_TableStrokeArchive deprecated_exterior_stroke = 3;
  // NOTE(review): bit meanings are not defined anywhere in this schema.
  required int32 deprecated_visible_mask = 5;
}
// StrokePresetListArchive holds stroke presets in two parallel encodings:
// the deprecated form (field 2) and the current StrokePresetDataArchive form
// (field 3).
message StrokePresetListArchive {
  // NOTE(review): presumably the number of presets; the schema does not say
  // whether it must equal the length of either repeated field - confirm.
  required int32 count = 1;
  repeated .TST.Deprecated_StrokePresetDataArchive deprecated_preset = 2;
  repeated .TST.StrokePresetDataArchive preset = 3;
}
// TableStylePresetArchive describes one table style preset: a required
// integer index plus optional references to an image and to a style network.
message TableStylePresetArchive {
  required int32 index = 1;
  // NOTE(review): presumably a thumbnail/preview for the preset - confirm.
  optional .TSP.Reference image = 2;
  optional .TSP.Reference style_network = 3;
}
// ThemePresetsArchive carries the table presets of a theme: references to
// table style presets and to table cell stroke presets. It is attached to
// TSS.ThemeArchive as an extension rather than as a regular field.
message ThemePresetsArchive {
  repeated .TSP.Reference table_style_presets = 1;
  repeated .TSP.Reference table_cell_stroke_presets = 2;
  // Extension field 200 of TSS.ThemeArchive, which reserves
  // `extensions 100 to 536870911`.
  //
  // Declared `optional`: proto2 does not permit an extension field to be
  // `required`, and protoc rejects such a declaration. The label is not part
  // of the wire format, so existing data decodes exactly as before.
  extend .TSS.ThemeArchive {
    optional .TST.ThemePresetsArchive extension = 200;
  }
}
// FormatTextCommandArchive is a command archive (embeds TSK.CommandArchive as
// `super`) that references a storage object, carries an undo transaction, and
// records a selection range as a location/length pair. All fields are
// optional.
//
// Field numbers are sparse: 4-7 and 10 are not used by this message.
message FormatTextCommandArchive {
  optional .TSK.CommandArchive super = 1;
  // NOTE(review): presumably the text storage the command operates on - confirm.
  optional .TSP.Reference storage = 2;
  optional .TSWP.UndoTransaction undo_transaction = 3;
  optional uint32 selection_range_location = 8;
  optional uint32 selection_range_length = 9;
  // NOTE(review): flag bit meanings are not defined in this schema.
  optional uint32 text_command_flags = 11;
}
// MergeCellsCommandArchive is a command archive (embeds TSK.CommandArchive as
// `super`) that references a storage object and describes a rectangular cell
// region by an origin (row, column) and an extent (row_count, column_count),
// together with an undo transaction. All fields are optional.
message MergeCellsCommandArchive {
  optional .TSK.CommandArchive super = 1;
  optional .TSP.Reference storage = 2;
  optional uint32 row = 3;
  optional uint32 column = 4;
  optional uint32 row_count = 5;
  optional uint32 column_count = 6;
  optional .TSWP.UndoTransaction undo_transaction = 7;
}
// RemoveHyperlinkCommandArchive is a command archive (embeds
// TSK.CommandArchive as `super`) that references a storage object, records a
// selection range as a location/length pair, and carries an undo transaction.
// All fields are optional.
message RemoveHyperlinkCommandArchive {
  optional .TSK.CommandArchive super = 1;
  optional .TSP.Reference storage = 2;
  optional uint32 selection_range_location = 3;
  optional uint32 selection_range_length = 4;
  optional .TSWP.UndoTransaction undo_transaction = 5;
  // Treated as true when the field is absent.
  optional bool is_remove_character_style = 6 [default = true];
}
// AnchorAttachmentCommandArchive is a command archive (embeds
// TSK.CommandArchive as `super`) that references a storage object and an
// attachment, and records a horizontal and a vertical offset, each as a
// (type, value) pair, together with an undo transaction. All fields are
// optional.
message AnchorAttachmentCommandArchive {
  optional .TSK.CommandArchive super = 1;
  optional .TSP.Reference storage = 2;
  optional .TSP.Reference attachment = 3;
  // NOTE(review): the offset type values are plain uint32; no enum for them
  // is declared in this file.
  optional uint32 h_offset_type = 4;
  optional float h_offset = 5;
  optional uint32 v_offset_type = 6;
  optional float v_offset = 7;
  optional .TSWP.UndoTransaction undo_transaction = 8;
  optional bool is_html_wrap = 9;
}
// ShapePasteStyleCommandArchive extends TSD.PasteStyleCommandArchive (its
// required `super` field) with optional references to a paragraph style, a
// list style and a character style.
message ShapePasteStyleCommandArchive {
  required .TSD.PasteStyleCommandArchive super = 1;
  optional .TSP.Reference paragraph_style = 2;
  optional .TSP.Reference list_style = 3;
  optional .TSP.Reference character_style = 4;
}
12 | func ConvertImage(r io.Reader) (string, map[string]string, error) { 13 | return "", nil, fmt.Errorf("docconv not built with `ocr` build tag") 14 | } 15 | 16 | // SetImageLanguages sets the languages parameter passed to gosseract. 17 | func SetImageLanguages(...string) {} 18 | -------------------------------------------------------------------------------- /image_ocr.go: -------------------------------------------------------------------------------- 1 | //go:build ocr 2 | 3 | package docconv 4 | 5 | import ( 6 | "fmt" 7 | "io" 8 | "sync" 9 | 10 | "github.com/otiai10/gosseract/v2" 11 | ) 12 | 13 | var config = struct { 14 | langs []string 15 | sync.Mutex 16 | }{ 17 | langs: []string{"eng"}, 18 | } 19 | 20 | func SetImageLanguages(l ...string) { 21 | config.Lock() 22 | config.langs = l 23 | config.Unlock() 24 | } 25 | 26 | // ConvertImage converts images to text. 27 | // Requires gosseract. 28 | func ConvertImage(r io.Reader) (string, map[string]string, error) { 29 | f, err := NewLocalFile(r) 30 | if err != nil { 31 | return "", nil, fmt.Errorf("error creating local file: %v", err) 32 | } 33 | defer f.Done() 34 | 35 | meta := make(map[string]string) 36 | 37 | client := gosseract.NewClient() 38 | defer client.Close() 39 | 40 | config.Lock() 41 | defer config.Unlock() 42 | 43 | client.SetLanguage(config.langs...) 
44 | client.SetImage(f.Name()) 45 | text, err := client.Text() 46 | if err != nil { 47 | return "", nil, err 48 | } 49 | 50 | return text, meta, nil 51 | } 52 | -------------------------------------------------------------------------------- /image_ocr_test.go: -------------------------------------------------------------------------------- 1 | //go:build ocr 2 | 3 | package docconv 4 | 5 | import ( 6 | "os" 7 | "path" 8 | "reflect" 9 | "testing" 10 | ) 11 | 12 | func TestConvertImage(t *testing.T) { 13 | tests := []struct { 14 | image string 15 | wantText string 16 | wantMeta map[string]string 17 | wantErr bool 18 | }{ 19 | { 20 | image: "001-helloworld.png", 21 | wantText: "Hello, World!", 22 | wantMeta: map[string]string{}, 23 | }, 24 | } 25 | for _, tt := range tests { 26 | t.Run(tt.image, func(t *testing.T) { 27 | image, err := os.Open(path.Join("testdata", tt.image)) 28 | if err != nil { 29 | t.Fatal(err) 30 | } 31 | defer image.Close() 32 | 33 | gotText, gotMeta, err := ConvertImage(image) 34 | if (err != nil) != tt.wantErr { 35 | t.Errorf("ConvertImage() error = %v, wantErr %v", err, tt.wantErr) 36 | return 37 | } 38 | if gotText != tt.wantText { 39 | t.Errorf("ConvertImage() text = %v, want %v", gotText, tt.wantText) 40 | } 41 | if !reflect.DeepEqual(gotMeta, tt.wantMeta) { 42 | t.Errorf("ConvertImage() meta = %v, want %v", gotMeta, tt.wantMeta) 43 | } 44 | }) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /limit.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | const maxBytes = 20 << 20 // 20MB 4 | -------------------------------------------------------------------------------- /local.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "os" 7 | ) 8 | 9 | // LocalFile is a type which wraps an *os.File. See NewLocalFile for more details. 
// LocalFile is a type which wraps an *os.File. See NewLocalFile for more details.
type LocalFile struct {
	*os.File

	// unlink is true when File is a temporary file created by NewLocalFile,
	// which must be removed from disk once the LocalFile is no longer needed.
	unlink bool
}

// NewLocalFile ensures that there is a file which contains the data provided by r. If r is
// actually an instance of *os.File then this file is used, otherwise a temporary file is
// created and the data from r copied into it. Callers must call Done() when
// the LocalFile is no longer needed to ensure all resources are cleaned up.
//
// Returned errors wrap the underlying cause, so callers can inspect it with
// errors.Is / errors.As.
func NewLocalFile(r io.Reader) (*LocalFile, error) {
	if f, ok := r.(*os.File); ok {
		// The caller's file is used directly and is never unlinked.
		return &LocalFile{File: f}, nil
	}

	// An empty dir makes CreateTemp use os.TempDir().
	f, err := os.CreateTemp("", "docconv")
	if err != nil {
		return nil, fmt.Errorf("error creating temporary file: %w", err)
	}
	if _, err := io.Copy(f, r); err != nil {
		// Do not leave a half-written temporary file behind.
		f.Close()
		os.Remove(f.Name())
		return nil, fmt.Errorf("error copying data into temporary file: %w", err)
	}

	return &LocalFile{File: f, unlink: true}, nil
}
45 | func (l *LocalFile) Done() { 46 | l.Close() 47 | if l.unlink { 48 | os.Remove(l.Name()) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /local_test.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | func TestNewLocalFile(t *testing.T) { 9 | _, err := NewLocalFile(strings.NewReader("test")) 10 | if err != nil { 11 | t.Fatalf("got error %v, want nil", err) 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /odt.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "fmt" 7 | "io" 8 | "time" 9 | ) 10 | 11 | // ConvertODT converts a ODT file to text 12 | func ConvertODT(r io.Reader) (string, map[string]string, error) { 13 | meta := make(map[string]string) 14 | var textBody string 15 | 16 | b, err := io.ReadAll(io.LimitReader(r, maxBytes)) 17 | if err != nil { 18 | return "", nil, err 19 | } 20 | zr, err := zip.NewReader(bytes.NewReader(b), int64(len(b))) 21 | if err != nil { 22 | return "", nil, fmt.Errorf("error unzipping data: %v", err) 23 | } 24 | 25 | for _, f := range zr.File { 26 | switch f.Name { 27 | case "meta.xml": 28 | rc, err := f.Open() 29 | if err != nil { 30 | return "", nil, fmt.Errorf("error extracting '%v' from archive: %v", f.Name, err) 31 | } 32 | defer rc.Close() 33 | 34 | info, err := XMLToMap(rc) 35 | if err != nil { 36 | return "", nil, fmt.Errorf("error parsing '%v': %v", f.Name, err) 37 | } 38 | 39 | if tmp, ok := info["creator"]; ok { 40 | meta["Author"] = tmp 41 | } 42 | if tmp, ok := info["date"]; ok { 43 | if t, err := time.Parse("2006-01-02T15:04:05", tmp); err == nil { 44 | meta["ModifiedDate"] = fmt.Sprintf("%d", t.Unix()) 45 | } 46 | } 47 | if tmp, ok := info["creation-date"]; ok { 48 | if t, err := 
time.Parse("2006-01-02T15:04:05", tmp); err == nil { 49 | meta["CreatedDate"] = fmt.Sprintf("%d", t.Unix()) 50 | } 51 | } 52 | 53 | case "content.xml": 54 | rc, err := f.Open() 55 | if err != nil { 56 | return "", nil, fmt.Errorf("error extracting '%v' from archive: %v", f.Name, err) 57 | } 58 | defer rc.Close() 59 | 60 | textBody, err = XMLToText(rc, []string{"br", "p", "tab"}, []string{}, true) 61 | if err != nil { 62 | return "", nil, fmt.Errorf("error parsing '%v': %v", f.Name, err) 63 | } 64 | } 65 | } 66 | 67 | return textBody, meta, nil 68 | } 69 | -------------------------------------------------------------------------------- /pages.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "archive/zip" 5 | "bufio" 6 | "bytes" 7 | "encoding/binary" 8 | "fmt" 9 | "io" 10 | "strings" 11 | 12 | "google.golang.org/protobuf/proto" 13 | 14 | TSP "code.sajari.com/docconv/v2/iWork" 15 | "code.sajari.com/docconv/v2/snappy" 16 | ) 17 | 18 | // ConvertPages converts a Pages file to text. 
19 | func ConvertPages(r io.Reader) (string, map[string]string, error) { 20 | meta := make(map[string]string) 21 | var textBody string 22 | 23 | b, err := io.ReadAll(io.LimitReader(r, maxBytes)) 24 | if err != nil { 25 | return "", nil, fmt.Errorf("error reading data: %v", err) 26 | } 27 | 28 | zr, err := zip.NewReader(bytes.NewReader(b), int64(len(b))) 29 | if err != nil { 30 | return "", nil, fmt.Errorf("error unzipping data: %v", err) 31 | } 32 | 33 | for _, f := range zr.File { 34 | if strings.HasSuffix(f.Name, "Preview.pdf") { 35 | // There is a preview PDF version we can use 36 | if rc, err := f.Open(); err == nil { 37 | return ConvertPDF(rc) 38 | } 39 | } 40 | if f.Name == "index.xml" { 41 | // There's an XML version we can use 42 | if rc, err := f.Open(); err == nil { 43 | return ConvertXML(rc) 44 | } 45 | } 46 | if f.Name == "Index/Document.iwa" { 47 | rc, _ := f.Open() 48 | defer rc.Close() 49 | bReader := bufio.NewReader(snappy.NewReader(io.MultiReader(strings.NewReader("\xff\x06\x00\x00sNaPpY"), rc))) 50 | 51 | // Ignore error. 52 | // NOTE: This error was unchecked. Need to revisit this to see if it 53 | // should be acted on. 54 | archiveLength, _ := binary.ReadVarint(bReader) 55 | 56 | // Ignore error. 57 | // NOTE: This error was unchecked. Need to revisit this to see if it 58 | // should be acted on. 
59 | archiveInfoData, _ := io.ReadAll(io.LimitReader(bReader, archiveLength)) 60 | 61 | archiveInfo := &TSP.ArchiveInfo{} 62 | err = proto.Unmarshal(archiveInfoData, archiveInfo) 63 | fmt.Println("archiveInfo:", archiveInfo, err) 64 | } 65 | } 66 | 67 | return textBody, meta, nil 68 | } 69 | -------------------------------------------------------------------------------- /pdf.go: -------------------------------------------------------------------------------- 1 | //go:build !ocr 2 | 3 | package docconv 4 | 5 | import ( 6 | "fmt" 7 | "io" 8 | ) 9 | 10 | func ConvertPDF(r io.Reader) (string, map[string]string, error) { 11 | f, err := NewLocalFile(r) 12 | if err != nil { 13 | return "", nil, fmt.Errorf("error creating local file: %v", err) 14 | } 15 | defer f.Done() 16 | 17 | bodyResult, metaResult, convertErr := ConvertPDFText(f.Name()) 18 | if convertErr != nil { 19 | return "", nil, convertErr 20 | } 21 | if bodyResult.err != nil { 22 | return "", nil, bodyResult.err 23 | } 24 | if metaResult.err != nil { 25 | return "", nil, metaResult.err 26 | } 27 | return bodyResult.body, metaResult.meta, nil 28 | 29 | } 30 | -------------------------------------------------------------------------------- /pdf_ocr.go: -------------------------------------------------------------------------------- 1 | //go:build ocr 2 | 3 | package docconv 4 | 5 | import ( 6 | "fmt" 7 | "io" 8 | "os" 9 | "os/exec" 10 | "path/filepath" 11 | "regexp" 12 | "strings" 13 | "sync" 14 | ) 15 | 16 | var ( 17 | exts = []string{".jpg", ".tif", ".tiff", ".png", ".pbm"} 18 | ) 19 | 20 | func compareExt(ext string, exts []string) bool { 21 | for _, e := range exts { 22 | if ext == e { 23 | return true 24 | } 25 | } 26 | return false 27 | } 28 | 29 | func ConvertPDFImages(path string) (BodyResult, error) { 30 | bodyResult := BodyResult{} 31 | 32 | tmp, err := os.MkdirTemp(os.TempDir(), "tmp-imgs-") 33 | if err != nil { 34 | bodyResult.err = err 35 | return bodyResult, err 36 | } 37 | tmpDir := 
fmt.Sprintf("%s/", tmp) 38 | 39 | defer func() { 40 | _ = os.RemoveAll(tmpDir) // ignore error 41 | }() 42 | 43 | _, err = exec.Command("pdfimages", "-j", path, tmpDir).Output() 44 | if err != nil { 45 | return bodyResult, err 46 | } 47 | 48 | filePaths := []string{} 49 | 50 | walkFunc := func(path string, info os.FileInfo, err error) error { 51 | path, err = filepath.Abs(path) 52 | if err != nil { 53 | return err 54 | } 55 | 56 | if compareExt(filepath.Ext(path), exts) { 57 | filePaths = append(filePaths, path) 58 | } 59 | return nil 60 | } 61 | filepath.Walk(tmpDir, walkFunc) 62 | 63 | fileLength := len(filePaths) 64 | 65 | if fileLength < 1 { 66 | return bodyResult, nil 67 | } 68 | 69 | var wg sync.WaitGroup 70 | 71 | data := make(chan string, fileLength) 72 | 73 | wg.Add(fileLength) 74 | 75 | for _, p := range filePaths { 76 | go func(pathFile string) { 77 | defer wg.Done() 78 | f, err := os.Open(pathFile) 79 | if err != nil { 80 | return 81 | } 82 | 83 | defer f.Close() 84 | out, _, err := ConvertImage(f) 85 | if err != nil { 86 | return 87 | } 88 | 89 | data <- out 90 | 91 | }(p) 92 | } 93 | 94 | wg.Wait() 95 | 96 | close(data) 97 | 98 | for str := range data { 99 | bodyResult.body += str + " " 100 | } 101 | 102 | return bodyResult, nil 103 | } 104 | 105 | // PdfHasImage verify if `path` (PDF) has images 106 | func PDFHasImage(path string) (bool, error) { 107 | cmd := "pdffonts -l 5 %s | tail -n +3 | cut -d' ' -f1 | sort | uniq" 108 | out, err := exec.Command("bash", "-c", fmt.Sprintf(cmd, shellEscape(path))).CombinedOutput() 109 | 110 | if err != nil { 111 | return false, err 112 | } 113 | if string(out) == "" { 114 | return true, nil 115 | } 116 | return false, nil 117 | } 118 | 119 | func ConvertPDF(r io.Reader) (string, map[string]string, error) { 120 | f, err := NewLocalFile(r) 121 | if err != nil { 122 | return "", nil, fmt.Errorf("error creating local file: %v", err) 123 | } 124 | defer f.Done() 125 | 126 | bodyResult, metaResult, textConvertErr := 
// shellEscapePattern matches any character outside the set that is passed to
// the shell unquoted: word characters and @ % + = : , . / -
var shellEscapePattern = regexp.MustCompile(`[^\w@%+=:,./-]`)

// shellEscape returns a shell-escaped version of the string s. The returned value
// is a string that can safely be used as one token in a shell command line.
//
// The empty string becomes ''. A string made only of safe characters is
// returned unchanged. Anything else is wrapped in single quotes, with each
// embedded single quote rewritten as '"'"'.
func shellEscape(s string) string {
	switch {
	case s == "":
		return "''"
	case !shellEscapePattern.MatchString(s):
		return s
	}

	// Close the quoted run, emit a double-quoted single quote, reopen.
	quoted := strings.ReplaceAll(s, "'", "'\"'\"'")
	return "'" + quoted + "'"
}
12 | // If the code was successful it will create a file called foo in the working directory 13 | badFilePath := "$(id >> foo).pdf" 14 | got, err := PDFHasImage(badFilePath) 15 | if err != nil { 16 | t.Fatal(err) 17 | } 18 | if want := false; got != want { 19 | t.Errorf("got %v, want %v", got, want) 20 | } 21 | 22 | if got, want := fileExists("foo"), false; got != want { 23 | t.Errorf("got bad file exists, want not file to exist") 24 | } 25 | } 26 | 27 | func fileExists(filename string) bool { 28 | info, err := os.Stat(filename) 29 | if os.IsNotExist(err) { 30 | return false 31 | } 32 | return !info.IsDir() 33 | } 34 | -------------------------------------------------------------------------------- /pdf_text.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "fmt" 5 | "os/exec" 6 | "strings" 7 | "time" 8 | ) 9 | 10 | // Meta data 11 | type MetaResult struct { 12 | meta map[string]string 13 | err error 14 | } 15 | 16 | type BodyResult struct { 17 | body string 18 | err error 19 | } 20 | 21 | // Convert PDF 22 | 23 | func ConvertPDFText(path string) (BodyResult, MetaResult, error) { 24 | metaResult := MetaResult{meta: make(map[string]string)} 25 | bodyResult := BodyResult{} 26 | mr := make(chan MetaResult, 1) 27 | go func() { 28 | metaStr, err := exec.Command("pdfinfo", path).Output() 29 | if err != nil { 30 | metaResult.err = err 31 | mr <- metaResult 32 | return 33 | } 34 | 35 | // Parse meta output 36 | for _, line := range strings.Split(string(metaStr), "\n") { 37 | if parts := strings.SplitN(line, ":", 2); len(parts) > 1 { 38 | metaResult.meta[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1]) 39 | } 40 | } 41 | 42 | // Convert parsed meta 43 | if x, ok := metaResult.meta["ModDate"]; ok { 44 | if t, ok := pdfTimeLayouts.Parse(x); ok { 45 | metaResult.meta["ModifiedDate"] = fmt.Sprintf("%d", t.Unix()) 46 | } 47 | } 48 | if x, ok := metaResult.meta["CreationDate"]; ok { 49 | if t, 
ok := pdfTimeLayouts.Parse(x); ok { 50 | metaResult.meta["CreatedDate"] = fmt.Sprintf("%d", t.Unix()) 51 | } 52 | } 53 | 54 | mr <- metaResult 55 | }() 56 | 57 | br := make(chan BodyResult, 1) 58 | go func() { 59 | body, err := exec.Command("pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", path, "-").Output() 60 | if err != nil { 61 | bodyResult.err = err 62 | } 63 | 64 | bodyResult.body = string(body) 65 | 66 | br <- bodyResult 67 | }() 68 | 69 | return <-br, <-mr, nil 70 | } 71 | 72 | var pdfTimeLayouts = timeLayouts{time.ANSIC, "Mon Jan _2 15:04:05 2006 MST"} 73 | 74 | type timeLayouts []string 75 | 76 | func (tl timeLayouts) Parse(x string) (time.Time, bool) { 77 | for _, layout := range tl { 78 | t, err := time.Parse(layout, x) 79 | if err == nil { 80 | return t, true 81 | } 82 | } 83 | return time.Time{}, false 84 | } 85 | -------------------------------------------------------------------------------- /pptx.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "fmt" 7 | "io" 8 | "os" 9 | "strings" 10 | ) 11 | 12 | // ConvertPptx converts an MS PowerPoint pptx file to text. 13 | func ConvertPptx(r io.Reader) (string, map[string]string, error) { 14 | var size int64 15 | 16 | // Common case: if the reader is a file (or trivial wrapper), avoid 17 | // loading it all into memory. 
18 | var ra io.ReaderAt 19 | if f, ok := r.(interface { 20 | io.ReaderAt 21 | Stat() (os.FileInfo, error) 22 | }); ok { 23 | si, err := f.Stat() 24 | if err != nil { 25 | return "", nil, err 26 | } 27 | size = si.Size() 28 | ra = f 29 | } else { 30 | b, err := io.ReadAll(r) 31 | if err != nil { 32 | return "", nil, nil 33 | } 34 | size = int64(len(b)) 35 | ra = bytes.NewReader(b) 36 | } 37 | 38 | zr, err := zip.NewReader(ra, size) 39 | if err != nil { 40 | return "", nil, fmt.Errorf("could not unzip: %v", err) 41 | } 42 | 43 | zipFiles := mapZipFiles(zr.File) 44 | 45 | contentTypeDefinition, err := getContentTypeDefinition(zipFiles["[Content_Types].xml"]) 46 | if err != nil { 47 | return "", nil, err 48 | } 49 | 50 | meta := make(map[string]string) 51 | var textBody string 52 | for _, override := range contentTypeDefinition.Overrides { 53 | f := zipFiles[override.PartName] 54 | 55 | switch override.ContentType { 56 | case "application/vnd.openxmlformats-officedocument.presentationml.slide+xml", 57 | "application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml": 58 | body, err := parseDocxText(f) 59 | if err != nil { 60 | return "", nil, fmt.Errorf("could not parse pptx: %v", err) 61 | } 62 | textBody += body + "\n" 63 | } 64 | } 65 | return strings.TrimSuffix(textBody, "\n"), meta, nil 66 | } 67 | -------------------------------------------------------------------------------- /pptx_test/pptx_test.go: -------------------------------------------------------------------------------- 1 | package docx_test 2 | 3 | import ( 4 | "encoding/xml" 5 | "os" 6 | "strings" 7 | "testing" 8 | 9 | "code.sajari.com/docconv/v2" 10 | ) 11 | 12 | func TestConvertPptx(t *testing.T) { 13 | f, err := os.Open("./testdata/sample.pptx") 14 | if err != nil { 15 | t.Fatal(err) 16 | } 17 | defer f.Close() 18 | 19 | if err != nil { 20 | t.Fatalf("got error = %v, want nil", err) 21 | } 22 | 23 | resp, _, err := docconv.ConvertPptx(f) 24 | if err != nil { 25 | t.Fatalf("got error = 
%v, want nil", err) 26 | } 27 | if want := "Get text from pptx"; !strings.Contains(resp, want) { 28 | t.Errorf("expected %v to contain %v", resp, want) 29 | } 30 | if want := "First"; !strings.Contains(resp, want) { 31 | t.Errorf("expected %v to contain %v", resp, want) 32 | } 33 | } 34 | 35 | func TestConvertPptxDecompressionSizeLimit(t *testing.T) { 36 | f, err := os.Open("./testdata/decompression_size_limit.pptx") 37 | if err != nil { 38 | t.Fatalf("got error = %v, want nil", err) 39 | } 40 | _, _, err = docconv.ConvertPptx(f) 41 | if _, ok := err.(*xml.SyntaxError); !ok { 42 | t.Errorf("got error = %T, want *xml.SyntaxError", err) 43 | } 44 | if want := "EOF"; !strings.Contains(err.Error(), want) { 45 | t.Errorf("got error = %v, want %v", err, want) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /pptx_test/testdata/decompression_size_limit.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sajari/docconv/785a29a00de4b976c379fd38299c220307220684/pptx_test/testdata/decompression_size_limit.pptx -------------------------------------------------------------------------------- /pptx_test/testdata/sample.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sajari/docconv/785a29a00de4b976c379fd38299c220307220684/pptx_test/testdata/sample.pptx -------------------------------------------------------------------------------- /rtf.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "os/exec" 7 | "strings" 8 | "time" 9 | ) 10 | 11 | // ConvertRTF converts RTF files to text. 
12 | func ConvertRTF(r io.Reader) (string, map[string]string, error) { 13 | f, err := NewLocalFile(r) 14 | if err != nil { 15 | return "", nil, fmt.Errorf("error creating local file: %v", err) 16 | } 17 | defer f.Done() 18 | 19 | var output string 20 | tmpOutput, err := exec.Command("unrtf", "--nopict", "--text", f.Name()).Output() 21 | if err != nil { 22 | return "", nil, fmt.Errorf("unrtf error: %v", err) 23 | } 24 | 25 | // Step through content looking for meta data and stripping out comments 26 | meta := make(map[string]string) 27 | for _, line := range strings.Split(string(tmpOutput), "\n") { 28 | if parts := strings.SplitN(line, ":", 2); len(parts) > 1 { 29 | meta[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1]) 30 | } 31 | if !strings.HasPrefix(line, "### ") { 32 | output += line + "\n" 33 | } 34 | } 35 | 36 | // Identify meta data 37 | if tmp, ok := meta["AUTHOR"]; ok { 38 | meta["Author"] = tmp 39 | } 40 | if tmp, ok := meta["### creation date"]; ok { 41 | if t, err := time.Parse("02 January 2006 15:04", tmp); err == nil { 42 | meta["CreatedDate"] = fmt.Sprintf("%d", t.Unix()) 43 | } 44 | } 45 | if tmp, ok := meta["### revision date"]; ok { 46 | if t, err := time.Parse("02 January 2006 15:04", tmp); err == nil { 47 | meta["ModifiedDate"] = fmt.Sprintf("%d", t.Unix()) 48 | } 49 | } 50 | 51 | return output, meta, nil 52 | } 53 | -------------------------------------------------------------------------------- /rtf_test/rtf_test.go: -------------------------------------------------------------------------------- 1 | package rtf_test 2 | 3 | import ( 4 | "bytes" 5 | "os" 6 | "strings" 7 | "testing" 8 | 9 | "code.sajari.com/docconv/v2" 10 | ) 11 | 12 | func TestConvertRTF(t *testing.T) { 13 | data, err := os.ReadFile("testdata/test.rtf") 14 | if err != nil { 15 | t.Fatalf("got error %v, want nil", err) 16 | } 17 | res, _, err := docconv.ConvertRTF(bytes.NewReader(data)) 18 | if err != nil { 19 | t.Fatalf("got error %v, want nil", err) 20 | } 21 | lines 
:= strings.Split(res, "\n")[2:] 22 | line, expectedLine := lines[0], "hello world" 23 | if line != expectedLine { 24 | t.Fatalf("got %s, want %s", line, expectedLine) 25 | } 26 | line, expectedLine = lines[1], "helo" 27 | if line != expectedLine { 28 | t.Fatalf("got %s, want %s", line, expectedLine) 29 | } 30 | line, expectedLine = lines[2], "" 31 | if line != expectedLine { 32 | t.Fatalf("got %s, want %s", line, expectedLine) 33 | } 34 | line, expectedLine = lines[3], "1" 35 | if line != expectedLine { 36 | t.Fatalf("got %s, want %s", line, expectedLine) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /rtf_test/testdata/test.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\deff3\adeflang1025 2 | {\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\froman\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fnil\fprq2\fcharset0 Lohit Devanagari;}{\f6\fnil\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}} 3 | {\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;} 4 | {\stylesheet{\s0\snext0\dbch\af5\langfe1081\dbch\af6\afs24\alang1081\ql\widctlpar\hyphpar0\ltrpar\cf0\loch\f3\fs24\lang1033\kerning1 Normal;} 5 | {\s15\sbasedon0\snext16\dbch\af5\langfe1081\dbch\af6\afs28\ql\widctlpar\hyphpar0\sb240\sa120\keepn\ltrpar\cf0\loch\f4\fs28\lang1033\kerning1 Heading;} 6 | 
{\s16\sbasedon0\snext16\dbch\af5\langfe1081\dbch\af6\afs24\ql\sl276\slmult1\widctlpar\hyphpar0\sb0\sa140\ltrpar\cf0\loch\f3\fs24\lang1033\kerning1 Text Body;} 7 | {\s17\sbasedon16\snext17\dbch\af5\langfe1081\dbch\af6\afs24\ql\sl276\slmult1\widctlpar\hyphpar0\sb0\sa140\ltrpar\cf0\loch\f3\fs24\lang1033\kerning1 List;} 8 | {\s18\sbasedon0\snext18\dbch\af5\langfe1081\dbch\af6\afs24\ai\ql\widctlpar\hyphpar0\sb120\sa120\ltrpar\cf0\loch\f3\fs24\lang1033\kerning1 Caption;} 9 | {\s19\sbasedon0\snext19\dbch\af5\langfe1081\dbch\af6\afs24\ql\widctlpar\hyphpar0\ltrpar\cf0\loch\f3\fs24\lang1033\kerning1 Index;} 10 | }{\*\generator LibreOffice/6.4.6.2$Linux_X86_64 LibreOffice_project/40$Build-2}{\info{\creatim\yr2021\mo3\dy29\hr11\min30}{\revtim\yr2021\mo4\dy9\hr16\min36}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab709 11 | \hyphauto1\viewscale100 12 | {\*\pgdsctbl 13 | {\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Default Style;}} 14 | \formshade{\*\pgdscno0}\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc 15 | {\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\dbch\af5\langfe1081\dbch\af6\afs24\alang1081\ql\widctlpar\hyphpar0\ltrpar\cf0\loch\f3\fs24\lang1033\kerning1\ql{\rtlch\dbch\af5\langfe1081\dbch\af6\afs24 \ltrch\cf0\fs24\lang1033\kerning1\loch 16 | hello world} 17 | \par \pard\plain \s0\dbch\af5\langfe1081\dbch\af6\afs24\alang1081\ql\widctlpar\hyphpar0\ltrpar\cf0\loch\f3\fs24\lang1033\kerning1\ql{\rtlch\dbch\af5\langfe1081\dbch\af6\afs24 \ltrch\cf0\fs24\lang1033\kerning1\loch 18 | helo} 19 | \par \pard\plain \s0\dbch\af5\langfe1081\dbch\af6\afs24\alang1081\ql\widctlpar\hyphpar0\ltrpar\cf0\loch\f3\fs24\lang1033\kerning1\ql{\rtlch\dbch\af5\langfe1081\dbch\af6\afs24 \ltrch\cf0\fs24\lang1033\kerning1\loch 20 | } 21 | 
\par \pard\plain \s0\dbch\af5\langfe1081\dbch\af6\afs24\alang1081\ql\widctlpar\hyphpar0\ltrpar\cf0\loch\f3\fs24\lang1033\kerning1\ql{\rtlch\dbch\af5\langfe1081\dbch\af6\afs24 \ltrch\cf0\fs24\lang1033\kerning1\loch 22 | 1} 23 | \par } -------------------------------------------------------------------------------- /snappy/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /snappy/README: -------------------------------------------------------------------------------- 1 | This is a Snappy library for the Go programming language that has been modified to work with Apple files, which fail to set CRC checks and stream identifiers. This version is a total hack, so if you want to use snappy for other projects **DO NOT USE THIS VERSION**. Use the proper version as per below: 2 | 3 | To download and install from source: 4 | $ go get code.google.com/p/snappy-go/snappy 5 | 6 | 7 | 8 | Unless otherwise noted, the Snappy-Go source files are distributed 9 | under the BSD-style license found in the LICENSE file. 10 | 11 | Contributions should follow the same procedure as for the Go project: 12 | http://golang.org/doc/contribute.html 13 | 14 | -------------------------------------------------------------------------------- /snappy/decode.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Snappy-Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package snappy 6 | 7 | import ( 8 | "encoding/binary" 9 | "errors" 10 | "io" 11 | ) 12 | 13 | var ( 14 | // ErrCorrupt reports that the input is invalid. 
15 | ErrCorrupt = errors.New("snappy: corrupt input") 16 | // ErrUnsupported reports that the input isn't supported. 17 | ErrUnsupported = errors.New("snappy: unsupported input") 18 | ) 19 | 20 | // DecodedLen returns the length of the decoded block. 21 | func DecodedLen(src []byte) (int, error) { 22 | v, _, err := decodedLen(src) 23 | return v, err 24 | } 25 | 26 | // decodedLen returns the length of the decoded block and the number of bytes 27 | // that the length header occupied. 28 | func decodedLen(src []byte) (blockLen, headerLen int, err error) { 29 | v, n := binary.Uvarint(src) 30 | if n == 0 { 31 | return 0, 0, ErrCorrupt 32 | } 33 | if uint64(int(v)) != v { 34 | return 0, 0, errors.New("snappy: decoded block is too large") 35 | } 36 | return int(v), n, nil 37 | } 38 | 39 | // Decode returns the decoded form of src. The returned slice may be a sub- 40 | // slice of dst if dst was large enough to hold the entire decoded block. 41 | // Otherwise, a newly allocated slice will be returned. 42 | // It is valid to pass a nil dst. 
43 | func Decode(dst, src []byte) ([]byte, error) { 44 | dLen, s, err := decodedLen(src) 45 | if err != nil { 46 | return nil, err 47 | } 48 | if len(dst) < dLen { 49 | dst = make([]byte, dLen) 50 | } 51 | 52 | var d, offset, length int 53 | for s < len(src) { 54 | switch src[s] & 0x03 { 55 | case tagLiteral: 56 | x := uint(src[s] >> 2) 57 | switch { 58 | case x < 60: 59 | s += 1 60 | case x == 60: 61 | s += 2 62 | if s > len(src) { 63 | return nil, ErrCorrupt 64 | } 65 | x = uint(src[s-1]) 66 | case x == 61: 67 | s += 3 68 | if s > len(src) { 69 | return nil, ErrCorrupt 70 | } 71 | x = uint(src[s-2]) | uint(src[s-1])<<8 72 | case x == 62: 73 | s += 4 74 | if s > len(src) { 75 | return nil, ErrCorrupt 76 | } 77 | x = uint(src[s-3]) | uint(src[s-2])<<8 | uint(src[s-1])<<16 78 | case x == 63: 79 | s += 5 80 | if s > len(src) { 81 | return nil, ErrCorrupt 82 | } 83 | x = uint(src[s-4]) | uint(src[s-3])<<8 | uint(src[s-2])<<16 | uint(src[s-1])<<24 84 | } 85 | length = int(x + 1) 86 | if length <= 0 { 87 | return nil, errors.New("snappy: unsupported literal length") 88 | } 89 | if length > len(dst)-d || length > len(src)-s { 90 | return nil, ErrCorrupt 91 | } 92 | copy(dst[d:], src[s:s+length]) 93 | d += length 94 | s += length 95 | continue 96 | 97 | case tagCopy1: 98 | s += 2 99 | if s > len(src) { 100 | return nil, ErrCorrupt 101 | } 102 | length = 4 + int(src[s-2])>>2&0x7 103 | offset = int(src[s-2])&0xe0<<3 | int(src[s-1]) 104 | 105 | case tagCopy2: 106 | s += 3 107 | if s > len(src) { 108 | return nil, ErrCorrupt 109 | } 110 | length = 1 + int(src[s-3])>>2 111 | offset = int(src[s-2]) | int(src[s-1])<<8 112 | 113 | case tagCopy4: 114 | return nil, errors.New("snappy: unsupported COPY_4 tag") 115 | } 116 | 117 | end := d + length 118 | if offset > d || end > len(dst) { 119 | return nil, ErrCorrupt 120 | } 121 | for ; d < end; d++ { 122 | dst[d] = dst[d-offset] 123 | } 124 | } 125 | if d != dLen { 126 | return nil, ErrCorrupt 127 | } 128 | return dst[:d], nil 129 | 
} 130 | 131 | // NewReader returns a new Reader that decompresses from r, using the framing 132 | // format described at 133 | // https://code.google.com/p/snappy/source/browse/trunk/framing_format.txt 134 | func NewReader(r io.Reader) *Reader { 135 | return &Reader{ 136 | r: r, 137 | decoded: make([]byte, maxUncompressedChunkLen), 138 | buf: make([]byte, MaxEncodedLen(maxUncompressedChunkLen)+checksumSize), 139 | } 140 | } 141 | 142 | // Reader is an io.Reader than can read Snappy-compressed bytes. 143 | type Reader struct { 144 | r io.Reader 145 | err error 146 | decoded []byte 147 | buf []byte 148 | // decoded[i:j] contains decoded bytes that have not yet been passed on. 149 | i, j int 150 | readHeader bool 151 | } 152 | 153 | // Reset discards any buffered data, resets all state, and switches the Snappy 154 | // reader to read from r. This permits reusing a Reader rather than allocating 155 | // a new one. 156 | func (r *Reader) Reset(reader io.Reader) { 157 | r.r = reader 158 | r.err = nil 159 | r.i = 0 160 | r.j = 0 161 | r.readHeader = false 162 | } 163 | 164 | func (r *Reader) readFull(p []byte) (ok bool) { 165 | if _, r.err = io.ReadFull(r.r, p); r.err != nil { 166 | if r.err == io.ErrUnexpectedEOF { 167 | r.err = ErrCorrupt 168 | } 169 | return false 170 | } 171 | return true 172 | } 173 | 174 | // Read satisfies the io.Reader interface. 
175 | func (r *Reader) Read(p []byte) (int, error) { 176 | if r.err != nil { 177 | return 0, r.err 178 | } 179 | for { 180 | if r.i < r.j { 181 | n := copy(p, r.decoded[r.i:r.j]) 182 | r.i += n 183 | return n, nil 184 | } 185 | if !r.readFull(r.buf[:4]) { 186 | return 0, r.err 187 | } 188 | chunkType := r.buf[0] 189 | if !r.readHeader { 190 | if chunkType != chunkTypeStreamIdentifier { 191 | r.err = ErrCorrupt 192 | return 0, r.err 193 | } 194 | r.readHeader = true 195 | } 196 | chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 197 | if chunkLen > len(r.buf) { 198 | r.err = ErrUnsupported 199 | return 0, r.err 200 | } 201 | 202 | // The chunk types are specified at 203 | // https://code.google.com/p/snappy/source/browse/trunk/framing_format.txt 204 | switch chunkType { 205 | case chunkTypeCompressedData: 206 | // Section 4.2. Compressed data (chunk type 0x00). 207 | /* 208 | if chunkLen < checksumSize { 209 | r.err = ErrCorrupt 210 | return 0, r.err 211 | } 212 | */ 213 | buf := r.buf[:chunkLen] 214 | if !r.readFull(buf) { 215 | return 0, r.err 216 | } 217 | /* 218 | checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 219 | buf = buf[checksumSize:] 220 | */ 221 | n, err := DecodedLen(buf) 222 | if err != nil { 223 | r.err = err 224 | return 0, r.err 225 | } 226 | if n > len(r.decoded) { 227 | r.err = ErrCorrupt 228 | return 0, r.err 229 | } 230 | if _, err := Decode(r.decoded, buf); err != nil { 231 | r.err = err 232 | return 0, r.err 233 | } 234 | /* 235 | if crc(r.decoded[:n]) != checksum { 236 | r.err = ErrCorrupt 237 | return 0, r.err 238 | } 239 | */ 240 | r.i, r.j = 0, n 241 | continue 242 | 243 | case chunkTypeUncompressedData: 244 | // Section 4.3. Uncompressed data (chunk type 0x01). 
245 | if chunkLen < checksumSize { 246 | r.err = ErrCorrupt 247 | return 0, r.err 248 | } 249 | buf := r.buf[:checksumSize] 250 | if !r.readFull(buf) { 251 | return 0, r.err 252 | } 253 | checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 254 | // Read directly into r.decoded instead of via r.buf. 255 | n := chunkLen - checksumSize 256 | if !r.readFull(r.decoded[:n]) { 257 | return 0, r.err 258 | } 259 | if crc(r.decoded[:n]) != checksum { 260 | r.err = ErrCorrupt 261 | return 0, r.err 262 | } 263 | r.i, r.j = 0, n 264 | continue 265 | 266 | case chunkTypeStreamIdentifier: 267 | // Section 4.1. Stream identifier (chunk type 0xff). 268 | if chunkLen != len(magicBody) { 269 | r.err = ErrCorrupt 270 | return 0, r.err 271 | } 272 | if !r.readFull(r.buf[:len(magicBody)]) { 273 | return 0, r.err 274 | } 275 | for i := 0; i < len(magicBody); i++ { 276 | if r.buf[i] != magicBody[i] { 277 | r.err = ErrCorrupt 278 | return 0, r.err 279 | } 280 | } 281 | continue 282 | } 283 | 284 | if chunkType <= 0x7f { 285 | // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). 286 | r.err = ErrUnsupported 287 | return 0, r.err 288 | 289 | } else { 290 | // Section 4.4 Padding (chunk type 0xfe). 291 | // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). 292 | if !r.readFull(r.buf[:chunkLen]) { 293 | return 0, r.err 294 | } 295 | } 296 | } 297 | } 298 | -------------------------------------------------------------------------------- /snappy/encode.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Snappy-Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package snappy 6 | 7 | import ( 8 | "encoding/binary" 9 | "io" 10 | ) 11 | 12 | // We limit how far copy back-references can go, the same as the C++ code. 
13 | const maxOffset = 1 << 15 14 | 15 | // emitLiteral writes a literal chunk and returns the number of bytes written. 16 | func emitLiteral(dst, lit []byte) int { 17 | i, n := 0, uint(len(lit)-1) 18 | switch { 19 | case n < 60: 20 | dst[0] = uint8(n)<<2 | tagLiteral 21 | i = 1 22 | case n < 1<<8: 23 | dst[0] = 60<<2 | tagLiteral 24 | dst[1] = uint8(n) 25 | i = 2 26 | case n < 1<<16: 27 | dst[0] = 61<<2 | tagLiteral 28 | dst[1] = uint8(n) 29 | dst[2] = uint8(n >> 8) 30 | i = 3 31 | case n < 1<<24: 32 | dst[0] = 62<<2 | tagLiteral 33 | dst[1] = uint8(n) 34 | dst[2] = uint8(n >> 8) 35 | dst[3] = uint8(n >> 16) 36 | i = 4 37 | case int64(n) < 1<<32: 38 | dst[0] = 63<<2 | tagLiteral 39 | dst[1] = uint8(n) 40 | dst[2] = uint8(n >> 8) 41 | dst[3] = uint8(n >> 16) 42 | dst[4] = uint8(n >> 24) 43 | i = 5 44 | default: 45 | panic("snappy: source buffer is too long") 46 | } 47 | if copy(dst[i:], lit) != len(lit) { 48 | panic("snappy: destination buffer is too short") 49 | } 50 | return i + len(lit) 51 | } 52 | 53 | // emitCopy writes a copy chunk and returns the number of bytes written. 54 | func emitCopy(dst []byte, offset, length int) int { 55 | i := 0 56 | for length > 0 { 57 | x := length - 4 58 | if 0 <= x && x < 1<<3 && offset < 1<<11 { 59 | dst[i+0] = uint8(offset>>8)&0x07<<5 | uint8(x)<<2 | tagCopy1 60 | dst[i+1] = uint8(offset) 61 | i += 2 62 | break 63 | } 64 | 65 | x = length 66 | if x > 1<<6 { 67 | x = 1 << 6 68 | } 69 | dst[i+0] = uint8(x-1)<<2 | tagCopy2 70 | dst[i+1] = uint8(offset) 71 | dst[i+2] = uint8(offset >> 8) 72 | i += 3 73 | length -= x 74 | } 75 | return i 76 | } 77 | 78 | // Encode returns the encoded form of src. The returned slice may be a sub- 79 | // slice of dst if dst was large enough to hold the entire encoded block. 80 | // Otherwise, a newly allocated slice will be returned. 81 | // It is valid to pass a nil dst. 
82 | func Encode(dst, src []byte) ([]byte, error) { 83 | if n := MaxEncodedLen(len(src)); len(dst) < n { 84 | dst = make([]byte, n) 85 | } 86 | 87 | // The block starts with the varint-encoded length of the decompressed bytes. 88 | d := binary.PutUvarint(dst, uint64(len(src))) 89 | 90 | // Return early if src is short. 91 | if len(src) <= 4 { 92 | if len(src) != 0 { 93 | d += emitLiteral(dst[d:], src) 94 | } 95 | return dst[:d], nil 96 | } 97 | 98 | // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. 99 | const maxTableSize = 1 << 14 100 | shift, tableSize := uint(32-8), 1<<8 101 | for tableSize < maxTableSize && tableSize < len(src) { 102 | shift-- 103 | tableSize *= 2 104 | } 105 | var table [maxTableSize]int 106 | 107 | // Iterate over the source bytes. 108 | var ( 109 | s int // The iterator position. 110 | t int // The last position with the same hash as s. 111 | lit int // The start position of any pending literal bytes. 112 | ) 113 | for s+3 < len(src) { 114 | // Update the hash table. 115 | b0, b1, b2, b3 := src[s], src[s+1], src[s+2], src[s+3] 116 | h := uint32(b0) | uint32(b1)<<8 | uint32(b2)<<16 | uint32(b3)<<24 117 | p := &table[(h*0x1e35a7bd)>>shift] 118 | // We need to to store values in [-1, inf) in table. To save 119 | // some initialization time, (re)use the table's zero value 120 | // and shift the values against this zero: add 1 on writes, 121 | // subtract 1 on reads. 122 | t, *p = *p-1, s+1 123 | // If t is invalid or src[s:s+4] differs from src[t:t+4], accumulate a literal byte. 124 | if t < 0 || s-t >= maxOffset || b0 != src[t] || b1 != src[t+1] || b2 != src[t+2] || b3 != src[t+3] { 125 | s++ 126 | continue 127 | } 128 | // Otherwise, we have a match. First, emit any pending literal bytes. 129 | if lit != s { 130 | d += emitLiteral(dst[d:], src[lit:s]) 131 | } 132 | // Extend the match to be as long as possible. 
133 | s0 := s 134 | s, t = s+4, t+4 135 | for s < len(src) && src[s] == src[t] { 136 | s++ 137 | t++ 138 | } 139 | // Emit the copied bytes. 140 | d += emitCopy(dst[d:], s-t, s-s0) 141 | lit = s 142 | } 143 | 144 | // Emit any final pending literal bytes and return. 145 | if lit != len(src) { 146 | d += emitLiteral(dst[d:], src[lit:]) 147 | } 148 | return dst[:d], nil 149 | } 150 | 151 | // MaxEncodedLen returns the maximum length of a snappy block, given its 152 | // uncompressed length. 153 | func MaxEncodedLen(srcLen int) int { 154 | // Compressed data can be defined as: 155 | // compressed := item* literal* 156 | // item := literal* copy 157 | // 158 | // The trailing literal sequence has a space blowup of at most 62/60 159 | // since a literal of length 60 needs one tag byte + one extra byte 160 | // for length information. 161 | // 162 | // Item blowup is trickier to measure. Suppose the "copy" op copies 163 | // 4 bytes of data. Because of a special check in the encoding code, 164 | // we produce a 4-byte copy only if the offset is < 65536. Therefore 165 | // the copy op takes 3 bytes to encode, and this type of item leads 166 | // to at most the 62/60 blowup for representing literals. 167 | // 168 | // Suppose the "copy" op copies 5 bytes of data. If the offset is big 169 | // enough, it will take 5 bytes to encode the copy op. Therefore the 170 | // worst case here is a one-byte literal followed by a five-byte copy. 171 | // That is, 6 bytes of input turn into 7 bytes of "compressed" data. 
172 | // 173 | // This last factor dominates the blowup, so the final estimate is: 174 | return 32 + srcLen + srcLen/6 175 | } 176 | 177 | // NewWriter returns a new Writer that compresses to w, using the framing 178 | // format described at 179 | // https://code.google.com/p/snappy/source/browse/trunk/framing_format.txt 180 | func NewWriter(w io.Writer) *Writer { 181 | return &Writer{ 182 | w: w, 183 | enc: make([]byte, MaxEncodedLen(maxUncompressedChunkLen)), 184 | } 185 | } 186 | 187 | // Writer is an io.Writer than can write Snappy-compressed bytes. 188 | type Writer struct { 189 | w io.Writer 190 | err error 191 | enc []byte 192 | buf [checksumSize + chunkHeaderSize]byte 193 | wroteHeader bool 194 | } 195 | 196 | // Reset discards the writer's state and switches the Snappy writer to write to 197 | // w. This permits reusing a Writer rather than allocating a new one. 198 | func (w *Writer) Reset(writer io.Writer) { 199 | w.w = writer 200 | w.err = nil 201 | w.wroteHeader = false 202 | } 203 | 204 | // Write satisfies the io.Writer interface. 205 | func (w *Writer) Write(p []byte) (n int, errRet error) { 206 | if w.err != nil { 207 | return 0, w.err 208 | } 209 | if !w.wroteHeader { 210 | copy(w.enc, magicChunk) 211 | if _, err := w.w.Write(w.enc[:len(magicChunk)]); err != nil { 212 | w.err = err 213 | return n, err 214 | } 215 | w.wroteHeader = true 216 | } 217 | for len(p) > 0 { 218 | var uncompressed []byte 219 | if len(p) > maxUncompressedChunkLen { 220 | uncompressed, p = p[:maxUncompressedChunkLen], p[maxUncompressedChunkLen:] 221 | } else { 222 | uncompressed, p = p, nil 223 | } 224 | checksum := crc(uncompressed) 225 | 226 | // Compress the buffer, discarding the result if the improvement 227 | // isn't at least 12.5%. 
228 | chunkType := uint8(chunkTypeCompressedData) 229 | chunkBody, err := Encode(w.enc, uncompressed) 230 | if err != nil { 231 | w.err = err 232 | return n, err 233 | } 234 | if len(chunkBody) >= len(uncompressed)-len(uncompressed)/8 { 235 | chunkType, chunkBody = chunkTypeUncompressedData, uncompressed 236 | } 237 | 238 | chunkLen := 4 + len(chunkBody) 239 | w.buf[0] = chunkType 240 | w.buf[1] = uint8(chunkLen >> 0) 241 | w.buf[2] = uint8(chunkLen >> 8) 242 | w.buf[3] = uint8(chunkLen >> 16) 243 | w.buf[4] = uint8(checksum >> 0) 244 | w.buf[5] = uint8(checksum >> 8) 245 | w.buf[6] = uint8(checksum >> 16) 246 | w.buf[7] = uint8(checksum >> 24) 247 | if _, err = w.w.Write(w.buf[:]); err != nil { 248 | w.err = err 249 | return n, err 250 | } 251 | if _, err = w.w.Write(chunkBody); err != nil { 252 | w.err = err 253 | return n, err 254 | } 255 | n += len(uncompressed) 256 | } 257 | return n, nil 258 | } 259 | -------------------------------------------------------------------------------- /snappy/snappy.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Snappy-Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package snappy implements the snappy block-based compression format. 6 | // It aims for very high speeds and reasonable compression. 7 | // 8 | // The C++ snappy implementation is at http://code.google.com/p/snappy/ 9 | package snappy 10 | 11 | import ( 12 | "hash/crc32" 13 | ) 14 | 15 | /* 16 | Each encoded block begins with the varint-encoded length of the decoded data, 17 | followed by a sequence of chunks. Chunks begin and end on byte boundaries. The 18 | first byte of each chunk is broken into its 2 least and 6 most significant bits 19 | called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. 20 | Zero means a literal tag. All other values mean a copy tag. 
/*
Each encoded block begins with the varint-encoded length of the decoded data,
followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
first byte of each chunk is broken into its 2 least and 6 most significant bits
called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
Zero means a literal tag. All other values mean a copy tag.

For literal tags:
  - If m < 60, the next 1 + m bytes are literal bytes.
  - Otherwise, let n be the little-endian unsigned integer denoted by the next
    m - 59 bytes. The next 1 + n bytes after that are literal bytes.

For copy tags, length bytes are copied from offset bytes ago, in the style of
Lempel-Ziv compression algorithms. In particular:
  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
    of the offset. The next byte is bits 0-7 of the offset.
  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
    The length is 1 + m. The offset is the little-endian unsigned integer
    denoted by the next 2 bytes.
  - For l == 3, this tag is a legacy format that is no longer supported.
*/

// Chunk tags: the value of l, the two low bits of a chunk's first byte.
const (
	tagLiteral = iota // 0x00
	tagCopy1          // 0x01
	tagCopy2          // 0x02
	tagCopy4          // 0x03
)

// Sizes and magic values of the framing format.
const (
	checksumSize    = 4
	chunkHeaderSize = 4
	magicBody       = "sNaPpY"
	magicChunk      = "\xff\x06\x00\x00" + magicBody
	// https://code.google.com/p/snappy/source/browse/trunk/framing_format.txt says
	// that "the uncompressed data in a chunk must be no longer than 65536 bytes".
	maxUncompressedChunkLen = 65536
)

// Chunk type bytes of the framing format.
const (
	chunkTypeCompressedData   = 0x00
	chunkTypeUncompressedData = 0x01
	chunkTypePadding          = 0xfe
	chunkTypeStreamIdentifier = 0xff
)

// crcTable is the CRC-32 table for the Castagnoli polynomial, built once.
var crcTable = crc32.MakeTable(crc32.Castagnoli)

// crc implements the checksum specified in section 3 of
// https://code.google.com/p/snappy/source/browse/trunk/framing_format.txt
//
// It takes the CRC-32 (Castagnoli) of b, rotates it right by 15 bits and adds
// the constant 0xa282ead8, wrapping around at 32 bits.
func crc(b []byte) uint32 {
	sum := crc32.Checksum(b, crcTable)
	rotated := sum>>15 | sum<<17
	return rotated + 0xa282ead8
}
12 | func Tidy(r io.Reader, xmlIn bool) ([]byte, error) { 13 | f, err := os.CreateTemp(os.TempDir(), "docconv") 14 | if err != nil { 15 | return nil, err 16 | } 17 | defer os.Remove(f.Name()) 18 | io.Copy(f, r) 19 | 20 | var output []byte 21 | if xmlIn { 22 | output, err = exec.Command("tidy", "-xml", "-numeric", "-asxml", "-quiet", "-utf8", f.Name()).Output() 23 | } else { 24 | output, err = exec.Command("tidy", "-numeric", "-asxml", "-quiet", "-utf8", f.Name()).Output() 25 | } 26 | 27 | if err != nil && err.Error() != "exit status 1" { 28 | return nil, err 29 | } 30 | return output, nil 31 | } 32 | -------------------------------------------------------------------------------- /url.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | 7 | "github.com/advancedlogic/GoOse" 8 | ) 9 | 10 | // ConvertURL fetches the HTML page at the URL given in the io.Reader. 11 | func ConvertURL(input io.Reader, readability bool) (string, map[string]string, error) { 12 | meta := make(map[string]string) 13 | 14 | buf := new(bytes.Buffer) 15 | _, err := buf.ReadFrom(input) 16 | if err != nil { 17 | return "", nil, err 18 | } 19 | 20 | g := goose.New() 21 | article, err := g.ExtractFromURL(buf.String()) 22 | if err != nil { 23 | return "", nil, err 24 | } 25 | 26 | meta["title"] = article.Title 27 | meta["description"] = article.MetaDescription 28 | meta["image"] = article.TopImage 29 | 30 | return article.CleanedText, meta, nil 31 | } 32 | -------------------------------------------------------------------------------- /xml.go: -------------------------------------------------------------------------------- 1 | package docconv 2 | 3 | import ( 4 | "bytes" 5 | "encoding/xml" 6 | "fmt" 7 | "io" 8 | ) 9 | 10 | // ConvertXML converts an XML file to text. 
11 | func ConvertXML(r io.Reader) (string, map[string]string, error) { 12 | meta := make(map[string]string) 13 | cleanXML, err := Tidy(r, true) 14 | if err != nil { 15 | return "", nil, fmt.Errorf("tidy error: %v", err) 16 | } 17 | result, err := XMLToText(bytes.NewReader(cleanXML), []string{}, []string{}, true) 18 | if err != nil { 19 | return "", nil, fmt.Errorf("error from XMLToText: %v", err) 20 | } 21 | return result, meta, nil 22 | } 23 | 24 | // XMLToText converts XML to plain text given how to treat elements. 25 | func XMLToText(r io.Reader, breaks []string, skip []string, strict bool) (string, error) { 26 | var result string 27 | 28 | dec := xml.NewDecoder(io.LimitReader(r, maxBytes)) 29 | dec.Strict = strict 30 | for { 31 | t, err := dec.Token() 32 | if err != nil { 33 | if err == io.EOF { 34 | break 35 | } 36 | return "", err 37 | } 38 | 39 | switch v := t.(type) { 40 | case xml.CharData: 41 | result += string(v) 42 | case xml.StartElement: 43 | for _, breakElement := range breaks { 44 | if v.Name.Local == breakElement { 45 | result += "\n" 46 | } 47 | } 48 | for _, skipElement := range skip { 49 | if v.Name.Local == skipElement { 50 | depth := 1 51 | for { 52 | t, err := dec.Token() 53 | if err != nil { 54 | // An io.EOF here is actually an error. 55 | return "", err 56 | } 57 | 58 | switch t.(type) { 59 | case xml.StartElement: 60 | depth++ 61 | case xml.EndElement: 62 | depth-- 63 | } 64 | 65 | if depth == 0 { 66 | break 67 | } 68 | } 69 | } 70 | } 71 | } 72 | } 73 | return result, nil 74 | } 75 | 76 | // XMLToMap converts XML to a nested string map. 
77 | func XMLToMap(r io.Reader) (map[string]string, error) { 78 | m := make(map[string]string) 79 | dec := xml.NewDecoder(io.LimitReader(r, maxBytes)) 80 | var tagName string 81 | for { 82 | t, err := dec.Token() 83 | if err != nil { 84 | if err == io.EOF { 85 | break 86 | } 87 | return nil, err 88 | } 89 | 90 | switch v := t.(type) { 91 | case xml.StartElement: 92 | tagName = string(v.Name.Local) 93 | case xml.CharData: 94 | m[tagName] = string(v) 95 | } 96 | } 97 | return m, nil 98 | } 99 | --------------------------------------------------------------------------------