├── .appveyor.yml
├── .drone.yml
├── .gitattributes
├── .travis.yml
├── LICENSE.md
├── README.md
├── cmd
│   ├── dictgen
│   │   └── main.go
│   └── dictutil
│       ├── install.go
│       ├── main.go
│       ├── pack.go
│       ├── prefix.go
│       ├── uninstall.go
│       └── unpack.go
├── dictgen
│   ├── dictfile.go
│   ├── dictfile_test.go
│   ├── dictgen.go
│   ├── image.go
│   └── image_test.go
├── docs
│   ├── _config.yml
│   ├── _includes
│   │   └── head_custom.html
│   ├── dictgen
│   │   └── index.md
│   ├── dicthtml
│   │   ├── format.md
│   │   ├── index.md
│   │   ├── install.md
│   │   ├── matching.md
│   │   ├── prefixes.md
│   │   ├── v1v2-1.png
│   │   ├── v1v2-2.png
│   │   └── v1v2.md
│   ├── dictutil
│   │   ├── index.md
│   │   ├── install.md
│   │   ├── pack.md
│   │   ├── prefix.md
│   │   ├── uninstall.md
│   │   └── unpack.md
│   ├── examples
│   │   ├── bgl-convert.md
│   │   ├── dictzip-decompile.md
│   │   ├── gotdict-convert.md
│   │   ├── index.md
│   │   └── webster1913-convert.md
│   └── index.md
├── examples
│   ├── bgl-convert
│   │   └── index.html
│   ├── dictzip-decompile
│   │   ├── main.go
│   │   └── parse.go
│   ├── gotdict-convert
│   │   ├── gotdict
│   │   │   └── parser.go
│   │   └── main.go
│   └── webster1913-convert
│       ├── main.go
│       └── webster1913
│           └── parser.go
├── go.mod
├── go.sum
├── kobodict
│   ├── crypt.go
│   ├── crypt_test.go
│   ├── fs.go
│   ├── fs_test.go
│   ├── marisa.go
│   ├── marisa
│   │   ├── marisa.go
│   │   ├── marisa_cgo.go
│   │   └── marisa_test.go
│   ├── reader.go
│   ├── reader_test.go
│   ├── util.go
│   ├── util_test.go
│   ├── writer.go
│   └── writer_test.go
└── marisa
    ├── libmarisa.cc
    ├── libmarisa.h
    ├── libmarisa_generate.go
    ├── marisa.cc
    ├── marisa.go
    ├── marisa_test.go
    ├── shim.go
    └── shim.h
/.appveyor.yml:
--------------------------------------------------------------------------------
1 | image: ubuntu
2 | version: "{build}"
3 |
4 | environment:
5 | GO111MODULE: on
6 |
7 | install:
8 | - go mod download
9 |
10 | build_script:
11 | - mkdir bin gotdict webster1913
12 | - CGO_ENABLED=1 go build -o ./bin/dictgen ./cmd/dictgen
13 | - CGO_ENABLED=0 go build -o ./bin/gotdict-convert ./examples/gotdict-convert
14 | - CGO_ENABLED=0 go build -o ./bin/webster1913-convert ./examples/webster1913-convert
15 | - curl -#Lo ./webster1913/webster1913.txt http://www.gutenberg.org/ebooks/29765.txt.utf-8
16 | - curl -#Lo - https://github.com/wjdp/gotdict/archive/6b4d6cdbb1f5d899d418783ab842f487aafa79ec.tar.gz | tar -xzf - --strip-components=1 -C ./gotdict
17 | - ./bin/gotdict-convert -o ./gotdict/gotdict.df -g ./gotdict --images
18 | - ./bin/gotdict-convert -o ./gotdict/gotdict.noimg.df -g ./gotdict
19 | - ./bin/webster1913-convert -o ./webster1913/webster1913.df ./webster1913/webster1913.txt
20 | - ./bin/dictgen -Ibase64 -o ./gotdict/dicthtml-gt.zip ./gotdict/gotdict.df
21 | - ./bin/dictgen -Iremove -o ./gotdict/dicthtml-gt.noimg.zip ./gotdict/gotdict.noimg.df
22 | - ./bin/dictgen -Iremove -o ./webster1913/dicthtml-wb.zip ./webster1913/webster1913.df
23 |
24 | test_script:
25 | - go test -v -cover ./...
26 | - mkdir tmp
27 | - CGO_ENABLED=1 go build -o ./bin/dictutil ./cmd/dictutil
28 | - ./bin/dictutil u -o ./tmp/1 ./gotdict/dicthtml-gt.zip
29 | - ./bin/dictutil u -o ./tmp/2 ./gotdict/dicthtml-gt.noimg.zip
30 | - ./bin/dictutil u -o ./tmp/3 ./webster1913/dicthtml-wb.zip
31 | - ./bin/dictutil p -o ./tmp/1.zip ./tmp/1
32 | - ./bin/dictutil p -o ./tmp/2.zip ./tmp/2
33 | - ./bin/dictutil p -o ./tmp/3.zip ./tmp/3
34 | - sha1sum ./gotdict/dicthtml-gt.zip ./gotdict/dicthtml-gt.noimg.zip ./webster1913/dicthtml-wb.zip
35 | - sha1sum ./tmp/1.zip ./tmp/2.zip ./tmp/3.zip
36 | - cmp ./tmp/1.zip ./gotdict/dicthtml-gt.zip
37 | - cmp ./tmp/2.zip ./gotdict/dicthtml-gt.noimg.zip
38 | - cmp ./tmp/3.zip ./webster1913/dicthtml-wb.zip
39 |
40 | artifacts:
41 | - path: gotdict/gotdict.df
42 | - path: gotdict/gotdict.noimg.df
43 | - path: gotdict/dicthtml-gt.zip
44 | - path: gotdict/dicthtml-gt.noimg.zip
45 | - path: webster1913/webster1913.df
46 | - path: webster1913/dicthtml-wb.zip
47 |
48 | deploy: off
49 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | marisa/libmarisa.cc linguist-generated=true
2 | marisa/libmarisa.h linguist-generated=true
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | os:
2 | - osx
3 |
4 | language: go
5 |
6 | go:
7 | - 1.14.x
8 |
9 | env:
10 | GO111MODULE: "on"
11 |
12 | script:
13 | - go run -mod=readonly ./cmd/dictutil --help
14 | - go run -mod=readonly ./cmd/dictgen --help
15 | - go run -mod=readonly ./examples/dictzip-decompile --help
16 | - go run -mod=readonly ./examples/gotdict-convert --help
17 | - go run -mod=readonly ./examples/webster1913-convert --help
18 | - go test -mod=readonly -v ./...
19 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Patrick Gaskin
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dictutil
2 |
3 | [](https://github.com/pgaskin/dictutil/releases) [](https://cloud.drone.io/pgaskin/dictutil) [](https://pkg.go.dev/mod/github.com/pgaskin/dictutil?tab=versions) [](https://goreportcard.com/report/github.com/pgaskin/dictutil)
4 |
5 | This repository contains a collection of tools and libraries to work with Kobo dictionaries, plus comprehensive documentation of Kobo's dictionary format.
6 |
7 | Unlike previous attempts at working with Kobo dictionaries, dictutil has full support for all features supported by nickel (word prefixes, unicode, variants, images, etc), with a focus on simplicity, correctness (prefix generation and other features are directly tested against libnickel's code and regexps, v1/v2 dictionaries are differentiated), and completeness (most of the research was done by reverse-engineering libnickel).
8 |
9 | Dictutil consists of multiple tools and libraries:
10 | - [**dictutil**](https://pgaskin.net/dictutil/dictutil/) provides commands for installing, removing, unpacking, packing, and performing low-level modifications and tests on Kobo dictionaries. All operations are intended to be correct, lossless, and deterministic.
11 | - [**dictgen**](https://pgaskin.net/dictutil/dictgen/) simplifies creating full-featured dictionaries for Kobo eReaders, with support for images, unicode prefixes, raw html, markdown, and more.
12 | - [**dicthtml**](https://pgaskin.net/dictutil/dicthtml/) documents Kobo's dictionary format and how it works.
13 | - [**examples/gotdict-convert**](https://pgaskin.net/dictutil/examples/gotdict-convert.html) is a working example of using dictutil to convert [GOTDict](https://github.com/wjdp/gotdict) into a Kobo dictionary.
14 | - [**examples/webster1913-convert**](https://pgaskin.net/dictutil/examples/webster1913-convert.html) is a working example of using dictutil to convert [Project Gutenberg's Webster's Unabridged Dictionary](http://www.gutenberg.org/ebooks/29765.txt.utf-8) into a Kobo dictionary.
15 | - [**examples/dictzip-decompile**](https://pgaskin.net/dictutil/examples/dictzip-decompile.html) is an **experimental** tool to convert a dictzip into a dictfile.
16 | - [**examples/bgl-convert**](https://pgaskin.net/dictutil/examples/bgl-convert.html) is a simple tool to convert Babylon BGL dictionaries to a dictfile.
17 | - *Library:* [**kobodict**](https://pkg.go.dev/github.com/pgaskin/dictutil/kobodict) provides support for reading, writing, encrypting, and decrypting Kobo dictionaries.
18 | - *Library:* [**dictgen**](https://pkg.go.dev/github.com/pgaskin/dictutil/dictgen) provides the functionality of dictgen as a library (a minimal usage sketch follows this list).
19 | - *Library:* [**marisa**](./marisa) provides a simplified self-contained CGO wrapper for [marisa-trie](https://github.com/s-yata/marisa-trie).
20 |
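For a rough idea of how the Go libraries fit together, here is a minimal, untested sketch (mirroring what `cmd/dictgen` does; the file names are placeholders) that compiles a dictfile into a dictzip using the `dictgen` and `kobodict` packages:

```go
package main

import (
	"os"

	"github.com/pgaskin/dictutil/dictgen"
	"github.com/pgaskin/dictutil/kobodict"

	// dictgen requires a marisa implementation to build the word index.
	_ "github.com/pgaskin/dictutil/kobodict/marisa"
)

func main() {
	// Parse and validate the dictfile (error handling shortened for brevity).
	in, err := os.Open("example.df") // placeholder path
	if err != nil {
		panic(err)
	}
	defer in.Close()

	df, err := dictgen.ParseDictFile(in)
	if err != nil {
		panic(err)
	}
	if err := df.Validate(); err != nil {
		panic(err)
	}

	// Write the dictzip, stripping images for simplicity.
	out, err := os.Create("dicthtml-xx.zip") // placeholder path
	if err != nil {
		panic(err)
	}
	defer out.Close()

	dw := kobodict.NewWriter(out)
	if err := df.WriteDictzip(dw, new(dictgen.ImageHandlerRemove), dictgen.ImageFuncFilesystem); err != nil {
		panic(err)
	}
	if err := dw.Close(); err != nil {
		panic(err)
	}
}
```

See `cmd/dictgen/main.go` for the full version with image handling, encryption, and stdin/stdout support.
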
21 | Dictutil implements [version 2](https://pgaskin.net/dictutil/dicthtml/v1v2.html) of the Kobo dictionary format, which supports firmware versions 4.7.10364+.
22 |
23 | For more information, see the [documentation](https://pgaskin.net/dictutil/). If you just want a quick overview of the utilities provided, continue reading below.
24 |
25 | ## Download
26 | - **Documentation** can be found on the [website](https://pgaskin.net/dictutil/).
27 | - **Tools** (dictutil, dictgen, gotdict-convert, webster1913-convert) can be downloaded from the [releases](https://github.com/pgaskin/dictutil/releases) page.
28 | - **Pre-built dictionaries** from gotdict-convert and webster1913-convert can be downloaded from [AppVeyor](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts) or from the links below:
29 | - GOTDict *(with images, firmware 4.20.14601+)*: [dictzip (dicthtml-gt.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.df?branch=master&all=false&pr=false)
30 | - GOTDict *(without images)*: [dictzip (dicthtml-gt.noimg.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.noimg.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.noimg.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.noimg.df?branch=master&all=false&pr=false)
31 | - Webster's 1913 Dictionary: [dictzip (dicthtml-wb.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/dicthtml-wb.zip?branch=master&all=false&pr=false), [source dictfile (webster1913.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/webster1913.df?branch=master&all=false&pr=false)
32 | - **API documentation** for the Go libraries can be found on [pkg.go.dev](https://pkg.go.dev/github.com/pgaskin/dictutil).
33 |
34 | ## Usage
35 | See the [documentation](https://pgaskin.net/dictutil/) for more detailed information and examples.
36 |
37 | ### dictutil
38 |
39 | ```
40 | Usage: dictutil command [options] [arguments]
41 |
42 | Dictutil provides low-level utilities to manipulate Kobo dictionaries (v2).
43 |
44 | Commands:
45 | install (I) Install a dictzip file
46 | pack (p) Pack a dictzip file
47 | prefix (x) Calculate the prefix for a word
48 | uninstall (U) Uninstall a dictzip file
49 | unpack (u) Unpack a dictzip file
50 | help Show help for all commands
51 |
52 | Options:
53 | -h, --help Show this help text
54 | ```
55 |
56 | ```
57 | Usage: dictutil install [options] dictzip
58 |
59 | Options:
60 | -k, --kobo string KOBOeReader path (default: automatically detected)
61 | -l, --locale string Locale name to use (format: ALPHANUMERIC{2}[-ALPHANUMERIC{2}]) (default: detected from filename if in format dicthtml-**.zip)
62 | -n, --name string Custom additional label for dictionary (ignored when replacing built-in dictionaries) (doesn't have any effect on 4.20.14601+)
63 | -b, --builtin string How to handle built-in locales [replace = replace and prevent from syncing] [ignore = replace and leave syncing as-is] (doesn't have any effect on 4.24.15672+) (default "replace")
64 | -B, --no-custom Whether to force installation to .kobo/dict instead of .kobo/custom-dict (4.24.15672+ only)
65 | --use-extra-locales Whether to use ExtraLocales on 4.24.15672+ if not a built-in dictionary (this is not required anymore since 4.24.15672) (4.24.15672+ only)
66 | -h, --help Show this help text
67 |
68 | Note:
69 | If you are not replacing a built-in dictionary and are using a firmware
70 | version before 4.24.15672, the 'Enable searches on extra dictionaries patch'
71 | must be installed or you will not be able to select your custom dictionary.
72 | ```
73 |
74 | ```
75 | Usage: dictutil uninstall [options] locale
76 |
77 | Options:
78 | -k, --kobo string KOBOeReader path (default: automatically detected)
79 | -b, --builtin string How to handle built-in locales [normal = uninstall the same way as the UI] [delete = completely delete the entry (doesn't have any effect on 4.20.14601+)] [restore = download the original dictionary from Kobo again] (doesn't have any effect on 4.24.15672+) (default "normal")
80 | -B, --no-custom Uninstall built-in dictionaries instead of custom ones on 4.24.15672+
81 | -h, --help Show this help text
82 | ```
83 |
84 | ```
85 | Usage: dictutil pack [options] dictdir
86 |
87 | Options:
88 | -o, --output string The output dictzip filename (will be overwritten if it exists) (default "dicthtml.zip")
89 | -c, --crypt string Encrypt the dictzip using the specified encryption method (format: method:keyhex)
90 | -h, --help Show this help text
91 | ```
92 |
93 | ```
94 | Usage: dictutil unpack [options] dictzip
95 |
96 | Options:
97 | -o, --output string The output directory (must not exist) (default: the basename of the input without the extension)
98 | -c, --crypt string Decrypt the dictzip (if needed) using the specified encryption method (format: method:keyhex)
99 | -h, --help Show this help text
100 | ```
101 |
102 | ```
103 | Usage: dictutil prefix [options] word...
104 |
105 | Options:
106 | -f, --format string The output format (go-slice, go-map, csv, tsv, json-array, json-object) (default "json-array")
107 | -h, --help Show this help text
108 | ```
109 |
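The pack and unpack commands are thin wrappers around the `kobodict` library (`kobodict.Pack` and `kobodict.Unpack`). As a rough, untested sketch (the paths are placeholders), unpacking a dictzip programmatically looks like this:

```go
package main

import (
	"os"

	"github.com/pgaskin/dictutil/kobodict"

	// Provides the marisa implementation used for the word index
	// (registered the same way in cmd/dictutil).
	_ "github.com/pgaskin/dictutil/kobodict/marisa"
)

func main() {
	// Open the dictzip (placeholder path) and unpack it into a directory,
	// as `dictutil unpack` does.
	f, err := os.Open("dicthtml-xx.zip")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	s, err := f.Stat()
	if err != nil {
		panic(err)
	}

	dr, err := kobodict.NewReader(f, s.Size())
	if err != nil {
		panic(err)
	}
	if err := kobodict.Unpack(dr, "dicthtml-xx"); err != nil {
		panic(err)
	}
}
```
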
110 | ### dictgen
111 |
112 | ```
113 | Usage: dictgen [options] dictfile...
114 |
115 | Options:
116 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "dicthtml.zip")
117 | -c, --crypt string Encrypt the dictzip using the specified encryption method (format: method:keyhex)
118 | -I, --image-method string How to handle images (if an image path is relative, it is loaded from the current dir) (base64 - optimize and encode as base64, embed - add to dictzip, remove) (default "base64")
119 | --remove-footer Add code to prevent the non-applicable dictionary source footer for certain locales from being added after the entry (e.g. if replacing the French dictionary)
120 | -h, --help Show this help text
121 |
122 | If multiple dictfiles (*.df) are provided, they will be merged (duplicate entries are fine; they will be shown in sequential order). To read from stdin, use - as the filename.
123 |
124 | Note that the only usable image method is currently removing them or using base64-encoding (for firmware 4.20.14601+; older versions segfault in the in-book dictionary), as embedded dict:/// image URLs cause the webviews to appear blank (this is a nickel bug). See https://github.com/pgaskin/dictutil/issues/1 for more details.
125 |
126 | See https://pgaskin.net/dictutil/dictgen for more information about the dictfile format.
127 | ```
128 |
129 | **See [here](https://pgaskin.net/dictutil/dictgen/) for information and examples of the dictfile format.**
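As a quick taste of the format (an illustrative snippet with made-up content; see the linked page for the full syntax), an entry starts with `@` and a headword, optionally followed by `:` header info (or `::` to disable the header) and `&` variants, with the remaining lines forming the definition (markdown by default, or raw HTML if it starts with `<html>`):

```
@ example
: (noun)
& examples
A thing that is representative of its kind.

1. The definition body is treated as markdown.
2. Multiple entries can follow in the same file.
```
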
130 |
131 | ### gotdict-convert
132 |
133 | ```
134 | Usage: gotdict-convert [options]
135 |
136 | Options:
137 | -g, --gotdict string The path to the local copy of github.com/wjdp/gotdict. (default "./gotdict")
138 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./gotdict.df")
139 | -I, --images Include images in dictfile
140 | -h, --help Show this help text
141 |
142 | To convert the resulting dictfile into a dictzip, use dictgen.
143 | ```
144 |
145 | ### webster1913-convert
146 |
147 | ```
148 | Usage: webster1913-convert [options] gutenberg_webster1913_path
149 |
150 | Options:
151 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./webster1913.df")
152 | --dump Instead of converting, dump the parsed dictionary to stdout as JSON (for debugging)
153 | -h, --help Show this help text
154 |
155 | Arguments:
156 | gutenberg_webster1913_path is the path to Project Gutenberg's Webster's 1913 dictionary. Use - to read from stdin.
157 |
158 | To convert the resulting dictfile into a dictzip, use dictgen.
159 | ```
160 |
161 | The original dictionary can be downloaded [here](http://www.gutenberg.org/ebooks/29765.txt.utf-8) or [here](https://github.com/pgaskin/dictserver/raw/master/data/dictionary.txt).
162 |
163 | ### dictzip-decompile
164 |
165 | ```
166 | Usage: dictzip-decompile [options] dictzip
167 |
168 | Options:
169 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./decompiled.df")
170 | -r, --resources Also extract referenced resources to the current directory (warning: any existing files will be overwritten, so it is recommended to run in an empty directory if enabled)
171 | -h, --help Show this help text
172 |
173 | Arguments:
174 | dictzip is the path to the dictzip to decompile.
175 |
176 | To convert the resulting dictfile into a dictzip, use dictgen.
177 |
178 | Note: The regenerated dictzip from the dictfile may not match exactly, but it will look the same, and certain bugs with prefixes and variants will be implicitly fixed by the conversion process (i.e. variant in wrong file, incorrect prefix, missing words in index file). All output is in raw HTML, not Markdown.
179 |
180 | This is an experimental tool, and the output may not be perfect on complex dictionaries.
181 | ```
182 |
--------------------------------------------------------------------------------
/cmd/dictgen/main.go:
--------------------------------------------------------------------------------
1 | // Command dictgen is a CLI wrapper around package dictgen.
2 | package main
3 |
4 | import (
5 | "encoding/hex"
6 | "fmt"
7 | "io"
8 | "os"
9 | "strings"
10 |
11 | _ "image/gif"
12 | _ "image/jpeg"
13 | _ "image/png"
14 |
15 | "github.com/pgaskin/dictutil/dictgen"
16 | "github.com/pgaskin/dictutil/kobodict"
17 | "github.com/spf13/pflag"
18 |
19 | _ "github.com/pgaskin/dictutil/kobodict/marisa"
20 | )
21 |
22 | var version = "dev"
23 |
24 | func main() {
25 | pflag.CommandLine.SortFlags = false
26 | output := pflag.StringP("output", "o", "dicthtml.zip", "The output filename (will be overwritten if it exists) (- is stdout)")
27 | crypt := pflag.StringP("crypt", "c", "", "Encrypt the dictzip using the specified encryption method (format: method:keyhex)")
28 | imageMethod := pflag.StringP("image-method", "I", "base64", "How to handle images (if an image path is relative, it is loaded from the current dir) (base64 - optimize and encode as base64, embed - add to dictzip, remove)")
29 | removeFooter := pflag.Bool("remove-footer", false, "Add code to prevent the non-applicable dictionary source footer for certain locales from being added after the entry (e.g. if replacing the French dictionary)")
30 | help := pflag.BoolP("help", "h", false, "Show this help text")
31 | pflag.Parse()
32 |
33 | if *help || pflag.NArg() == 0 {
34 | fmt.Fprintf(os.Stderr, "Usage: %s [options] dictfile...\n\nVersion: dictgen %s\n\nOptions:\n%s\nIf multiple dictfiles (*.df) are provided, they will be merged (duplicate entries are fine; they will be shown in sequential order). To read from stdin, use - as the filename.\n\nNote that currently, the only usable image method is removing them or using base64-encoding (for firmware 4.20.14601+; older versions segfault in the in-book dictionary if images are enabled), as embedded dict:/// image URLs cause the webviews to appear blank (this is a nickel bug). See https://github.com/pgaskin/dictutil/issues/1 for more details.\n\nSee https://pgaskin.net/dictutil/dictgen for more information about the dictfile format.\n", os.Args[0], version, pflag.CommandLine.FlagUsages())
35 | os.Exit(0)
36 | return
37 | }
38 |
39 | var e kobodict.Crypter
40 | if *crypt != "" {
41 | if spl := strings.SplitN(*crypt, ":", 2); len(spl) < 2 {
42 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: no ':' found.\n")
43 | os.Exit(2)
44 | return
45 | } else if key, err := hex.DecodeString(spl[1]); err != nil {
46 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: decode hex: %v.\n", err)
47 | os.Exit(2)
48 | return
49 | } else if enc, err := kobodict.NewCrypter(spl[0], key); err != nil {
50 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: initialize encrypter: %v.\n", err)
51 | os.Exit(2)
52 | return
53 | } else {
54 | e = enc
55 | }
56 | }
57 |
58 | var ih dictgen.ImageHandler
59 | switch *imageMethod {
60 | case "base64":
61 | ih = new(dictgen.ImageHandlerBase64)
62 | case "embed":
63 | ih = new(dictgen.ImageHandlerEmbed)
64 | case "remove":
65 | ih = new(dictgen.ImageHandlerRemove)
66 | default:
67 | fmt.Fprintf(os.Stderr, "Error: invalid value for --image-method, see --help for details.")
68 | os.Exit(2)
69 | return
70 | }
71 |
72 | var tdf dictgen.DictFile
73 |
74 | fmt.Fprintf(os.Stderr, "Parsing dictfiles.\n")
75 | var seenStdin bool
76 | for _, fn := range pflag.Args() {
77 | if fn == "-" {
78 | if seenStdin {
79 | fmt.Fprintf(os.Stderr, "Error: stdin can only be specified once.\n")
80 | os.Exit(1)
81 | return
82 | }
83 | seenStdin = true
84 | }
85 |
86 | if err := func() error {
87 | var fr io.Reader
88 | if fn == "-" {
89 | fr = os.Stdin
90 | } else {
91 | f, err := os.OpenFile(fn, os.O_RDONLY, 0)
92 | if err != nil {
93 | return err
94 | }
95 | defer f.Close()
96 | fr = f
97 | }
98 |
99 | if df, err := dictgen.ParseDictFile(fr); err != nil {
100 | return err
101 | } else if err := df.Validate(); err != nil {
102 | return err
103 | } else {
104 | tdf = append(tdf, df...)
105 | }
106 |
107 | return nil
108 | }(); err != nil {
109 | fmt.Fprintf(os.Stderr, "Error: input %#v: %v.\n", fn, err)
110 | os.Exit(1)
111 | return
112 | }
113 | }
114 |
115 | if *removeFooter {
116 | fmt.Fprintf(os.Stderr, "Appending HTML code to remove entry footers (note: you don't need this and should not use it unless you are replacing a dictionary which adds it, such as the French one).\n")
117 | for _, dfe := range tdf {
118 | dfe.PostRawHTML += ``
119 | }
120 | }
121 |
122 | fmt.Fprintf(os.Stderr, "Opening output.\n")
123 | var f io.WriteCloser
124 | switch *output {
125 | case "-":
126 | f = os.Stdout
127 | default:
128 | ff, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
129 | if err != nil {
130 | fmt.Fprintf(os.Stderr, "Error: create dictzip: %v\n", err)
131 | os.Exit(1)
132 | return
133 | }
134 | f = ff
135 | }
136 |
137 | fmt.Fprintf(os.Stderr, "Generating dictzip.\n")
138 | dw := kobodict.NewWriter(f)
139 | dw.SetEncrypter(e)
140 | if e != nil {
141 | fmt.Fprintf(os.Stderr, " Using encryption.\n")
142 | }
143 | if ih != nil {
144 | fmt.Fprintf(os.Stderr, " Using image method: %s.\n", ih.Description())
145 | }
146 | if err := tdf.WriteDictzip(dw, ih, dictgen.ImageFuncFilesystem); err != nil {
147 | f.Close()
148 | fmt.Fprintf(os.Stderr, "Error: write dictzip: %v\n", err)
149 | os.Exit(1)
150 | return
151 | } else if err := dw.Close(); err != nil {
152 | f.Close()
153 | fmt.Fprintf(os.Stderr, "Error: write dictzip: %v\n", err)
154 | os.Exit(1)
155 | return
156 | } else if err := f.Close(); err != nil {
157 | fmt.Fprintf(os.Stderr, "Error: write dictzip: %v\n", err)
158 | os.Exit(1)
159 | return
160 | }
161 |
162 | fmt.Fprintf(os.Stderr, "Successfully wrote %d entries from %d dictfile(s) to dictzip %s.\n", len(tdf), pflag.NArg(), *output)
163 | os.Exit(0)
164 | }
165 |
--------------------------------------------------------------------------------
/cmd/dictutil/main.go:
--------------------------------------------------------------------------------
1 | // Command dictutil provides commands for installing, removing, unpacking,
2 | // packing, and performing low-level modifications and tests on Kobo
3 | // dictionaries.
4 | package main
5 |
6 | import (
7 | "fmt"
8 | "os"
9 | "sort"
10 |
11 | "github.com/spf13/pflag"
12 |
13 | _ "github.com/pgaskin/dictutil/kobodict/marisa"
14 | )
15 |
16 | var version = "dev"
17 |
18 | var commands []*command
19 |
20 | type command struct {
21 | Name string
22 | Short string
23 | Description string
24 | Main func(args []string, fs *pflag.FlagSet) int
25 | }
26 |
27 | func main() {
28 | sort.Slice(commands, func(i, j int) bool {
29 | return commands[i].Name < commands[j].Name
30 | })
31 |
32 | cmdMap := map[string]*command{}
33 | for _, cmd := range commands {
34 | for _, v := range []string{cmd.Name, cmd.Short} {
35 | if _, seen := cmdMap[v]; seen {
36 | panic("command already set: " + v)
37 | }
38 | cmdMap[v] = cmd
39 | }
40 | }
41 |
42 | if len(os.Args) < 2 {
43 | globalHelp()
44 | os.Exit(0)
45 | }
46 |
47 | if os.Args[1] == "help" {
48 | globalHelp()
49 | for _, cmd := range commands {
50 | fmt.Printf("\n### Help for %s:\n\n", cmd.Name)
51 | z := os.Args[0] + " " + cmd.Name
52 | cmd.Main([]string{z, "--help"}, pflag.NewFlagSet(z, pflag.ExitOnError))
53 | }
54 | } else if cmd, ok := cmdMap[os.Args[1]]; !ok {
55 | globalHelp()
56 | os.Exit(0)
57 | } else {
58 | args := append([]string{os.Args[0] + " " + os.Args[1]}, os.Args[2:]...)
59 | fs := pflag.NewFlagSet(args[0], pflag.ExitOnError)
60 | os.Exit(cmd.Main(args, fs))
61 | }
62 | }
63 |
64 | func globalHelp() {
65 | fmt.Fprintf(os.Stderr, "Usage: %s command [options] [arguments]\n\nDictutil provides low-level utilities to manipulate Kobo dictionaries (v2).\n\nVersion: dictutil %s\n\nCommands:\n", os.Args[0], version)
66 | for _, cmd := range commands {
67 | fmt.Fprintf(os.Stderr, " %-20s %s\n", fmt.Sprintf("%s (%s)", cmd.Name, cmd.Short), cmd.Description)
68 | }
69 | fmt.Fprintf(os.Stderr, " %-20s %s\n", "help", "Show help for all commands")
70 | fmt.Fprintf(os.Stderr, "\nOptions:\n -h, --help Show this help text\n")
71 | }
72 |
--------------------------------------------------------------------------------
/cmd/dictutil/pack.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/hex"
5 | "fmt"
6 | "io/ioutil"
7 | "os"
8 | "path/filepath"
9 | "runtime"
10 | "strings"
11 |
12 | "github.com/pgaskin/dictutil/kobodict"
13 | "github.com/spf13/pflag"
14 | )
15 |
16 | func init() {
17 | commands = append(commands, &command{Name: "pack", Short: "p", Description: "Pack a dictzip file", Main: packMain})
18 | }
19 |
20 | func packMain(args []string, fs *pflag.FlagSet) int {
21 | fs.SortFlags = false
22 | output := fs.StringP("output", "o", "dicthtml.zip", "The output dictzip filename (will be overwritten if it exists)")
23 | crypt := fs.StringP("crypt", "c", "", "Encrypt the dictzip using the specified encryption method (format: method:keyhex)")
24 | help := fs.BoolP("help", "h", false, "Show this help text")
25 | fs.Parse(args[1:])
26 |
27 | if *help || fs.NArg() != 1 {
28 | fmt.Fprintf(os.Stderr, "Usage: %s [options] dictdir\n\nOptions:\n%s", args[0], fs.FlagUsages())
29 | return 0
30 | }
31 |
32 | var c kobodict.Crypter
33 | if *crypt != "" {
34 | if spl := strings.SplitN(*crypt, ":", 2); len(spl) < 2 {
35 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: no ':' found.\n")
36 | return 2
37 | } else if key, err := hex.DecodeString(spl[1]); err != nil {
38 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: decode hex: %v.\n", err)
39 | return 2
40 | } else if enc, err := kobodict.NewCrypter(spl[0], key); err != nil {
41 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: initialize encrypter: %v.\n", err)
42 | return 2
43 | } else {
44 | c = enc
45 | }
46 | }
47 |
48 | fn, err := filepath.Abs(fs.Args()[0])
49 | if err != nil {
50 | fmt.Fprintf(os.Stderr, "Error: resolve input path %#v: %v.\n", fs.Args()[0], err)
51 | return 2
52 | }
53 |
54 | ofn, err := filepath.Abs(*output)
55 | if err != nil {
56 | fmt.Fprintf(os.Stderr, "Error: resolve output path %#v: %v.\n", *output, err)
57 | return 2
58 | }
59 |
60 | if fi, err := os.Stat(fn); err != nil {
61 | fmt.Fprintf(os.Stderr, "Error: inaccessible input dir %#v: %v.\n", fn, err)
62 | return 2
63 | } else if !fi.IsDir() {
64 | fmt.Fprintf(os.Stderr, "Error: input %#v is not a dir.\n", fn)
65 | return 2
66 | }
67 |
68 | fmt.Printf("Creating output temp file\n")
69 | f, err := ioutil.TempFile(filepath.Dir(ofn), "tmp_dicthtml.*.zip")
70 | if err != nil {
71 | fmt.Fprintf(os.Stderr, "Error: create output temp file: %v.\n", err)
72 | return 2
73 | }
74 | defer os.Remove(f.Name())
75 | defer f.Close()
76 |
77 | fmt.Printf("Packing dictzip.\n")
78 | dw := kobodict.NewWriter(f)
79 | defer dw.Close()
80 |
81 | dw.SetEncrypter(c)
82 |
83 | if err := kobodict.Pack(dw, fn); err != nil {
84 | fmt.Fprintf(os.Stderr, "Error: pack input dir %#v to %#v: %v.\n", fn, ofn, err)
85 | return 1
86 | }
87 |
88 | if err := dw.Close(); err != nil {
89 | fmt.Fprintf(os.Stderr, "Error: pack input dir %#v to %#v: %v.\n", fn, ofn, err)
90 | return 1
91 | }
92 |
93 | fmt.Printf("Renaming output file.\n")
94 | if err := f.Chmod(0644); err != nil && runtime.GOOS != "windows" {
95 | fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err)
96 | return 2
97 | }
98 | if err := f.Sync(); err != nil {
99 | fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err)
100 | return 2
101 | }
102 | if err := f.Close(); err != nil {
103 | fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err)
104 | return 2
105 | }
106 | if err := os.Rename(f.Name(), ofn); err != nil { // this will replace existing files properly on Go1.5+
107 | fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err)
108 | return 2
109 | }
110 |
111 | fmt.Printf("Successfully packed dictdir %#v to dictzip %#v.\n", fn, ofn)
112 | return 0
113 | }
114 |
--------------------------------------------------------------------------------
/cmd/dictutil/prefix.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | "github.com/pgaskin/dictutil/kobodict"
8 | "github.com/spf13/pflag"
9 | )
10 |
11 | func init() {
12 | commands = append(commands, &command{Name: "prefix", Short: "x", Description: "Calculate the prefix for a word", Main: prefixMain})
13 | }
14 |
15 | func prefixMain(args []string, fs *pflag.FlagSet) int {
16 | fs.SortFlags = false
17 | format := fs.StringP("format", "f", "json-array", "The output format (go-slice, go-map, csv, tsv, json-array, json-object)")
18 | help := fs.BoolP("help", "h", false, "Show this help text")
19 | fs.Parse(args[1:])
20 |
21 | if *help || fs.NArg() == 0 {
22 | fmt.Fprintf(os.Stderr, "Usage: %s [options] word...\n\nOptions:\n%s", args[0], fs.FlagUsages())
23 | return 0
24 | }
25 |
26 | if *format != "go-slice" && *format != "go-map" && *format != "csv" && *format != "tsv" && *format != "json-array" && *format != "json-object" {
27 | fmt.Fprintf(os.Stderr, "Error: invalid format %#v, see --help for more details.\n", *format)
28 | return 2
29 | }
30 |
31 | switch *format {
32 | case "go-slice":
33 | fmt.Printf("[][]string{\n")
34 | case "go-map":
35 | fmt.Printf("map[string]string{\n")
36 | case "csv", "tsv":
37 | break
38 | case "json-array":
39 | fmt.Printf("[\n")
40 | case "json-object":
41 | fmt.Printf("{\n")
42 | default:
43 | panic("invalid output format")
44 | }
45 |
46 | for i, word := range fs.Args() {
47 | prefix := kobodict.WordPrefix(word)
48 | last := i == fs.NArg()-1
49 |
50 | switch *format {
51 | case "go-slice":
52 | fmt.Printf("\t{%#v, %#v},\n", word, prefix)
53 | case "go-map":
54 | fmt.Printf("\t%#v: %#v,\n", word, prefix)
55 | case "csv":
56 | fmt.Printf("%s,%s\n", word, prefix)
57 | case "tsv":
58 | fmt.Printf("%s\t%s\n", word, prefix)
59 | case "json-array":
60 | fmt.Printf(" [%#v, %#v]", word, prefix)
61 | if last {
62 | fmt.Printf("\n")
63 | } else {
64 | fmt.Printf(",\n")
65 | }
66 | case "json-object":
67 | fmt.Printf(" %#v: %#v", word, prefix)
68 | if last {
69 | fmt.Printf("\n")
70 | } else {
71 | fmt.Printf(",\n")
72 | }
73 | default:
74 | panic("invalid output format")
75 | }
76 | }
77 |
78 | switch *format {
79 | case "csv", "tsv":
80 | break
81 | case "json-array":
82 | fmt.Printf("]\n")
83 | case "json-object", "go-slice", "go-map":
84 | fmt.Printf("}\n")
85 | default:
86 | panic("invalid output format")
87 | }
88 |
89 | return 0
90 | }
91 |
--------------------------------------------------------------------------------
/cmd/dictutil/uninstall.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bufio"
5 | "bytes"
6 | "database/sql"
7 | "fmt"
8 | "io"
9 | "net/http"
10 | "os"
11 | "path/filepath"
12 | "regexp"
13 | "sort"
14 | "strings"
15 |
16 | "github.com/pgaskin/koboutils/v2/kobo"
17 | "github.com/spf13/pflag"
18 | )
19 |
20 | func init() {
21 | commands = append(commands, &command{Name: "uninstall", Short: "U", Description: "Uninstall a dictzip file", Main: uninstallMain})
22 | }
23 |
24 | func uninstallMain(args []string, fs *pflag.FlagSet) int {
25 | fs.SortFlags = false
26 | root := fs.StringP("kobo", "k", "", "KOBOeReader path (default: automatically detected)")
27 | builtin := fs.StringP("builtin", "b", "normal", "How to handle built-in locales [normal = uninstall the same way as the UI] [delete = completely delete the entry (doesn't have any effect on 4.20.14601+)] [restore = download the original dictionary from Kobo again] (doesn't have any effect on 4.24.15672+)")
28 | noCustom := fs.BoolP("no-custom", "B", false, "Uninstall built-in dictionaries instead of custom ones on 4.24.15672+")
29 | help := fs.BoolP("help", "h", false, "Show this help text")
30 | fs.Parse(args[1:])
31 |
32 | if *help || fs.NArg() != 1 {
33 | fmt.Fprintf(os.Stderr, "Usage: %s [options] locale|dicthtml-name.zip\n\nOptions:\n%s\n", args[0], fs.FlagUsages())
34 | builtinHelp()
35 | return 0
36 | }
37 |
38 | if *builtin != "normal" && *builtin != "delete" && *builtin != "restore" {
39 | fmt.Fprintf(os.Stderr, "Error: invalid built-in dictionary mode %#v, see --help for more details.\n", *builtin)
40 | return 2
41 | }
42 |
43 | kobopath, version, err := findDevice(*root)
44 | if err != nil {
45 | fmt.Fprintf(os.Stderr, "Error: could not detect a Kobo eReader (you can specify one manually with --kobo): %v.\n", err)
46 | return 1
47 | }
48 |
49 | fmt.Printf("Found Kobo eReader at %s with firmware version %s.\n", kobopath, version)
50 | if kobo.VersionCompare(version, "4.7.10364") < 0 {
51 | fmt.Fprintf(os.Stderr, "Error: firmware version too old (v2 dictionaries were only introduced in 4.7.10364).\n")
52 | return 1
53 | }
54 |
55 | fw14601 := kobo.VersionCompare(version, "4.20.14601") >= 0 // https://github.com/pgaskin/kobopatch-patches/issues/49
56 | fw15672 := kobo.VersionCompare(version, "4.24.15672") >= 0 // https://github.com/pgaskin/kobopatch-patches/issues/76
57 |
58 | var dictPath, dictLocale string
59 | if dictLocale = strings.TrimLeft(fs.Args()[0], "-"); dictLocale == "en" {
60 | if fw15672 && !*noCustom {
61 | dictPath = filepath.Join(kobopath, ".kobo", "custom-dict", "dicthtml.zip")
62 | } else {
63 | dictPath = filepath.Join(kobopath, ".kobo", "dict", "dicthtml.zip")
64 | }
65 | } else if regexp.MustCompile(`^[a-zA-Z0-9-]+$`).MatchString(dictLocale) {
66 | if fw15672 && !*noCustom {
67 | dictPath = filepath.Join(kobopath, ".kobo", "custom-dict", "dicthtml-"+dictLocale+".zip")
68 | } else {
69 | dictPath = filepath.Join(kobopath, ".kobo", "dict", "dicthtml-"+dictLocale+".zip")
70 | }
71 | } else {
72 | fmt.Fprintf(os.Stderr, "Error: invalid locale name.\n")
73 | return 1
74 | }
75 | dictSuffix := "-" + dictLocale
76 | _, dictBuiltin := builtinDict[dictLocale]
77 |
78 | fmt.Printf("Uninstalling dictionary %#v (locale: %s).\n\n", dictPath, dictLocale)
79 |
80 | fmt.Printf("Updating database.\n")
81 | if fw15672 {
82 | // We won't bother to check the DB anymore since it's been a while since
83 | // 4.20.14601, and everyone who would be confused by the dictionary
84 | // table probably would have already seen the message.
85 | fmt.Printf(" No need to update dictionary table on 4.24.15672+, skipping.\n")
86 | } else {
87 | if err := func() error {
88 | db, err := sql.Open("sqlite3", filepath.Join(kobopath, ".kobo", "KoboReader.sqlite"))
89 | if err != nil {
90 | return fmt.Errorf("open database: %w", err)
91 | }
92 | defer db.Close()
93 |
94 | if exists, err := func() (bool, error) {
95 | res, err := db.Query(`SELECT name FROM sqlite_master WHERE type="table" AND name="Dictionary";`)
96 | if err != nil {
97 | return false, fmt.Errorf("check dictionary table: %w", err)
98 | }
99 | defer res.Close()
100 |
101 | if !res.Next() { // if no rows are returned, there was an error or the table didn't exist
102 | if err := res.Err(); err != nil {
103 | return false, fmt.Errorf("check dictionary table: %w", err)
104 | }
105 | return false, nil
106 | }
107 | return true, nil
108 | }(); err != nil {
109 | return fmt.Errorf("check dictionary table: %w", err)
110 | } else if exists {
111 | if fw14601 {
112 | fmt.Printf(" Note: the dictionary table is unnecessary and inconsequential in firmware 4.20.14601+ and can be safely removed.\n")
113 | }
114 | } else {
115 | if fw14601 {
116 | // show a message to prevent confusion
117 | fmt.Printf(" No need to update dictionary table on 4.20.14601+, skipping.\n")
118 | return nil
119 | } else {
120 | return fmt.Errorf("check dictionary table: not found, and version < 4.20.14123")
121 | }
122 | }
123 |
124 | if !dictBuiltin || *builtin == "delete" {
125 | if res, err := db.Exec("DELETE FROM Dictionary WHERE Suffix = ?", dictSuffix); err != nil {
126 | return fmt.Errorf("delete row from database: %w", err)
127 | } else if ra, _ := res.RowsAffected(); ra == 0 {
128 | fmt.Printf(" Row already removed from database (suffix=%s).\n", dictSuffix)
129 | } else {
130 | fmt.Printf(" Removed row from database (suffix=%s).\n", dictSuffix)
131 | }
132 | }
133 |
134 | if dictBuiltin && *builtin == "normal" {
135 | if _, err := db.Exec("UPDATE Dictionary SET Installed = ? WHERE Suffix = ?", "false", dictSuffix); err != nil {
136 | return fmt.Errorf("update row in database: %w", err)
137 | } else {
138 | fmt.Printf(" Set IsInstalled to false in database for built-in dictionary (suffix=%s).\n", dictSuffix)
139 | }
140 | }
141 |
142 | if dictBuiltin && *builtin == "restore" {
143 | if _, err := db.Exec("UPDATE Dictionary SET Installed = ? WHERE Suffix = ?", "true", dictSuffix); err != nil {
144 | return fmt.Errorf("update row in database: %w", err)
145 | } else {
146 | fmt.Printf(" Set IsInstalled to true in database for built-in dictionary (suffix=%s).\n", dictSuffix)
147 | }
148 | }
149 |
150 | if err := db.Close(); err != nil {
151 | return fmt.Errorf("close database: %w", err)
152 | }
153 |
154 | return nil
155 | }(); err != nil {
156 | fmt.Fprintf(os.Stderr, "Error: update database: %v.\n", err)
157 | return 1
158 | }
159 | }
160 |
161 | fmt.Printf("Updating ExtraLocales.\n")
162 | if dictBuiltin {
163 | fmt.Printf(" No need; built-in dictionary.\n")
164 | } else {
165 | if err := func() error {
166 | cfg := filepath.Join(kobopath, ".kobo", "Kobo", "Kobo eReader.conf")
167 |
168 | f, err := os.OpenFile(cfg, os.O_RDONLY, 0)
169 | if err != nil {
170 | return fmt.Errorf("open config file: %w", err)
171 | }
172 | defer f.Close()
173 |
174 | var locales []string
175 | var filtered bool
176 | buf := bytes.NewBuffer(nil)
177 |
178 | fs := bufio.NewScanner(f)
179 | for fs.Scan() {
180 | if bytes.HasPrefix(fs.Bytes(), []byte("ExtraLocales=")) {
181 | for _, loc := range strings.Split(strings.SplitN(fs.Text(), "=", 2)[1], ",") {
182 | loc = strings.TrimSpace(loc)
183 | if loc == dictLocale {
184 | filtered = true
185 | } else {
186 | locales = append(locales, loc)
187 | }
188 | }
189 | continue
190 | }
191 | _, _ = buf.Write(fs.Bytes()) // err is always nil
192 | buf.WriteRune('\n')
193 | }
194 |
195 | if !filtered {
196 | fmt.Printf(" Locale %#v already removed from ExtraLocales (or wasn't there to begin with).\n", dictLocale)
197 | return nil
198 | }
199 |
200 | fmt.Printf(" Removing locale %#v from ExtraLocales.\n", dictLocale)
201 | sort.Strings(locales)
202 |
203 | buf.WriteString("\n[ApplicationPreferences]\n") // this will get merged by Qt
204 | buf.WriteString("ExtraLocales=" + strings.Join(locales, ","))
205 |
206 | f.Close()
207 |
208 | fo, err := os.OpenFile(cfg+".tmp", os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
209 | if err != nil {
210 | return fmt.Errorf("open new config file: %w", err)
211 | }
212 | defer os.Remove(cfg + ".tmp")
213 | defer fo.Close()
214 |
215 | if _, err := fo.Write(buf.Bytes()); err != nil {
216 | return fmt.Errorf("write new config file: %w", err)
217 | }
218 |
219 | if err := fo.Sync(); err != nil {
220 | return fmt.Errorf("write new config file: %w", err)
221 | }
222 |
223 | if err := fo.Close(); err != nil {
224 | return fmt.Errorf("write new config file: %w", err)
225 | }
226 |
227 | if err := os.Rename(cfg+".tmp", cfg); err != nil {
228 | return fmt.Errorf("rename new config file: %w", err)
229 | }
230 |
231 | return nil
232 | }(); err != nil {
233 | fmt.Fprintf(os.Stderr, "Error: update ExtraLocales: %v.\n", err)
234 | return 1
235 | }
236 | }
237 |
238 | fmt.Printf("Removing dictzip.\n")
239 | if err := os.Remove(dictPath); os.IsNotExist(err) { // this will still remove it if it's readonly on Windows (golang/go@2ffb3e5d905b5622204d199128dec06cefd57790)
240 | fmt.Printf(" Already removed.\n")
241 | } else if err != nil {
242 | fmt.Fprintf(os.Stderr, "Error: remove dictzip: %v.\n", err)
243 | return 1
244 | } else {
245 | fmt.Printf(" Removed.\n")
246 | }
247 |
248 | if *builtin == "restore" {
249 | // TODO: reconsider whether this belongs in uninstall, as:
250 | // - This doesn't update the file size.
251 | // - This doesn't ensure there is actually a DB entry for the restored
252 | // dict.
253 | // - This isn't really uninstalling.
254 | // - It might not even belong in dictutil at all because the URLs may
255 | // change (and it isn't that hard to manually download a dictionary
256 | // to install it with dictutil install)
257 |
258 | url := "https://kbdownload1-a.akamaihd.net/ereader/dictionaries/v2/"
259 | if fw15672 {
260 | url = "https://kbdownload1-a.akamaihd.net/ereader/dictionaries/v3/"
261 | }
262 | url += filepath.Base(dictPath)
263 |
264 | fmt.Printf("Restoring original dictionary from %#v.\n", url)
265 |
266 | if err := func() error {
267 | resp, err := http.Get(url)
268 | if err != nil {
269 | return fmt.Errorf("get dictionary: %w", err)
270 | }
271 | defer resp.Body.Close()
272 |
273 | if resp.StatusCode != http.StatusOK {
274 | return fmt.Errorf("get dictionary: response status %s", resp.Status)
275 | }
276 |
277 | df, err := os.OpenFile(dictPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
278 | if err != nil {
279 | return fmt.Errorf("open output dictzip: %w", err)
280 | }
281 | defer df.Close()
282 |
283 | if _, err := io.Copy(df, resp.Body); err != nil {
284 | return fmt.Errorf("write output dictzip: %w", err)
285 | }
286 |
287 | if err := df.Close(); err != nil {
288 | return fmt.Errorf("write output dictzip: %w", err)
289 | }
290 |
291 | return nil
292 | }(); err != nil {
293 | fmt.Fprintf(os.Stderr, "Error: download dictionary: %v.\n", err)
294 | return 1
295 | }
296 | }
297 |
298 | fmt.Printf("\nSuccessfully uninstalled dictionary for locale %s.\n", dictLocale)
299 |
300 | return 0
301 | }
302 |
--------------------------------------------------------------------------------
/cmd/dictutil/unpack.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/hex"
5 | "fmt"
6 | "os"
7 | "path/filepath"
8 | "strings"
9 |
10 | "github.com/pgaskin/dictutil/kobodict"
11 | "github.com/spf13/pflag"
12 | )
13 |
14 | func init() {
15 | commands = append(commands, &command{Name: "unpack", Short: "u", Description: "Unpack a dictzip file", Main: unpackMain})
16 | }
17 |
18 | func unpackMain(args []string, fs *pflag.FlagSet) int {
19 | fs.SortFlags = false
20 | output := fs.StringP("output", "o", "", "The output directory (must not exist) (default: the basename of the input without the extension)")
21 | crypt := fs.StringP("crypt", "c", "", "Decrypt the dictzip (if needed) using the specified encryption method (format: method:keyhex)")
22 | help := fs.BoolP("help", "h", false, "Show this help text")
23 | fs.Parse(args[1:])
24 |
25 | if *help || fs.NArg() != 1 {
26 | fmt.Fprintf(os.Stderr, "Usage: %s [options] dictzip\n\nOptions:\n%s", args[0], fs.FlagUsages())
27 | return 0
28 | }
29 |
30 | var c kobodict.Crypter
31 | if *crypt != "" {
32 | if spl := strings.SplitN(*crypt, ":", 2); len(spl) < 2 {
33 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: no ':' found.\n")
34 | return 2
35 | } else if key, err := hex.DecodeString(spl[1]); err != nil {
36 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: decode hex: %v.\n", err)
37 | return 2
38 | } else if enc, err := kobodict.NewCrypter(spl[0], key); err != nil {
39 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: initialize encrypter: %v.\n", err)
40 | return 2
41 | } else {
42 | c = enc
43 | }
44 | }
45 |
46 | fn, err := filepath.Abs(fs.Args()[0])
47 | if err != nil {
48 | fmt.Fprintf(os.Stderr, "Error: resolve input path %#v: %v.\n", fs.Args()[0], err)
49 | return 2
50 | }
51 |
52 | ofn := *output
53 | if ofn == "" {
54 | ofn = strings.TrimSuffix(filepath.Base(fn), filepath.Ext(fn))
55 | }
56 |
57 | fmt.Printf("Opening input dictzip.\n")
58 | f, err := os.Open(fn)
59 | if err != nil {
60 | fmt.Fprintf(os.Stderr, "Error: open input file %#v: %v.\n", fn, err)
61 | return 1
62 | }
63 | defer f.Close()
64 |
65 | s, err := f.Stat()
66 | if err != nil {
67 | fmt.Fprintf(os.Stderr, "Error: stat input file %#v: %v.\n", fn, err)
68 | return 1
69 | }
70 |
71 | fmt.Printf("Parsing dictzip.\n")
72 | dr, err := kobodict.NewReader(f, s.Size())
73 | if err != nil {
74 | fmt.Fprintf(os.Stderr, "Error: parse input file %#v: %v.\n", fn, err)
75 | return 1
76 | }
77 | dr.SetDecrypter(c)
78 |
79 | fmt.Printf("Unpacking dictzip.\n")
80 | if err := kobodict.Unpack(dr, ofn); err != nil {
81 | fmt.Fprintf(os.Stderr, "Error: unpack input file %#v to %#v: %v.\n", fn, ofn, err)
82 | return 1
83 | }
84 |
85 | fmt.Printf("Successfully unpacked dictzip %#v to dictdir %#v.\n", fn, ofn)
86 | return 0
87 | }
88 |
--------------------------------------------------------------------------------
/dictgen/dictfile.go:
--------------------------------------------------------------------------------
1 | package dictgen
2 |
3 | import (
4 | "bufio"
5 | "fmt"
6 | "io"
7 | "strings"
8 | "text/template"
9 | )
10 |
11 | // A DictFile is a high-level representation of a Kobo dictionary.
12 | type DictFile []*DictFileEntry
13 |
14 | // DictFileEntry represents a single entry in the DictFile.
15 | type DictFileEntry struct {
16 | Headword string
17 | Variant []string
18 |
19 | NoHeader bool
20 | HeaderInfo string
21 |
22 | RawHTML bool
23 | Definition string
24 |
25 | PostRawHTML string // will not be parsed or saved, only to be used for runtime additions before generating
26 |
27 | line int // for internal use if parsed, zero otherwise
28 | }
29 |
30 | // ParseDictFile parses a DictFile from its textual representation (usually
31 | // stored in a file with the extension .df).
32 | func ParseDictFile(r io.Reader) (DictFile, error) {
33 | var df DictFile
34 | var dfe *DictFileEntry
35 |
36 | br := bufio.NewScanner(r)
37 | br.Buffer(make([]byte, 64*1024), 2048*1024) // start with a 64KiB buffer, but allow up to 2MiB (for dictfiles with long lines of raw HTML)
38 | var line int
39 |
40 | for br.Scan() {
41 | buf := br.Bytes()
42 | line++
43 |
44 | if len(buf) == 0 {
45 | // if in a block and after the metadata (in the definition),
46 | // preserve the blank line
47 | if dfe != nil && len(dfe.Definition) != 0 {
48 | dfe.Definition += "\n"
49 | }
50 | continue
51 | }
52 |
53 | switch buf[0] {
54 | case '@':
55 | // start another one
56 | dfe = new(DictFileEntry)
57 |
58 | // add the headword and line info
59 | dfe.Headword = strings.TrimSpace(string(buf[1:]))
60 | dfe.line = line
61 |
62 | // but error if the headword is blank (note that duplicates are
63 | // acceptable, and encouraged in some cases; Kobo will merge it;
64 | // try looking up 'be' in the English dictionary)
65 | if len(dfe.Headword) == 0 {
66 | return nil, fmt.Errorf("dictfile: line %d: empty headword after @", line)
67 | }
68 |
69 | // otherwise, add it to the dictfile (remember it's a pointer, it'll
70 | // still get updated)
71 | df = append(df, dfe)
72 | case ':':
73 | // if not in a block (before the first @), return an error
74 | if dfe == nil {
75 | return nil, fmt.Errorf("dictfile: line %d: header info (: or ::) specified before word (@)", line)
76 | }
77 |
78 | // if already after the metadata (in the definition), return an error
79 | if len(dfe.Definition) != 0 {
80 | return nil, fmt.Errorf("dictfile: line %d: header info (: or ::) specified within definition content (prepend a space if this was intended to be part of the definition itself)", line)
81 | }
82 |
83 | // if already seen the header info (a line starting with :)
84 | if dfe.NoHeader || len(dfe.HeaderInfo) != 0 {
85 | return nil, fmt.Errorf("dictfile: line %d: multiple header infos (: or ::) specified in definition block", line)
86 | }
87 |
88 | // put the trimmed text in the header info, or disable the header if
89 | // it is ::
90 | if len(buf) >= 2 {
91 | if buf[1] == ':' {
92 | if len(strings.TrimSpace(string(buf[2:]))) != 0 {
93 | return nil, fmt.Errorf("dictfile: line %d: extra data after no header specified (::)", line)
94 | }
95 | dfe.NoHeader = true
96 | } else {
97 | dfe.HeaderInfo = strings.TrimSpace(string(buf[1:]))
98 | }
99 | } else {
100 | dfe.HeaderInfo = ""
101 | }
102 | case '&':
103 | // if not in a block, error
104 | if dfe == nil {
105 | return nil, fmt.Errorf("dictfile: line %d: variant (&) specified before word (@)", line)
106 | }
107 |
108 | // if already after the metadata (in the definition), error
109 | if len(dfe.Definition) != 0 {
110 | return nil, fmt.Errorf("dictfile: line %d: variant (&) specified within definition content (prepend a space if this was intended to be part of the definition itself)", line)
111 | }
112 |
113 | // trim the rest of the line (error if nothing left)
114 | v := strings.TrimSpace(string(buf[1:]))
115 | if len(v) == 0 {
116 | return nil, fmt.Errorf("dictfile: line %d: no word after variant specifier (&)", line)
117 | }
118 |
119 | // and add it to the variant list
120 | dfe.Variant = append(dfe.Variant, v)
121 | default:
122 | // if not in a block, error
123 | if dfe == nil {
124 | return nil, fmt.Errorf("dictfile: line %d: definition specified before word (@)", line)
125 | }
126 |
127 | // append the line to the definition
128 | dfe.Definition += string(buf) + "\n"
129 | }
130 | }
131 |
132 | // check for read errors
133 | if err := br.Err(); err != nil {
134 | return nil, err
135 | }
136 |
137 | // and finally, update the raw html flag and cleanup whitespace
138 | for _, dfe := range df {
139 | dfe.Definition = strings.TrimSpace(dfe.Definition)
140 |
141 | if v := strings.TrimSpace(strings.TrimPrefix(dfe.Definition, "<html>")); v != dfe.Definition {
142 | if strings.HasSuffix(v, "</html>") {
143 | return nil, fmt.Errorf("dictfile: entry at line %d: raw HTML definitions are specified with <html>, but SHOULD NOT be a full HTML document ending with </html>", dfe.line)
144 | }
145 | dfe.RawHTML = true
146 | dfe.Definition = v
147 | } else if strings.Contains(dfe.Definition, "<html>") {
148 | return nil, fmt.Errorf("dictfile: entry at line %d: why does the definition contain a <html> tag ... to make it raw HTML, it should be at the very beginning", dfe.line)
149 | }
150 | }
151 |
152 | // note: validation is done separately (and always done before generation)
153 |
154 | return df, nil
155 | }
156 |
157 | // Validate validates the entries in the DictFile. Note that duplicate entries
158 | // are fine, and are encouraged if necessary (Kobo will merge them).
159 | func (df DictFile) Validate() error {
160 | illegal := func(s string, word bool) error {
161 | if word && strings.Contains(s, "\"") {
162 | return fmt.Errorf("must not contain %#v", "\"")
163 | }
164 | for _, c := range []string{
165 | "{{end -}}
243 |
244 | {{with .Definition}}
245 | {{dfesc .}}{{end -}}
246 |
247 | {{- /* keep trailing newline at end of template */}}
248 | `))
249 |
250 | func (d DictFileEntry) writeDictFileEntry(w io.Writer) error {
251 | return dictFileEntryTmpl.Execute(w, d)
252 | }
253 |
--------------------------------------------------------------------------------
/dictgen/dictfile_test.go:
--------------------------------------------------------------------------------
1 | package dictgen
2 |
3 | import (
4 | "bytes"
5 | "encoding/json"
6 | "fmt"
7 | "reflect"
8 | "sort"
9 | "strings"
10 | "testing"
11 | )
12 |
13 | type testcase struct {
14 | What string
15 |
16 | In string
17 | Err error
18 |
19 | Out DictFile
20 |
21 | OutDictFile string
22 | OutKoboHTML string
23 | }
24 |
25 | // TODO(v1): more specific tests
26 | var testcases = []testcase{{
27 | What: "some of everything",
28 | In: `@ blank
29 |
30 | @ headword
31 | : info
32 | & variant1
33 | &variant2
34 | test
35 | test
36 |
37 | @ custom
38 | & NORMALIZEME
39 | ::
40 |
41 | custom word:
42 | test
43 | @ markdown
44 | :-test
45 | 1. Definition point 1.
46 | - Blah
47 | - Blah
48 | 2. Blah blah blah.
49 | 3. Blah *blah* **blah**!
50 |
51 | Blah blah blah.`,
52 | Out: DictFile{
53 | {Headword: "blank", Variant: []string(nil), NoHeader: false, HeaderInfo: "", RawHTML: false, Definition: "", line: 1},
54 | {Headword: "headword", Variant: []string{"variant1", "variant2"}, NoHeader: false, HeaderInfo: "info", RawHTML: false, Definition: "test\ntest", line: 3},
55 | {Headword: "custom", Variant: []string{"NORMALIZEME"}, NoHeader: true, HeaderInfo: "", RawHTML: true, Definition: "custom word:\ntest
", line: 10},
56 | {Headword: "markdown", Variant: []string(nil), NoHeader: false, HeaderInfo: "-test", RawHTML: false, Definition: "1. Definition point 1.\n - Blah\n - Blah\n2. Blah blah blah.\n3. Blah *blah* **blah**!\n\nBlah blah blah.", line: 16},
57 | },
58 | OutDictFile: `@ blank
59 |
60 | @ headword
61 | : info
62 | & variant1
63 | & variant2
64 | test
65 | test
66 |
67 | @ custom
68 | ::
69 | & NORMALIZEME
70 |
71 | custom word:
72 | test
73 |
74 | @ markdown
75 | : -test
76 | 1. Definition point 1.
77 | - Blah
78 | - Blah
79 | 2. Blah blah blah.
80 | 3. Blah *blah* **blah**!
81 |
82 | Blah blah blah.
83 |
84 | `,
85 | OutKoboHTML: `blank
custom word:
86 | test
headword info
test
87 | test
markdown -test
88 | - Definition point 1.
89 |
90 |
94 | - Blah blah blah.
95 | - Blah blah blah!
96 |
97 |
98 | Blah blah blah.
`,
99 | }}
100 |
101 | func TestDictFile(t *testing.T) {
102 | for _, tc := range testcases {
103 | t.Logf("case %#v", tc.What)
104 |
105 | df, err := ParseDictFile(strings.NewReader(tc.In))
106 | if tc.Err == nil && err != nil {
107 | t.Fatalf("case %#v: parse dictfile: unexpected error: %v", tc.What, err)
108 | } else if tc.Err != nil && err == nil {
109 | t.Fatalf("case %#v: parse dictfile: expected error (%v)", tc.What, tc.Err)
110 | } else if tc.Err != nil && tc.Err.Error() != err.Error() {
111 | t.Fatalf("case %#v: parse dictfile: expected error (%v), got: %v", tc.What, tc.Err, err)
112 | }
113 |
114 | exp, err := json.MarshalIndent(tc.Out, "| ", " ")
115 | if err != nil {
116 | panic(err)
117 | }
118 |
119 | act, err := json.MarshalIndent(df, "| ", " ")
120 | if err != nil {
121 | panic(err)
122 | }
123 |
124 | if !reflect.DeepEqual(exp, act) {
125 | for _, dfe := range df {
126 | fmt.Printf("%#v,\n", dfe)
127 | }
128 | t.Fatalf("case %#v: expected:\n%s\n\ngot:\n%s", tc.What, exp, act)
129 | }
130 |
131 | buf := bytes.NewBuffer(nil)
132 | if err := df.WriteDictFile(buf); err != nil {
133 | t.Fatalf("case %#v: write dictfile: unexpected error: %v", tc.What, err)
134 | } else if tc.OutDictFile != buf.String() {
135 | fmt.Printf("expected:\n`%s`\n\ngot:\n`%s`", tc.OutDictFile, buf.String())
136 | t.Fatalf("case %#v: unexpected dictfile output", tc.What)
137 | }
138 |
139 | pdf, err := ParseDictFile(buf)
140 | if err != nil {
141 | t.Fatalf("case %#v: reparse written dictfile: unexpected error: %v", tc.What, err)
142 | }
143 | sort.Slice(pdf, func(i, j int) bool {
144 | return pdf[i].Headword < pdf[j].Headword
145 | })
146 | edf := df[:]
147 | sort.Slice(edf, func(i, j int) bool {
148 | return edf[i].Headword < edf[j].Headword
149 | })
150 | if jpdf, err := json.Marshal(pdf); err != nil {
151 | panic(err)
152 | } else if jedf, err := json.Marshal(edf); err != nil {
153 | panic(err)
154 | } else if !reflect.DeepEqual(jpdf, jedf) {
155 | t.Fatalf("case %#v: reparse written dictfile: differs from original (orig:%s) (reparsed:%s)", tc.What, jedf, jpdf)
156 | }
157 |
158 | buf.Reset()
159 | if err := df.WriteKoboHTML(buf); err != nil {
160 | t.Fatalf("case %#v: write kobo html: unexpected error: %v", tc.What, err)
161 | } else if tc.OutKoboHTML != buf.String() {
162 | fmt.Printf("expected:\n`%s`\n\ngot:\n`%s`", tc.OutKoboHTML, buf.String())
163 | t.Fatalf("case %#v: unexpected kobo html output", tc.What)
164 | }
165 | }
166 | }
167 |
--------------------------------------------------------------------------------
/dictgen/dictgen.go:
--------------------------------------------------------------------------------
1 | // Package dictgen simplifies creating full-featured dictionaries for Kobo
2 | // eReaders, with support for images, unicode prefixes, raw html, markdown, and
3 | // more.
4 | //
5 | // A marisa implementation must be provided by
6 | // github.com/pgaskin/kobodict/marisa or a custom one for this package to work.
7 | package dictgen
8 |
9 | import (
10 | "bytes"
11 | "fmt"
12 | "io"
13 | "sort"
14 | "strings"
15 | "text/template"
16 |
17 | "github.com/pgaskin/dictutil/kobodict"
18 | "github.com/russross/blackfriday/v2"
19 | )
20 |
21 | // WriteDictzip writes the dictfile to a kobodict.Writer, which should not have
22 | // been used yet. The writer is not closed automatically. If the ImageHandler
23 | // requires a file to be opened (i.e. not ImageHandlerRemove), the provided
24 | // ImageFunc will be called.
25 | func (df DictFile) WriteDictzip(dw *kobodict.Writer, ih ImageHandler, img ImageFunc) error {
26 | var prefixes []string
27 | prefixed := df.Prefixed()
28 | for pfx := range prefixed {
29 | prefixes = append(prefixes, pfx)
30 | }
31 | sort.Strings(prefixes)
32 |
33 | hbuf := bytes.NewBuffer(nil)
34 | for _, pfx := range prefixes {
35 | for _, dfe := range prefixed[pfx] {
36 | if err := dw.AddWord(dfe.Headword); err != nil {
37 | return fmt.Errorf("add word %#v: %w", dfe.Headword, err)
38 | }
39 | for _, v := range dfe.Variant {
40 | if err := dw.AddWord(v); err != nil {
41 | return fmt.Errorf("add variant %#v: %w", v, err)
42 | }
43 | }
44 | }
45 | hbuf.Reset()
46 | if err := prefixed[pfx].WriteKoboHTML(hbuf); err != nil {
47 | return fmt.Errorf("generate dicthtml for %s: %w", pfx, err)
48 | } else if buf, err := transformHTMLImages(ih, dw, hbuf.Bytes(), img); err != nil {
49 | return fmt.Errorf("generate dicthtml for %s: transform images: %w", pfx, err)
50 | } else if hw, err := dw.CreateDicthtml(pfx); err != nil {
51 | return fmt.Errorf("write dicthtml for %s: %w", pfx, err)
52 | } else if _, err = hw.Write(buf); err != nil {
53 | return fmt.Errorf("write dicthtml for %s: %w", pfx, err)
54 | }
55 | }
56 |
57 | return nil
58 | }
59 |
60 | // Prefixed shards the DictFile into the different word prefixes. The original
61 | // DictFile is unchanged, but the entries are still pointers to the originals
62 | // (i.e. the result will become out of date if you modify the entries).
63 | //
64 | // The DictFile is not validated.
65 | //
66 | // If a variant has a different prefix, the entire entry is duplicated as
67 | // necessary.
68 | func (df DictFile) Prefixed() map[string]DictFile {
69 | prefixed := map[string]DictFile{}
70 | for _, dfe := range df {
71 | pfx := map[string]bool{}
72 |
73 | pfx[kobodict.WordPrefix(dfe.Headword)] = true
74 | for _, v := range dfe.Variant {
75 | pfx[kobodict.WordPrefix(v)] = true
76 | }
77 |
78 | for p := range pfx {
79 | prefixed[p] = append(prefixed[p], dfe)
80 | }
81 | }
82 | return prefixed
83 | }
84 |
85 | // WriteKoboHTML validates the DictFile and writes it to w in the dicthtml
86 | // format.
87 | func (df DictFile) WriteKoboHTML(w io.Writer) error {
88 | if err := df.Validate(); err != nil {
89 | return err
90 | }
91 |
92 | // must be sorted for proper matching
93 | dfs := df[:]
94 | sort.Slice(dfs, func(i int, j int) bool {
95 | return dfs[i].Headword < dfs[j].Headword
96 | })
97 |
98 | if _, err := w.Write([]byte("")); err != nil {
99 | return err
100 | }
101 | for _, dfe := range dfs {
102 | if err := dfe.writeKoboHTML(w); err != nil {
103 | return err
104 | }
105 | }
106 | if _, err := w.Write([]byte("")); err != nil {
107 | return err
108 | }
109 |
110 | return nil
111 | }
112 |
113 | // note: we don't want the html/template escaping, this isn't actually proper
114 | // html, and also, the whitespaces in the end tags should stay EXACTLY as is
115 | // (yes, I know there is a space before the end of the a but not the variant) to
116 | // provide the best possible matches against the regexps Kobo uses. Also, the
117 | // output should not have any newlines. Also, keep in mind headwords can have
118 | // unescaped html tags in it, and they will be rendered properly by Kobo.
119 | var koboHTMLTmpl = template.Must(template.New("").Funcs(template.FuncMap{
120 | "md": func(md string) string {
121 | return strings.TrimSpace(string(blackfriday.Run([]byte(md))))
122 | },
123 | "normhw": func(headword string) string {
124 | return kobodict.NormalizeWordReference(headword, false)
125 | },
126 | "normv": func(variant string) string {
127 | return kobodict.NormalizeWordReference(variant, true)
128 | },
129 | }).Parse(`
130 | {{- /* trim */ -}}
131 |
132 |
133 | {{- if .NoHeader -}}
134 |
135 | {{- else -}}
136 | {{.Headword}}{{with .HeaderInfo}} {{.}}{{end}}
137 | {{- end -}}
138 |
139 | {{- range .Variant -}}
140 |
141 | {{- end -}}
142 |
143 | {{- with .Definition -}}
144 | {{- if $.RawHTML -}}
145 | {{.}}
146 | {{- else -}}
147 | {{md .}}
148 | {{- end -}}
149 | {{- end -}}
150 | {{- with .PostRawHTML -}}
151 | {{.}}
152 | {{- end -}}
153 |
154 |
155 | {{- /* trim */ -}}
156 | `))
157 |
158 | func (d DictFileEntry) writeKoboHTML(w io.Writer) error {
159 | return koboHTMLTmpl.Execute(w, d)
160 | }
161 |
--------------------------------------------------------------------------------
/dictgen/image.go:
--------------------------------------------------------------------------------
1 | package dictgen
2 |
3 | import (
4 | "bytes"
5 | "crypto/sha1"
6 | "encoding/base64"
7 | "fmt"
8 | "image"
9 | "io"
10 | "math"
11 | "os"
12 | "path/filepath"
13 | "regexp"
14 | "strings"
15 |
16 | "github.com/disintegration/imaging"
17 | "github.com/pgaskin/dictutil/kobodict"
18 | )
19 |
20 | // ImageFunc reads an image from the path (it may be absolute or relative) src,
21 | // and returns an io.Reader for the image contents. If the returned reader
22 | // implements io.Closer, it will automatically be called after the image has
23 | // been processed.
24 | type ImageFunc func(src string) (io.Reader, error)
25 |
26 | // ImageFuncFilesystem loads an image from the filesystem. If src is relative,
27 | // it is resolved relative to the current dir.
28 | func ImageFuncFilesystem(src string) (io.Reader, error) {
29 | rsrc, err := filepath.Abs(src)
30 | if err != nil {
31 | return nil, fmt.Errorf("resolve path %#v: %w", src, err)
32 | }
33 | f, err := os.Open(rsrc)
34 | if err != nil {
35 | return nil, fmt.Errorf("open image file %#v (resolved from %#v): %w", rsrc, src, err)
36 | }
37 | return f, nil // f will be closed by transformHTMLImages
38 | }
39 |
40 | // ImageHandler transforms images referenced in a DictFile.
41 | type ImageHandler interface {
42 | // Transform transforms an image read from ir, and returns a new value for
43 | // the img tag's src attribute. As a special case, if an empty string is
44 | // returned and the error is nil, the image tag is removed entirely. In
45 | // addition, custom CSS (which must not contain any double quotes) can be
46 | // returned to be set on the img tag.
47 | Transform(src string, ir io.Reader, dw *kobodict.Writer) (nsrc string, css string, err error)
48 |
49 | // Description returns a human-readable description of what the handler does.
50 | Description() string
51 | }
52 |
53 | // ImageHandlerRemove removes images from the dicthtml.
54 | type ImageHandlerRemove struct{}
55 |
56 | // Transform implements ImageHandler.
57 | func (*ImageHandlerRemove) Transform(string, io.Reader, *kobodict.Writer) (string, string, error) {
58 | return "", "", nil
59 | }
60 |
61 | // Description implements ImageHandler.
62 | func (*ImageHandlerRemove) Description() string {
63 | return "remove images"
64 | }
65 |
66 | // ImageHandlerEmbed adds the images to the dictzip without any additional
67 | // modifications. Usually, this would be the best choice, but unfortunately,
68 | // it is too buggy as of firmware 4.19.14123.
69 | type ImageHandlerEmbed struct{}
70 |
71 | // Transform implements ImageHandler.
72 | func (*ImageHandlerEmbed) Transform(src string, ir io.Reader, dw *kobodict.Writer) (string, string, error) {
73 | if !strings.HasSuffix(src, ".jpg") && !strings.HasSuffix(src, ".gif") {
74 | return "", "", fmt.Errorf("ImageHandlerEmbed: unsupported image file %s: extension must be .jpg or .gif when embedding", src)
75 | }
76 |
77 | // to generate a deterministic usually-unique filename
78 | fn := fmt.Sprintf("%x%s", sha1.Sum([]byte(src)), filepath.Ext(src))
79 | if !dw.Exists(fn) { // CreateFile will error if it already exists, and we're pretty confident the file is identical anyways
80 | if iw, err := dw.CreateFile(fn); err != nil {
81 | return "", "", fmt.Errorf("ImageHandlerEmbed: create dictfile entry %#v: %w", fn, err)
82 | } else if _, err := io.Copy(iw, ir); err != nil {
83 | return "", "", fmt.Errorf("ImageHandlerEmbed: copy image to dictfile: %w", err)
84 | }
85 | }
86 | return "dict:///" + fn, "", nil
87 | }
88 |
89 | // Description implements ImageHandler.
90 | func (*ImageHandlerEmbed) Description() string {
91 | return "add to dictzip as-is (warning: this causes entries to appear blank due to a bug in nickel as of firmware 4.20.14601)"
92 | }
93 |
94 | // ImageHandlerBase64 optimizes the image and encodes it as base64. This is the
95 | // most compatible option, but it comes at the expense of space and speed. In
96 | // addition, if there are too many images, it can lead to nickel running out of
97 | // memory when parsing the dictionary (and sickel should reboot it).
98 | //
99 | // In addition, it adds CSS to fix sizing issues (by default, images appear
100 | // really small when rendered in the dictionary due to default styling).
101 | //
102 | // This is currently the recommended option for adding images.
103 | //
104 | // You must import image/* yourself for format support.
105 | type ImageHandlerBase64 struct {
106 | // Images will be resized to fit within these dimensions, while preserving
107 | // aspect ratio. If not specified, the default is 1000x1000.
108 | MaxSize image.Point
109 | // NoGrayscale will prevent images from being grayscaled.
110 | NoGrayscale bool
111 | // JPEGQuality sets the JPEG quality for the encoded images. If not set, it
112 | // defaults to 60.
113 | JPEGQuality int
114 | }
115 |
116 | func (ih *ImageHandlerBase64) params() (maxWidth, maxHeight int, noGrayscale bool, jpegQuality int) {
117 | mw, mh := float64(ih.MaxSize.X), float64(ih.MaxSize.Y)
118 | if mw < 1 {
119 | mw = 1000
120 | }
121 | if mh < 1 {
122 | mh = 1000
123 | }
124 | ng := ih.NoGrayscale
125 | jq := ih.JPEGQuality
126 | if jq == 0 {
127 | jq = 60
128 | }
129 | return int(mw), int(mh), ng, jq
130 | }
131 |
132 | // Transform implements ImageHandler.
133 | func (ih *ImageHandlerBase64) Transform(src string, ir io.Reader, dw *kobodict.Writer) (string, string, error) {
134 | mw, mh, ng, jq := ih.params()
135 |
136 | // decode the image
137 | img, err := imaging.Decode(ir)
138 | if err != nil {
139 | return "", "", fmt.Errorf("ImageHandlerBase64: decode image: %w", err)
140 | }
141 |
142 | // resize it
143 | ow, oh := float64(img.Bounds().Dx()), float64(img.Bounds().Dy())
144 | sf := math.Min(float64(mw)/ow, float64(mh)/oh)
145 | img = imaging.Resize(img, int(ow*sf), int(oh*sf), imaging.Lanczos)
146 |
147 | // make it grayscale
148 | if ng {
149 | img = imaging.Grayscale(img)
150 | }
151 |
152 | // encode the image
153 | buf := bytes.NewBuffer(nil)
154 | bw := base64.NewEncoder(base64.StdEncoding, buf)
155 | if err := imaging.Encode(bw, img, imaging.JPEG, imaging.JPEGQuality(jq)); err != nil {
156 | return "", "", fmt.Errorf("ImageHandlerBase64: encode new image to dictfile: %w", err)
157 | }
158 | _ = bw.Close()
159 |
160 | // generate the css
161 | css := fmt.Sprintf("width:%dpx;height:%dpx;max-width:100%%;margin:1em auto;page-break-before:auto;object-fit:scale-down;object-position:center", img.Bounds().Dx(), img.Bounds().Dy())
162 |
163 | // build the URL
164 | return "data:image/jpeg;base64," + buf.String(), css, nil
165 | }
166 |
167 | // Description implements ImageHandler.
168 | func (ih *ImageHandlerBase64) Description() string {
169 | mw, mh, ng, jq := ih.params()
170 | return fmt.Sprintf("optimize and encode as base64 data URL (max_width=%d, max_height=%d, grayscale=%t, jpeg_quality=%d) (warning: this causes segfaults in the in-book dictionary due to a bug in nickel with firmware versions below 4.20.14601)", mw, mh, ng, jq)
171 | }
172 |
173 | var imgTagRe = regexp.MustCompile(`(<img\s+)((?:[^>]*\s+)?src\s*=\s*['"]+)([^'"]+)(['"][^>]*>)`)
174 |
175 | // transformHTMLImages transforms img tags in the specified HTML, using the
176 | // provided ImageFunc to read the referenced paths. If the reader it returns
177 | // implements io.Closer, it is closed automatically. Img tags whose src is a
178 | // data URL are skipped.
179 | //
180 | // The dictwriter may be used during this process, so callers should not rely on
181 | // any entries opened before calling this.
182 | func transformHTMLImages(ih ImageHandler, dw *kobodict.Writer, html []byte, img ImageFunc) ([]byte, error) {
183 | nhtml := html[:]
184 | for _, m := range imgTagRe.FindAllSubmatch(html, -1) {
185 | t, a, b, src, c := m[0], m[1], m[2], m[3], m[4]
186 | if bytes.HasPrefix(src, []byte("data:")) {
187 | continue
188 | }
189 | ir, err := img(string(src))
190 | if err != nil {
191 | return nil, fmt.Errorf("transform image %#v: open file: %w", string(src), err)
192 | }
193 | nsrc, css, err := ih.Transform(string(src), ir, dw)
194 | if err != nil {
195 | if c, ok := ir.(io.Closer); ok {
196 | c.Close()
197 | }
198 | return nil, fmt.Errorf("transform image %#v: transform image: %w", string(src), err)
199 | }
200 | if c, ok := ir.(io.Closer); ok {
201 | c.Close()
202 | }
203 | var nstyle string
204 | if len(css) != 0 {
205 | nstyle = " style=\"" + css + "\""
206 | }
207 | if len(nsrc) == 0 {
208 | nhtml = bytes.Replace(nhtml, t, nil, 1)
209 | } else {
210 | nhtml = bytes.Replace(nhtml, t, []byte(string(a)+nstyle+string(b)+nsrc+string(c)), 1)
211 | }
212 | }
213 | return nhtml, nil
214 | }
215 |
--------------------------------------------------------------------------------
/dictgen/image_test.go:
--------------------------------------------------------------------------------
1 | package dictgen
2 |
3 | import (
4 | "reflect"
5 | "testing"
6 | )
7 |
8 | func TestImgTagRe(t *testing.T) {
9 | inHTML := `
10 |
11 |
12 |
13 |
14 |
17 | `
18 | exImg := [][]string{
19 | {`
`},
20 | {`
`},
21 | {`
`},
22 | {`
`},
23 | {`
`},
26 | }
27 |
28 | acMatch := imgTagRe.FindAllStringSubmatch(inHTML, -1)
29 | acImg := make([][]string, len(acMatch))
30 | for i, m := range acMatch {
31 | acImg[i] = m[1:]
32 | }
33 |
34 | if !reflect.DeepEqual(exImg, acImg) {
35 | t.Errorf("Expected %#v, got %#v.", exImg, acImg)
36 | }
37 | }
38 |
39 | // TODO(v1): test the image handlers, especially the one which does the replacements
40 |
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | title: dictutil
2 | remote_theme: pmarsceill/just-the-docs
3 | url: https://pgaskin.net
4 | baseurl: /dictutil
5 | description: Tools, notes, and other stuff related to Kobo dictionaries.
6 | search_enabled: false
7 | aux_links:
8 | Download:
9 | - http://github.com/pgaskin/dictutil/releases/latest
10 | MobileRead:
11 | - https://www.mobileread.com/forums/showthread.php?t=327854
12 | GitHub:
13 | - http://github.com/pgaskin/dictutil
14 | heading_anchors: true
15 | footer_content: Copyright © 2020 Patrick Gaskin.
16 |
--------------------------------------------------------------------------------
/docs/_includes/head_custom.html:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/docs/dictgen/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: dictgen
4 | has_children: false
5 | ---
6 |
7 | # dictgen
8 |
9 | This section contains documentation for dictgen, a high-level tool to create Kobo dictionaries.
10 | {: .fs-6 .fw-300 }
11 |
12 | ## Usage
13 |
14 | ```
15 | Usage: dictgen [options] dictfile...
16 |
17 | Options:
18 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "dicthtml.zip")
19 | -c, --crypt string Encrypt the dictzip using the specified encryption method (format: method:keyhex)
20 | -I, --image-method string How to handle images (if an image path is relative, it is loaded from the current dir) (base64 - optimize and encode as base64, embed - add to dictzip, remove) (default "base64")
21 | --remove-footer Add code to prevent the non-applicable dictionary source footer for certain locales from being added after the entry (e.g. if replacing the French dictionary)
22 | -h, --help Show this help text
23 |
24 | If multiple dictfiles (*.df) are provided, they will be merged (duplicate entries are fine; they will be shown in sequential order). To read from stdin, use - as the filename.
25 |
26 | Note that currently, the only usable image method is removing them or using base64-encoding (for firmware 4.20.14601+; older versions segfault in the in-book dictionary if images are enabled), as embedded dict:/// image URLs cause the webviews to appear blank (this is a nickel bug). See https://github.com/pgaskin/dictutil/issues/1 for more details.
27 |
28 | See https://pgaskin.net/dictutil/dictgen for more information about the dictfile format.
29 | ```
30 |
31 | ## Example usage
32 |
33 | **Building a dictzip for a dictfile:**
34 |
35 | ```
36 | dictgen my-dictionary.df
37 | ```
38 |
39 | If you are using Windows, you can also drag-and-drop a dictfile onto dictgen.exe.
40 |
41 | **Merging multiple dictfiles into a single dictzip:**
42 |
43 | ```
44 | dictgen my-dictionary.df another.df
45 | ```
46 |
47 | If you are using Windows, you can also drag-and-drop multiple dictfiles onto dictgen.exe.
48 |
49 | **Building a dictzip with images removed:**
50 |
51 | ```
52 | dictgen -I remove my-dictionary.df
53 | ```
54 |
55 | **Specifying a custom output filename:**
56 |
57 | ```
58 | dictgen -o dicthtml-df.zip my-dictionary.df
59 | ```
60 |
61 | ## Dictfile format
62 | Dictgen uses a simple, but feature-complete format for representing Kobo dictionaries.
63 |
64 | A dictfile (with the file extension `.df`) is a plain-text file consisting of multiple entries.
65 |
66 | Each entry represents a single definition. There can be more than one entry per word. An entry is denoted by a line starting with `@ ` followed by the headword. The headword can contain spaces, capital letters, and so on.
67 |
68 | After the headword, zero or more header lines can be added. To add additional variants which will be matched, use `& ` followed by the word variant. The variant can be anything which could be used in a headword. This can be specified more than once, but only one variant can be specified for each `& `. Another header type is word information, denoted by a `: `. If specified, the text following it is appended after the bolded headword on the same line (see the English built-in dictionary for an example; it has things like `-verb` and the pronunciation information here). If you want to have complete control over how the entry is displayed, use `::` (without anything following it) instead of `: `. This will remove the default bolded headword at the top of the generated entry.
69 |
70 | After the header lines, you can include the body of the entry. By default, this uses [Markdown](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) for formatting. If you want to include raw HTML, prepend the HTML with `<html>` (don't include a closing tag). This can span multiple lines, and will continue until the next entry or end of file.
71 |
72 | In addition, you can include GIF and JPEG images in the body using the usual Markdown or HTML syntax. If the image path is relative (i.e. not a full path), it is resolved relative to the directory you run dictgen from.
73 |
74 | You can also include custom CSS (per-entry) by including it between `<style>` tags. This is supported in both HTML and Markdown mode.
75 |
76 | ## Dictfile reference
77 |
78 | - `@ HEADWORD`: Start a new entry. The headword doesn't have to be unique, and can contain spaces.
79 | - Header
80 | - `: WORD_INFO` or `::` *(optional)*: Add extra word info after the headword, or remove it entirely.
81 | - `& VARIANT` *(optional)*: Add an additional word to match. Follows the same rules as the headword. Can be repeated multiple times.
82 | - Body
83 | - `MARKDOWN` or `<html> RAW_HTML`: Include a definition written in Markdown or raw HTML code.
84 |
85 | ## Examples
86 |
87 | ### Simplest
88 |
89 | ```
90 | @ word
91 | Definition here.
92 | @ word 1
93 | Definition 1 here.
94 | @ test
95 | Blah blah blah.
96 | ```
97 |
98 | ### Simple
99 |
100 | ```
101 | @ no
102 | - No means no...
103 |
104 | @ NO
105 | - A different definition for nitric oxide.
106 | - Blah blah blah.
107 |
108 | @ go
109 | & went
110 | & going
111 | 1. This definition is matched by three different words.
112 | 2. It's also numbered rather than bulleted.
113 | - With some sub-items.
114 | - And another.
115 |
116 | An image:
117 |
118 | 
119 |
120 | @ test
121 | : this appears beside the headword
122 | Blah blah blah.
123 | ```
124 |
125 | ### Full
126 |
127 | ```
128 | @ word
129 | This is the definition of a word.
130 |
131 | @ word 2
132 | This is the definition of the second word.
133 |
134 | @ water
135 | & H2O
136 | 1. You can also use lists in Markdown.
137 | 2. And **bold text** or *italic text*.
138 | - Sub-items are also supported.
139 |
140 | @ test
141 | : -noun
142 | Blah blah blah.
143 |
144 | @ test
145 | : -verb
146 | Blah blah blah.
147 |
148 | @ custom
149 | ::
150 | **This is a custom word header!**
151 |
152 | And the definition here:
153 | - Blah blah blah.
154 | - Blah blah blah.
155 |
156 | @ images
157 | Embedding an image (relative paths):
158 |
159 | 
160 |
161 | Embedding an image (Linux/macOS style paths):
162 |
163 | 
164 |
165 | Embedding an image (Windows style paths):
166 |
167 | 
168 |
169 |
170 | @ raw-html
171 | This definition contains raw html.
172 |
173 | You can split it into multiple lines for readability.
174 |
175 |
176 | - You can also use all HTML tags.
177 | - This text has a dark background
178 | - This text is styled with CSS classes.
179 |
180 |
181 |
186 | ```
187 |
--------------------------------------------------------------------------------
/docs/dicthtml/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: dicthtml
4 | has_children: true
5 | ---
6 |
7 | # dicthtml
8 |
9 | This section contains documentation and notes about Kobo's dictionary format.
10 | {: .fs-6 .fw-300 }
--------------------------------------------------------------------------------
/docs/dicthtml/install.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Installing custom dictionaries
4 | parent: dicthtml
5 | ---
6 |
7 | # Installing custom dictionaries
8 | Sideloading custom dictionaries is easy, but slightly finicky.
9 |
10 | ## Using dictutil
11 | You can easily install dictionaries using dictutil. First, if you are not replacing a built-in dictionary, enable and install the **Enable searches on extra dictionaries** [patch](https://pgaskin.net/kobopatch-patches). Then, follow the [instructions for using the install command](../dictutil/install.html).
12 |
13 | You can uninstall custom dictionaries (including reverting overwritten built-in ones) using the [uninstall command](../dictutil/uninstall.html).
14 |
15 | ## Manual installation
16 | 1. Enable and install the **Enable searches on extra dictionaries** [patch](https://pgaskin.net/kobopatch-patches).
17 | 2. Copy the dictionary to `KOBOeReader/.kobo/dict/dicthtml-LOCALE.zip`, where **LOCALE** is a string consisting of 2 lowercase alphanumeric characters. It does not have to be a valid locale.
18 | 3. If using a firmware version 4.20.14601 or newer, mark the file as read-only (in Windows Explorer, or `chmod 444 dicthtml-LOCALE.zip`) to prevent nickel from overwriting it during the sync process.
19 | 4. If using a firmware version older than 4.20.14601, open `KOBOeReader/.kobo/KoboReader.sqlite` in a SQLite3 editor, and add a row to the Dictionary table with the following values (a sample `sqlite3` command is shown after these steps):
20 | - **Suffix:** `-LOCALE`, where **LOCALE** is the locale code you chose earlier. This is used when constructing filenames.
21 | - **Name:** `Extra:_LOCALE LABEL`, where **LOCALE** is the locale code you chose earlier, and **LABEL** is a custom label (it can have spaces in it).
22 | - **Installed:** `true`. This one is self-explanatory.
23 | - **Size:** `SIZE`, where *SIZE* is the size of the dictzip in bytes. This is displayed in the dictionary settings, but is unused otherwise, so it's fine if it isn't accurate as long as it is a valid number. For built-in dictionaries with `IsSynced` set, it is used to check for updates.
24 | - **IsSynced:** `false`. This is used to see if the sync process should attempt to sync the specified dictionary. If true, the `Size` column is checked against the expected size of the latest version (from the dictionary download server), and if it does not match, the new dictionary is downloaded over it.
25 | 5. Open `KOBOeReader/.kobo/Kobo/Kobo eReader.conf`, and add a line like `ExtraLocales=LOCALE` in the `ApplicationPreferences` section. If it already exists, add your locale code to it and keep the items separated by a comma and a space (e.g. `ExtraLocales=a1, a2`).
26 | 6. Eject your eReader and test the dictionary.
27 | - If the dictionary is unselectable, ensure you followed the steps correctly, especially regarding the locale codes.
28 | - If the dictionary says that the word wasn't found, or just acts unusually in general, ensure the dictionary file is valid.
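
As a reference for step 4, the row can be added with the `sqlite3` command-line tool. This is only a sketch: the locale code `xx`, the label, and the size are placeholders, and the exact value formats may differ between firmware versions, so compare against an existing row in your database before applying it.

```sh
# Placeholder values: replace xx, the label, and the size with your own.
sqlite3 /path/to/KOBOeReader/.kobo/KoboReader.sqlite \
  "INSERT INTO Dictionary (Suffix, Name, Installed, Size, IsSynced)
   VALUES ('-xx', 'Extra:_xx My Dictionary', 'true', '12345678', 'false');"
```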
29 |
30 | ## About locale names and patches
31 | The reason why the patch is required is due to a bug in the firmware. When you choose an entry from the dictionary dropdown, it tries to find a locale name matching it (which it uses to construct the filename for the dicthtml). Kobo has a hard-coded list of supported built-in locales, and supports adding extra ones using the **ApplicationPreferences->ExtraLocales** config file option (a comma separated list of locale codes). These locales have an automatically generated name of "Extra: LOCALE".
32 |
33 | But this is where the bug occurs. To support translation dictionaries, the dictionary selector will split the name by spaces, and only check against the first element. This is perfectly fine for one-word locale names (i.e. all the built-in ones). For custom locales, it will try to match **Extra:**, which doesn't exist, so it will default to the English dictionary. Thus, to fix this, the "Extra: " prefix used for the custom locales needs to be changed to one without a space. The patch replaces the space with an underscore. This bug does have one benefit though: since only stuff before the first space is considered, you can have a custom label after it.
34 |
35 | ## Alternative method
36 | It is also possible to install custom dictionaries by replacing an existing built-in installed dictionary in `KOBOeReader/.kobo/dict`. To prevent it from being overwritten during a sync, set the `IsSynced` column to `false` for it in the DB on firmware versions older than 4.20.14601, otherwise, mark it read-only.
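
For example, assuming the dictzip replaces the built-in French dictionary, the eReader is mounted at `/path/to/KOBOeReader` (both placeholders), and the firmware is 4.20.14601 or newer, this might look like:

```sh
cp dicthtml-fr.zip /path/to/KOBOeReader/.kobo/dict/dicthtml-fr.zip
chmod 444 /path/to/KOBOeReader/.kobo/dict/dicthtml-fr.zip  # read-only, so nickel won't overwrite it during sync
```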
37 |
38 | ## About changes in firmware 4.20.14601
39 |
40 | In short:
41 |
42 | - **Same:** Nickel will still attempt to sync all dictionaries, including sideloaded ones, unless IsSynced is false.
43 | - **New:** IsSynced can't be changed anymore due to the dictionary table being removed.
44 | - **New:** Nickel will avoid overwriting dictionary files if they are marked read-only, and will instead write `"dicthtml-LOCALE" marked as read-only.. skipping` to the log in the `sync` category. Note that this functionality has been around since at least 4.10.11655, but the database needed to be modified anyways, so there wasn't much point to using it (and nobody noticed it either).
45 | - **Same:** Nickel still generates locale names by default with `Extra: LOCALE`.
46 | - **New:** Nickel doesn't read the dictionary table anymore, so the name in it is ignored. In addition, entries in the table won't change anything even if it is still present.
47 | - **New:** The built-in dictionaries are hard-coded, rather than writing them to the db during migrations and reading from it at runtime.
48 | - **Same:** Nickel still has the bug where the locale splitting is messed up, so the `Extra: LOCALE` names are inherently broken.
49 | - **Same:** The matching can be fixed by replacing `Extra: ` with `Extra:_` (or anything not containing Unicode whitespace).
50 | - **New:** The database doesn't need to be changed anymore in addition to the patch, as the names are generated dynamically using the same string.
51 | - **Therefore:** If the dictionary table is present, it can safely be removed.
52 | - **Therefore:** The steps required to install custom dictionaries are now as follows (these have already been incorporated into the instructions above; they are repeated here for convenience):
53 | - Copy the dictzip and mark it read-only.
54 | - Add it to ExtraLocales if it is not a built-in locale.
55 |   - Use the patch to replace `Extra: ` in libnickel with any other string that is the same length or shorter (with a null byte at the end) and does not contain a space (` `).
56 |
57 | See [#49](https://github.com/pgaskin/kobopatch-patches/issues/49) for more information.
58 |
59 | ## Issues with the read-only method for preventing dictionaries from being overwritten
60 | There have been reports of the read-only property (see [#6](https://github.com/pgaskin/dictutil/issues/6) and the threads on MobileRead for more details) not having an effect since at least 4.20.14622. This seems to be due to other checks in the code (for IsSynced and the file size) preventing the read-only one from actually being checked under some conditions. Additionally, some people have had problems marking the dictionary as read-only to begin with (this doesn't seem to be an issue on Linux).
61 |
62 | For now, you can use this [patch](https://pgaskin.net/kobopatch-patches) (for kobopatch v0.15.0, which is included in patches v60+) to prevent all dictionaries from being synced. It should work on most recent firmware versions starting from 4.22.15190.
63 |
64 | ```yaml
65 | Never sync dictionaries:
66 | - Enabled: no
67 | - BaseAddress: {Sym: "SyncDictionariesCommand::prepareDownloadList()"}
68 | - ReplaceBytes: {Offset: 922, FindH: 0CD5, ReplaceH: 0CE0} #permissions
69 | - ReplaceBytes: {Offset: 900, FindH: FFF6CAAE, ReplaceInstNOP: true} #size
70 | - ReplaceBytes: {Offset: 866, FindH: 3FF4DBAE, ReplaceInstNOP: true} #isSynced
71 | ```
72 |
73 | For versions 4.20.14601 to 4.21.15015, use this patch instead:
74 |
75 | ```yaml
76 | Never sync dictionaries:
77 | - Enabled: no
78 | - BaseAddress: {Sym: "SyncDictionariesCommand::prepareDownloadList()"}
79 | - ReplaceBytes: {Offset: 1048, FindH: 0CD5, ReplaceH: 0CE0} #permissions
80 | - ReplaceBytes: {Offset: 1026, FindH: FFF68DAE, ReplaceInstNOP: true} #size
81 | - ReplaceBytes: {Offset: 992, FindH: 3FF49EAE, ReplaceInstNOP: true} #isSynced
82 | ```
83 |
--------------------------------------------------------------------------------
/docs/dicthtml/matching.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Matching words
4 | parent: dicthtml
5 | ---
6 |
7 | # Matching words
8 | TODO
9 |
--------------------------------------------------------------------------------
/docs/dicthtml/prefixes.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Prefixes
4 | parent: dicthtml
5 | ---
6 |
7 | # Prefixes
8 | Kobo dictionaries are sharded by a prefix derived from the headword.
9 |
10 | The information in this document is based on reverse engineering DictionaryParser::htmlForWord.
11 |
12 | **Note:** Kobo will only look in the file matching the word's prefix, so if a variant has a different prefix, it must be duplicated into each matching file (note that duplicate words aren't an issue).
13 |
14 | **Note:** This document only covers the algorithm used for non-Japanese (Kanji) dictionaries.
15 |
16 | ## Prefix algorithm
17 | Prefixes are calculated using the following steps. Note that "character" refers to a single Unicode code point, not a byte.
18 |
19 | 1. Trim the word at the first null byte, if any (i.e. treat it as a C string).
20 | 2. Discard everything but the first two characters.
21 | 3. Convert the characters to lowercase using the Unicode case mapping rules.
22 | 4. Trim all whitespace characters on the left and right sides.
23 | 5. If the string is empty, return "11".
24 | 6. If the first of the remaining characters is in the Unicode Cyrillic character class, return them as-is.
25 | 7. Right-pad the remaining characters to 2 characters long using "`a`"s.
26 | 8. If either of the first two characters are not in the Unicode Letter character class, return "11".
27 | 9. Return the characters as-is.
28 |
29 | ## Examples
30 |
31 |
32 |
33 | | Word | Prefix | Notes |
34 | | --- | --- | --- |
35 | | "`test`" | "`te`" | |
36 | | "`a`" | "`aa`" | |
37 | | "`Èe`" | "`èe`" | The word is made lowercase using unicode rules (i.e. accented characters are included). |
38 | | "`multiple words`" | "`mu`" | |
39 | | "`àççèñts`" | "`àç`" | |
40 | | "`à`" | "`àa`" | |
41 | | "`ç`" | "`ça`" | |
42 | | "" | "`11`" | |
43 | | "` `" | "`11`" | Space trimming is done after taking the first 2 characters. |
44 | | "` x`" | "`xa`" | |
45 | | "` 123`" | "`11`" | |
46 | | "`x 23`" | "`xa`" | |
47 | | "`д `" | "`д`" | "д" is a Cyrillic character, and it's the first character of the word (after trimming spaces), so it isn't padded with "a"s. |
48 | | "`дaд`" | "`дa`" | |
49 | | "`未未`" | "`未未`" | |
50 | | "`未`" | "`未a`" | Even though "未" is a multi-byte character, it is a single Unicode rune (and characters are counted, not bytes). |
51 | | "`  未`" | "`11`" | Space trimming is done after taking the first 2 characters, and here both of the first 2 characters are spaces. |
52 | | "` 未`" | "`未a`" | The multi-byte "未" character isn't split up when taking the first 2 characters. |
53 |
54 | ## Testing
55 | You can test Kobo's prefix algorithm directly using [dictword-test](https://github.com/pgaskin/kobo-mods/tree/master/dictword-test/).
56 |
57 | If you just want an easy way to generate prefixes for words, use the [dictutil prefix](../dictutil/prefix.html) command.
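
For example, to print the prefixes for a few (arbitrary) words as CSV:

```sh
dictutil prefix --format csv "test" "Èe" "未"
```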
58 |
59 | ## Sample implementation
60 | Here is the Go implementation used in dictutil:
61 |
62 | ```go
63 | func WordPrefix(word string) string {
64 | pfx := []rune(word)
65 |
66 | for i, c := range pfx {
67 | if i >= 2 || c == '\x00' { // limit to 2 chars, also cut at null
68 | pfx = pfx[:i] // trim up to current char
69 | break
70 | }
71 | pfx[i] = unicode.ToLower(c) // this includes accented chars
72 | }
73 |
74 | for len(pfx) != 0 {
75 | if unicode.IsSpace(pfx[0]) {
76 | pfx = pfx[1:] // trim left space
77 | } else {
78 | break
79 | }
80 | }
81 |
82 | for len(pfx) != 0 {
83 | if unicode.IsSpace(pfx[len(pfx)-1]) {
84 | pfx = pfx[:len(pfx)-1] // trim right space
85 | } else {
86 | break
87 | }
88 | }
89 |
90 | if len(pfx) == 0 {
91 | return "11" // if empty, return "11"
92 | }
93 |
94 | if !unicode.Is(unicode.Cyrillic, pfx[0]) {
95 | for len(pfx) < 2 {
96 | pfx = append(pfx, 'a') // pad right with 'a's to 2 chars
97 | }
98 | if !unicode.IsLetter(pfx[0]) || !unicode.IsLetter(pfx[1]) {
99 | return "11" // if either of the first 2 chars is not a letter, return "11"
100 | }
101 | }
102 |
103 | return string(pfx)
104 | }
105 | ```
106 |
--------------------------------------------------------------------------------
/docs/dicthtml/v1v2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pgaskin/dictutil/6708cff9a06dbd088ec2267a2314028a9a00b5a7/docs/dicthtml/v1v2-1.png
--------------------------------------------------------------------------------
/docs/dicthtml/v1v2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pgaskin/dictutil/6708cff9a06dbd088ec2267a2314028a9a00b5a7/docs/dicthtml/v1v2-2.png
--------------------------------------------------------------------------------
/docs/dicthtml/v1v2.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Dicthtml v1/v2
4 | parent: dicthtml
5 | ---
6 |
7 | # Dicthtml v1/v2
8 | In firmware 4.7.10364 (December 2017), Kobo introduced a new version of the dictionaries.
9 |
10 | The v1 dictionaries are served from download.kobobooks.com/ereader/dictionaries/dicthtml\*.zip, while the v2 dictionaries are served from download.kobobooks.com/ereader/dictionaries/v2/dicthtml\*.zip.
11 |
12 | While the v1 dictionaries are still available (probably for the Kobo Mini, which is still on 3.19.5761), they will not fully work on newer firmware versions due to the prefix changes.
13 |
14 | I haven't looked at the exact details about v1 dictionaries, but the main change seems to be the rules for computing prefixes for words with accents.
15 |
16 | ## Prefix changes
17 |
18 | 
19 |
20 | The primary change in v2 was the removal of the last step of prefix calculation - converting all non-ascii characters to `1`s. Note that this step is done after checking that the first two characters are all Unicode letters (which include accented letters), hence why the prefix wouldn't be `11` (which is used if any of the first 2 characters are not Unicode letters).
21 |
22 | ## Built-in dictionary fixes
23 |
24 | In addition, Kobo fixed some bugs with the dictionaries themselves. In v1, a few dictionaries were missing `<w>` tags around some words, presumably because the conversion code was buggy and the input format was undocumented/unstructured.
25 |
26 | 
27 |
28 | As illustrated by the diff above, some words weren't separated properly and a few line breaks were missing in v1.
--------------------------------------------------------------------------------
/docs/dictutil/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: dictutil
4 | has_children: true
5 | ---
6 |
7 | # dictutil
8 |
9 | This section contains documentation for dictutil, a tool to manipulate Kobo dictionaries.
10 | {: .fs-6 .fw-300 }
11 |
12 | ```
13 | Usage: dictutil command [options] [arguments]
14 |
15 | Dictutil provides low-level utilities to manipulate Kobo dictionaries (v2).
16 |
17 | Commands:
18 | install (I) Install a dictzip file
19 | pack (p) Pack a dictzip file
20 | prefix (x) Calculate the prefix for a word
21 | uninstall (U) Uninstall a dictzip file
22 | unpack (u) Unpack a dictzip file
23 | help Show help for all commands
24 |
25 | Options:
26 | -h, --help Show this help text
27 | ```
--------------------------------------------------------------------------------
/docs/dictutil/install.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Install
4 | parent: dictutil
5 | ---
6 |
7 | # Install
8 |
9 | ## Usage
10 |
11 | ```
12 | Usage: dictutil install [options] dictzip
13 |
14 | Options:
15 | -k, --kobo string KOBOeReader path (default: automatically detected)
16 | -l, --locale string Locale name to use (format: ALPHANUMERIC{2}[-ALPHANUMERIC{2}]) (default: detected from filename if in format dicthtml-**.zip)
17 | -n, --name string Custom additional label for dictionary (ignored when replacing built-in dictionaries) (doesn't have any effect on 4.20.14601+)
18 | -b, --builtin string How to handle built-in locales [replace = replace and prevent from syncing] [ignore = replace and leave syncing as-is] (doesn't have any effect on 4.24.15672+) (default "replace")
19 | -B, --no-custom Whether to force installation to .kobo/dict instead of .kobo/custom-dict (4.24.15672+ only)
20 | --use-extra-locales Whether to use ExtraLocales on 4.24.15672+ if not a built-in dictionary (this is not required anymore since 4.24.15672) (4.24.15672+ only)
21 | -h, --help Show this help text
22 |
23 | Note:
24 | If you are not replacing a built-in dictionary and are using a firmware
25 | version before 4.24.15672, the 'Enable searches on extra dictionaries patch'
26 | must be installed or you will not be able to select your custom dictionary.
27 | ```
28 |
29 | ## Examples
30 |
31 | **Install a dictionary with the locale in the filename (dicthtml-\*\*.zip):**
32 |
33 | ```sh
34 | dictutil install dicthtml-aa.zip
35 | ```
36 |
37 | **Install a dictionary with a different locale:**
38 |
39 | ```sh
40 | dictutil install --locale aa mydictionary.zip
41 | ```
42 |
43 | **Install a dictionary on a specific Kobo:**
44 |
45 | ```sh
46 | dictutil install --kobo /path/to/KOBOeReader dicthtml-aa.zip
47 | ```
48 |
49 | **Install a dictionary with a custom label (4.19.14123 and older):**
50 |
51 | ```sh
52 | dictutil install --name "My Dictionary" dicthtml-aa.zip
53 | ```
54 |
55 | ## Details
56 | See [installing dictionaries](../dicthtml/install.html) for more details on how this works.
57 |
--------------------------------------------------------------------------------
/docs/dictutil/pack.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Pack
4 | parent: dictutil
5 | ---
6 |
7 | # Pack
8 |
9 | ## Usage
10 |
11 | ```
12 | Usage: dictutil pack [options] dictdir
13 |
14 | Options:
15 | -o, --output string The output dictzip filename (will be overwritten if it exists) (default "dicthtml.zip")
16 | -c, --crypt string Encrypt the dictzip using the specified encryption method (format: method:keyhex)
17 | -h, --help Show this help text
18 | ```
19 |
20 | ## Examples
21 |
22 | **Pack a dictdir:**
23 |
24 | ```sh
25 | dictutil pack /path/to/dictdir
26 | # the output is written to dicthtml.zip
27 | ```
28 |
29 | **Pack a dictdir to a specific filename:**
30 |
31 | ```sh
32 | dictutil pack --output "dicthtml-aa.zip" /path/to/dictdir
33 | ```
34 |
35 | ## Input format
36 | The input dictdir is the same as the output of [dictutil unpack](./unpack.html).
37 |
--------------------------------------------------------------------------------
/docs/dictutil/prefix.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Prefix
4 | parent: dictutil
5 | ---
6 |
7 | # Prefix
8 |
9 | ## Usage
10 |
11 | ```
12 | Usage: dictutil prefix [options] word...
13 |
14 | Options:
15 | -f, --format string The output format (go-slice, go-map, csv, tsv, json-array, json-object) (default "json-array")
16 | -h, --help Show this help text
17 | ```
18 |
19 | ## Examples
20 |
21 | **Get the prefix for a word:**
22 |
23 | ```sh
24 | dictutil prefix "word"
25 | ```
26 |
27 | **Get the prefix for multiple words:**
28 |
29 | ```sh
30 | dictutil prefix "word1" "word2" "word3"
31 | ```
32 |
33 | **Get the prefix for multiple words as CSV:**
34 |
35 | ```sh
36 | dictutil prefix --format csv "word1" "word2" "word3"
37 | ```
38 |
--------------------------------------------------------------------------------
/docs/dictutil/uninstall.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Uninstall
4 | parent: dictutil
5 | ---
6 |
7 | # Uninstall
8 |
9 | ## Usage
10 |
11 | ```
12 | Usage: dictutil uninstall [options] locale
13 |
14 | Options:
15 | -k, --kobo string KOBOeReader path (default: automatically detected)
16 | -b, --builtin string How to handle built-in locales [normal = uninstall the same way as the UI] [delete = completely delete the entry (doesn't have any effect on 4.20.14601+)] [restore = download the original dictionary from Kobo again] (doesn't have any effect on 4.24.15672+) (default "normal")
17 | -B, --no-custom Uninstall built-in dictionaries instead of custom ones on 4.24.15672+
18 | -h, --help Show this help text
19 | ```
20 |
21 | ## Examples
22 |
23 | **Uninstall a dictionary:**
24 |
25 | ```sh
26 | dictutil uninstall aa
27 | ```
28 |
29 | **Restore an overwritten built-in dictionary:**
30 |
31 | ```sh
32 | dictutil uninstall --builtin restore fr
33 | ```
34 |
35 | **Completely delete a built-in dictionary:**
36 |
37 | ```sh
38 | dictutil uninstall --builtin delete fr
39 | ```
40 |
41 | Note: You can restore the dictionary by manually downloading it and using [dictutil install](./install.html).
42 |
43 | ## Details
44 | Uninstall does the following steps:
45 |
46 | 1. If the DB entry for the dictionary exists:
47 | - Built-in (normal): Set `Installed` to `false`.
48 | - Built-in (delete): Remove the row for the suffix.
49 | - Built-in (restore): Set `Installed` to `true`.
50 | - Extra: Remove the row for the suffix.
51 | 2. If the dictionary is not built-in and there is an `ExtraLocales` entry for the locale in the `.kobo/Kobo/Kobo eReader.conf`, remove it.
52 | 3. With the dictzip:
53 | - Built-in (normal): Delete it if it exists.
54 | - Built-in (delete): Delete it if it exists.
55 | - Built-in (restore): Delete it if it exists, then download it again from Kobo.
56 | - Extra: Delete it if it exists.
57 |
--------------------------------------------------------------------------------
/docs/dictutil/unpack.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Unpack
4 | parent: dictutil
5 | ---
6 |
7 | # Unpack
8 |
9 | ## Usage
10 |
11 | ```
12 | Usage: dictutil unpack [options] dictzip
13 |
14 | Options:
15 | -o, --output string The output directory (must not exist) (default: the basename of the input without the extension)
16 | -c, --crypt string Decrypt the dictzip (if needed) using the specified encryption method (format: method:keyhex)
17 | -h, --help Show this help text
18 | ```
19 |
20 | ## Examples
21 |
22 | **Unpack a dictionary:**
23 |
24 | ```sh
25 | dictutil unpack dicthtml.zip
26 | # The output is written to ./dicthtml
27 | ```
28 |
29 | ```sh
30 | dictutil unpack dicthtml-fr.zip
31 | # The output is written to ./dicthtml-fr
32 | ```
33 |
34 | **Unpack a dictionary to a custom directory:**
35 |
36 | ```
37 | dictutil unpack --output mydictionary dicthtml.zip
38 | ```
39 |
40 | ## Details
41 | An unpacked dictdir contains:
42 |
43 | - `words`: The parsed marisa word list (newline-separated).
44 | - `*.html`: The ungzipped dicthtml files.
45 | - `*`: Any additional files as-is.
46 |
--------------------------------------------------------------------------------
/docs/examples/bgl-convert.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: bgl-convert
4 | parent: examples
5 | ---
6 |
7 | # bgl-convert
8 | Converts Babylon BGL dictionaries into dictfiles for use with dictgen.
9 |
10 | Paste the BGL text in the box below to convert it:
11 |
12 |
13 |
14 | Example BGL:
15 |
16 | ```
17 | ### metadata trimmed for brevity
18 | ### ...
19 |
20 | headword
21 | Definition with html tags.
22 |
23 | headword1|variant1|variant2
24 | The second definition. Blah
25 | blah blah blah.
26 |
27 |
28 | ```
29 |
--------------------------------------------------------------------------------
/docs/examples/dictzip-decompile.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: dictzip-decompile
4 | parent: examples
5 | ---
6 |
7 | # dictzip-decompile
8 | This is an **experimental** tool to convert a dictzip into a dictfile. The output may not be perfect for complex dictionaries, but it should be perfect for dictionaries generated by Penelope.
9 |
10 | ## Usage
11 |
12 | ```
13 | Usage: dictzip-decompile [options] dictzip
14 |
15 | Options:
16 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./decompiled.df")
17 | -r, --resources Also extract referenced resources to the current directory (warning: any existing files will be overwritten, so it is recommended to run in an empty directory if enabled)
18 | -h, --help Show this help text
19 |
20 | Arguments:
21 | dictzip is the path to the dictzip to decompile.
22 |
23 | To convert the resulting dictfile into a dictzip, use dictgen.
24 |
25 | Note: The regenerated dictzip from the dictfile may not match exactly, but it will look the same, and certain bugs with prefixes and variants will be implicitly fixed by the conversion process (i.e. variant in wrong file, incorrect prefix, missing words in index file). All output is in raw HTML, not Markdown.
26 |
27 | This is an experimental tool, and the output may not be perfect on complex dictionaries.
28 | ```
29 |
30 | ## Example uses
31 | - Fixing prefixes or missing variants in dictzips generated by other tools (recompiling the dictfile will automatically fix the prefixes and variants).
32 | - Upgrading a v1 dictzip to v2 (same as above).
33 | - Decompiling a dictzip to merge it with another.
34 | - Converting a previously-created dictzip to a dictfile to make it easier to improve.
35 | - Converting StarDict dictionaries by converting to a dictzip using Penelope, then to a dictfile using this tool.
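
As a sketch of the round-trip described above (the filenames are placeholders), decompiling an existing dictzip and regenerating it with dictgen looks like this:

```sh
dictzip-decompile -o decompiled.df dicthtml-fr.zip
dictgen -o dicthtml-fr.zip decompiled.df  # recompiling implicitly fixes prefixes, variants, and the word index
```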
36 |
37 | ## Notes
38 | The following dictzip generators have enhanced decompilation support:
39 |
40 | - **Penelope:** The output should be perfect.
41 | - **Kobo (en, a few others):** The output should be mostly perfect, but there are a few missing edge cases. Variants (`&`) and header info (`:`) are extracted in addition to the entry content.
42 | - **Kobo (fr):** The output should be mostly perfect, but there are a few missing edge cases. Variants (`&`) and header info (`:`) are extracted in addition to the entry content.
43 | - **dictgen:** The output should be very close to the original dictfile (it has been tested with the output of gotdict-convert and webster1913-convert). With gotdict-convert, the only difference when the decompiled dictzip's dictfile was recompiled was the casing of a few entries in the words index. Even so, this should not be used unless the original dictfile has been lost. In addition, the original Markdown code and images are not recovered. Variants (`&`) and header info (`:` / `::`) are extracted in addition to the entry content.
44 |
45 | Other dictzips only have the headword (`@`) and variants (`&`) extracted, and the content is included as-is as raw HTML without support for other dictfile features.
46 |
--------------------------------------------------------------------------------
/docs/examples/gotdict-convert.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: gotdict-convert
4 | parent: examples
5 | ---
6 |
7 | # gotdict-convert
8 | This tool converts [gotdict](https://github.com/wjdp/gotdict) to a dictfile for conversion into a Kobo dictzip.
9 |
10 | Images are supported on firmware 4.20.14601+.
11 |
12 | ## Download
13 | Pre-built dictionaries can be downloaded from the following links:
14 | - GOTDict *(with images, firmware 4.20.14601+)*: [dictzip (dicthtml-gt.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.df?branch=master&all=false&pr=false)
15 | - GOTDict *(without images)*: [dictzip (dicthtml-gt.noimg.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.noimg.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.noimg.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.noimg.df?branch=master&all=false&pr=false)
16 |
17 | You can use [dictutil](../dictutil/install.html) to install the dictionaries, or see [here](../dicthtml/install.html) for manual installation instructions.
18 |
19 | ## Usage
20 |
21 | ```
22 | Usage: gotdict-convert [options]
23 |
24 | Version: dev
25 |
26 | Options:
27 | -g, --gotdict string The path to the local copy of github.com/wjdp/gotdict. (default "./gotdict")
28 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./gotdict.df")
29 | -I, --images Include images in dictfile
30 | -h, --help Show this help text
31 |
32 | To convert the resulting dictfile into a dictzip, use dictgen.
33 | ```
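
For example, assuming a local clone of gotdict in `./gotdict` (the output filenames are arbitrary), a full conversion to a dictzip might look like:

```sh
gotdict-convert -g ./gotdict -o gotdict.df --images
dictgen -o dicthtml-gt.zip gotdict.df
```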
34 |
35 | You can also use the parser as a [Go library](https://pkg.go.dev/github.com/pgaskin/dictutil/examples/gotdict-convert/gotdict).
36 |
--------------------------------------------------------------------------------
/docs/examples/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: examples
4 | has_children: true
5 | ---
6 |
7 | # Examples
8 |
9 | This section contains some tools which make use of dictutil.
10 | {: .fs-6 .fw-300 }
--------------------------------------------------------------------------------
/docs/examples/webster1913-convert.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: webster1913-convert
4 | parent: examples
5 | ---
6 |
7 | # webster1913-convert
8 | This tool converts [Project Gutenberg's Webster's Unabridged Dictionary](http://www.gutenberg.org/ebooks/29765.txt.utf-8) into a dictfile for conversion into a Kobo dictzip.
9 |
10 | ## Download
11 | Pre-built dictionaries can be downloaded from the following links:
12 | - Webster's 1913 Dictionary: [dictzip (dicthtml-wb.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/dicthtml-wb.zip?branch=master&all=false&pr=false), [source dictfile (webster1913.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/webster1913.df?branch=master&all=false&pr=false)
13 |
14 | You can use [dictutil](../dictutil/install.html) to install the dictionaries, or see [here](../dicthtml/install.html) for manual installation instructions.
15 |
16 | ## Usage
17 |
18 | ```
19 | Usage: webster1913-convert [options] gutenberg_webster1913_path
20 |
21 | Options:
22 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./webster1913.df")
23 | --dump Instead of converting, dump the parsed dictionary to stdout as JSON (for debugging)
24 | -h, --help Show this help text
25 |
26 | Arguments:
27 | gutenberg_webster1913_path is the path to Project Gutenberg's Webster's 1913 dictionary. Use - to read from stdin.
28 |
29 | To convert the resulting dictfile into a dictzip, use dictgen.
30 | ```
31 |
32 | The source dictionary can be downloaded [here](http://www.gutenberg.org/ebooks/29765.txt.utf-8) or [here](https://github.com/pgaskin/dictserver/raw/master/data/dictionary.txt).
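
For example, after downloading the source text to `webster1913.txt` (the filenames here are arbitrary), the conversion to a dictzip might look like:

```sh
webster1913-convert -o webster1913.df webster1913.txt
dictgen -o dicthtml-wb.zip webster1913.df
```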
33 |
34 | You can also use the parser as a [Go library](https://pkg.go.dev/github.com/pgaskin/dictutil/examples/webster1913-convert/webster1913).
35 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Documentation
4 | nav_order: 1
5 | ---
6 |
7 | # Dictutil
8 | {: .fs-9 }
9 |
10 | A collection of documentation and tools for working with Kobo dictionaries.
11 | {: .fs-6 .fw-300 }
12 |
13 | [Download](https://github.com/pgaskin/dictutil/releases){: .btn .btn-primary .fs-5 .mb-4 .mb-md-0 .mr-2 } [dicthtml](./dicthtml/){: .btn .fs-5 .mb-4 .mb-md-0 } [dictgen](./dictgen/){: .btn .fs-5 .mb-4 .mb-md-0 } [dictutil](./dictutil/){: .btn .fs-5 .mb-4 .mb-md-0 }
14 |
15 | ---
16 |
17 | **Prebuilt dictionaries:**
18 |
19 | [GOTDict](./examples/gotdict-convert.html#download){: .btn .fs-3 .mb-1 .mb-md-0 } [Webster's 1913 Unabridged Dictionary](./examples/webster1913-convert.html#download){: .btn .fs-3 .mb-1 .mb-md-0 }
20 |
21 | ---
22 |
23 | These tools are designed to work with v2 dictionaries (4.7.10364+).
24 |
25 | ## Getting started
26 | If you're interested in creating dictionaries, look at the [dictgen documentation](./dictgen/). If you're interested in installing or manipulating existing dictionaries, see the [dictutil documentation](./dictutil/). Otherwise, see the [dicthtml documentation](./dicthtml/) for more information about the Kobo dictionary format.
27 |
28 | ## dicthtml
29 | These pages are some notes I've made about the Kobo dictionary format based on reverse engineering the firmware and the official dictionaries.
30 |
31 | - **[Format](./dicthtml/format.html):** About the Kobo dictionary format.
32 | - **[Prefixes](./dicthtml/prefixes.html):** Details about prefix calculation.
33 | - **[v1/v2 dictionaries](./dicthtml/v1v2.html):** Changes between v1/v2 dictionaries.
34 | - **[Installing custom dictionaries](./dicthtml/install.html):** Notes about sideloading dictionaries.
35 |
36 | ## dictutil
37 | dictutil is a low-level tool to unpack, pack, and perform other operations on Kobo dictzips.
38 |
39 | - **[Dictutil](./dictutil/)**
40 | - **[Install](./dictutil/install.html):** Install a dictzip.
41 | - **[Uninstall](./dictutil/uninstall.html):** Uninstall a dictzip.
42 | - **[Pack](./dictutil/pack.html):** Pack a dictzip from a dictdir.
43 | - **[Unpack](./dictutil/unpack.html):** Unpack a dictzip into a dictdir.
44 | - **[Prefix](./dictutil/prefix.html):** Calculate the dicthtml prefix for a word.
45 |
46 | ## dictgen
47 | dictgen is an easy-to-use tool and library for generating Kobo dictionaries from scratch or for use in conversion scripts. It deals with all the unusual bits (e.g. variant capitalization, prefix generation) for you and warns when it can't; a rough sketch of a dictfile entry follows the list below.
48 |
49 | - **[Dictgen](./dictgen#usage)**
50 | - **[Dictfile format](./dictgen#dictfile-format)**
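A dictfile entry looks roughly like the sketch below (see the [dictfile format](./dictgen#dictfile-format) page for the authoritative syntax; the `@`/`&`/`:` line prefixes shown here are assumptions based on that page):

```
@ winter
& winters
: -noun
The coldest season of the year, *between autumn and spring*.
```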
51 |
52 | ## examples
53 | These are some tools which make use of dictutil to convert actual dictionaries.
54 |
55 | - **[gotdict-convert](./examples/gotdict-convert.html):** Converts [github.com/wjdp/gotdict](https://github.com/wjdp/gotdict) to a dictfile.
56 | - **[webster1913-convert](./examples/webster1913-convert.html):** Converts [Project Gutenberg's Webster's Unabridged Dictionary](http://www.gutenberg.org/ebooks/29765.txt.utf-8) to a dictfile.
57 | - **[dictzip-decompile](./examples/dictzip-decompile.html):** An **experimental** tool to convert a dictzip into a dictfile.
58 | - **[bgl-convert](./examples/bgl-convert.html):** A simple tool to convert Babylon BGL dictionaries to a dictfile.
59 |
60 | ## other
61 |
62 | - **[dictword-test](https://github.com/pgaskin/kobo-mods/tree/master/dictword-test):** Calculates word prefixes using libnickel.
63 | - **[marisa](https://github.com/pgaskin/dictutil/tree/master/marisa):** Marisa bindings for Go.
64 |
--------------------------------------------------------------------------------
/examples/bgl-convert/index.html:
--------------------------------------------------------------------------------
5 | BGL Converter
--------------------------------------------------------------------------------
/examples/dictzip-decompile/main.go:
--------------------------------------------------------------------------------
1 | // Command dictzip-decompile converts a dictzip into a dictfile. The regenerated
2 | // dictzip from the dictfile may not match exactly, but it will look the same,
3 | // and certain bugs with prefixes and variants will be implicitly fixed by the
4 | // conversion process (i.e. variant in wrong file, incorrect prefix, missing
5 | // words in index file). All output is in raw HTML, not Markdown.
6 | //
7 | // This is an experimental tool, and the output may not be perfect on complex
8 | // dictionaries.
9 | package main
10 |
11 | import (
12 | "fmt"
13 | "io"
14 | "os"
15 |
16 | "github.com/pgaskin/dictutil/kobodict"
17 | "github.com/spf13/pflag"
18 |
19 | _ "github.com/pgaskin/dictutil/kobodict/marisa"
20 | )
21 |
22 | var version = "dev"
23 |
24 | func main() {
25 | pflag.CommandLine.SortFlags = false
26 | output := pflag.StringP("output", "o", "."+string(os.PathSeparator)+"decompiled.df", "The output filename (will be overwritten if it exists) (- is stdout)")
27 | resources := pflag.BoolP("resources", "r", false, "Also extract referenced resources to the current directory (warning: any existing files will be overwritten, so it is recommended to run in an empty directory if enabled)")
28 | help := pflag.BoolP("help", "h", false, "Show this help text")
29 | pflag.Parse()
30 |
31 | if *help || pflag.NArg() != 1 {
32 | fmt.Fprintf(os.Stderr, "Usage: %s [options] dictzip\n\nVersion: dictzip-decompile %s\n\nOptions:\n%s\nArguments:\n dictzip is the path to the dictzip to decompile.\n\nTo convert the resulting dictfile into a dictzip, use dictgen.\n\nNote: The regenerated dictzip from the dictfile may not match exactly, but it will look the same, and certain bugs with prefixes and variants will be implicitly fixed by the conversion process (i.e. variant in wrong file, incorrect prefix, missing words in index file). All output is in raw HTML, not Markdown.\n\nThis is an experimental tool, and the output may not be perfect on complex dictionaries.\n", os.Args[0], version, pflag.CommandLine.FlagUsages())
33 | if pflag.NArg() != 0 {
34 | os.Exit(2)
35 | } else {
36 | os.Exit(0)
37 | }
38 | return
39 | }
40 |
41 | fn := pflag.Args()[0]
42 |
43 | fmt.Fprintf(os.Stderr, "Opening input dictzip.\n")
44 | f, err := os.Open(fn)
45 | if err != nil {
46 | fmt.Fprintf(os.Stderr, "Error: open input file %#v: %v.\n", fn, err)
47 | os.Exit(1)
48 | return
49 | }
50 | defer f.Close()
51 |
52 | s, err := f.Stat()
53 | if err != nil {
54 | fmt.Fprintf(os.Stderr, "Error: stat input file %#v: %v.\n", fn, err)
55 | os.Exit(1)
56 | return
57 | }
58 |
59 | fmt.Fprintf(os.Stderr, "Parsing dictzip.\n")
60 | dr, err := kobodict.NewReader(f, s.Size())
61 | if err != nil {
62 | fmt.Fprintf(os.Stderr, "Error: parse input file %#v: %v.\n", fn, err)
63 | os.Exit(1)
64 | return
65 | }
66 |
67 | fmt.Fprintf(os.Stderr, "Decompiling dictzip.\n")
68 | df, err := decompile(dr)
69 | if err != nil {
70 | fmt.Fprintf(os.Stderr, "Error: decompile dictzip %#v: %v.\n", fn, err)
71 | os.Exit(1)
72 | return
73 | }
74 |
75 | if *resources {
76 | fmt.Fprintf(os.Stderr, "Extracting resources.\n")
77 | for _, f := range dr.File {
78 | fmt.Fprintf(os.Stderr, " ./%s\n", f.Name)
79 | if err := func() error {
80 | rc, err := f.Open()
81 | if err != nil {
82 | return fmt.Errorf("open: %w", err)
83 | }
84 | defer rc.Close()
85 |
86 | f, err := os.OpenFile(f.Name, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
87 | if err != nil {
88 | return fmt.Errorf("create output: %w", err)
89 | }
90 | defer f.Close()
91 |
92 | if _, err := io.Copy(f, rc); err != nil {
93 | return fmt.Errorf("copy: %w", err)
94 | }
95 |
96 | if err := f.Close(); err != nil {
97 | return fmt.Errorf("write output: %w", err)
98 | }
99 |
100 | return nil
101 | }(); err != nil {
102 | fmt.Fprintf(os.Stderr, "Error: extract resource %#v: %v.\n", f.Name, err)
103 | os.Exit(1)
104 | return
105 | }
106 | }
107 | } else {
108 | if len(dr.File) != 0 {
109 | fmt.Fprintf(os.Stderr, "Warning: dictfile contains %d resources, but skipping because resource extraction is not enabled (see --help for more details).\n", len(dr.File))
110 | }
111 | }
112 |
113 | fmt.Fprintf(os.Stderr, "Writing dictfile.\n")
114 | switch *output {
115 | case "-":
116 | if err := df.WriteDictFile(os.Stdout); err != nil {
117 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
118 | os.Exit(1)
119 | return
120 | }
121 | default:
122 | f, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
123 | if err != nil {
124 | fmt.Fprintf(os.Stderr, "Error: create dictfile: %v\n", err)
125 | os.Exit(1)
126 | return
127 | }
128 |
129 | if err := df.WriteDictFile(f); err != nil {
130 | f.Close()
131 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
132 | os.Exit(1)
133 | return
134 | }
135 |
136 | if err := f.Close(); err != nil {
137 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
138 | os.Exit(1)
139 | return
140 | }
141 | }
142 |
143 | fmt.Fprintf(os.Stderr, "Successfully converted %d entries from dictzip %#v to dictfile %s.\n", len(df), fn, *output)
144 | os.Exit(0)
145 | }
146 |
--------------------------------------------------------------------------------
/examples/dictzip-decompile/parse.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bytes"
5 | "crypto/sha1"
6 | "fmt"
7 | "io/ioutil"
8 | "regexp"
9 | "unicode"
10 |
11 | "github.com/pgaskin/dictutil/dictgen"
12 | "github.com/pgaskin/dictutil/kobodict"
13 | )
14 |
15 | // This isn't exposed as a separate package, as it's subject to change and
16 | // highly specific to dictzip-decompile.
17 |
18 | // The regexps used to extract data should have a similar level of strictness as
19 | // the ones used by nickel (for simplicity, compatibility, and predictability).
20 |
21 | // decompile decompiles a dictzip into a dictfile. External resources are not
22 | // extracted, and must be done separately.
23 | //
24 | // Duplicate entries (e.g. the ones added by dictgen for fixing broken variants)
25 | // are collapsed into one. They will be expanded again as necessary when the
26 | // dictfile is compiled by dictgen.
27 | func decompile(r *kobodict.Reader) (dictgen.DictFile, error) {
28 | var df dictgen.DictFile
29 | seenEntries := map[[20]byte]struct{}{}
30 | for _, dh := range r.Dicthtml {
31 | if err := func() error {
32 | rc, err := dh.Open()
33 | if err != nil {
34 | return fmt.Errorf("open: %w", err)
35 | }
36 | defer rc.Close()
37 |
38 | buf, err := ioutil.ReadAll(rc)
39 | if err != nil {
40 | return fmt.Errorf("read: %w", err)
41 | }
42 |
43 | es, err := extractEntries(buf)
44 | if err != nil {
45 | return fmt.Errorf("extract entries: %w", err)
46 | }
47 |
48 | for _, e := range es {
49 | ss := sha1.Sum(e)
50 | if _, ok := seenEntries[ss]; ok {
51 | continue
52 | }
53 | seenEntries[ss] = struct{}{}
54 |
55 | de, err := decompileEntry(e)
56 | if err != nil {
57 | return fmt.Errorf("decompile entry %#v: %w", string(e), err)
58 | }
59 |
60 | df = append(df, de)
61 | }
62 |
63 | return nil
64 | }(); err != nil {
65 | return nil, fmt.Errorf("process dicthtml %#v: %w", dh.Name, err)
66 | }
67 | }
68 | return df, nil
69 | }
70 |
71 | // The regexps/vars used by decompileEntry.
72 | var (
73 | // generator matchers (match the entire entry, split into parts) (match in order) (don't include variants here)
74 | generator1PenelopeRe = regexp.MustCompile(`^(?s)([^<]+)<\/b>
(.+)<\/div>$`) // also: first and second groups must be equal
75 | generator2KoboFrRe = regexp.MustCompile(`^(?s)|><\/a>)\s*([^<]+)\s*<\/b>\s*(.*?)
\s*(.+)\s*<\/p>$`) // also: 2nd and 3rd (header) group must not contain "
|><\/a>)\s*(.+?)\s*<\/b>\s*(.*?)\s*<\/p>\s*(.+)\s*$`) // also: 2nd and 3rd (header) group must not contain "
|><\/a>)`) // this is slightly more lenient than some of Kobo's (it makes the space before the closing optional)
79 | // other matchers
80 | variantsRe = regexp.MustCompile(`(.*?)<\/var>`)
81 | variantsItemRe = regexp.MustCompile(`|><\/variant>)`)
82 | )
83 |
84 | // decompileEntry parses an entry (it must be trimmed).
85 | func decompileEntry(buf []byte) (*dictgen.DictFileEntry, error) {
86 | var entry dictgen.DictFileEntry
87 |
88 | // Generator-specific enhanced extraction (for making use of dictfile lines
89 | // starting with &, :, etc).
90 | var generatorMatched bool
91 | // -- Penelope: https://github.com/pettarin/penelope/blob/fce6dcfd899d3755ae3a5a3867d7d436105ada56/penelope/format_kobo.py#L167
92 | // e.g. dfgdfg
Penelope
sdfsdf
93 | if !generatorMatched {
94 | if m := generator1PenelopeRe.FindSubmatch(buf); len(m) != 0 {
95 | headwordIndex, headwordDisplay, contentHTML := m[1], m[2], m[3]
96 | if !bytes.Equal(headwordIndex, headwordDisplay) {
97 | // it's a false positive if those aren't identical
98 | } else {
99 | entry.Headword = string(headwordIndex)
100 | entry.RawHTML = true
101 | entry.Definition = string(contentHTML)
102 | generatorMatched = true
103 | }
104 | }
105 | }
106 | // -- Kobo: based on dicthtml-fr
107 | // e.g. a-, an-
- Élément exprimant la négation ( pas ), ou la privation ( sans ).
⇒anormal, apolitique.
108 | if !generatorMatched {
109 | if m := generator2KoboFrRe.FindSubmatch(buf); len(m) != 0 {
110 | headwordIndex, headwordDisplay, headerInfo, contentHTML := m[1], m[2], m[3], m[4]
111 | if bytes.Contains(headwordDisplay, []byte("
"
128 | generatorMatched = true
129 | }
130 | }
131 | }
132 | // -- Kobo: based on dicthtml-en, a few others
133 | // e.g. ab ['ab] -n
- an abdominal muscle usu. used in pl.
- about
134 | // -- or dictgen
135 | // e.g. a A (# emph. #).
- Etym: [Shortened form of an. AS. an one. See One.] An adjective, commonly called the indefinite article, and signifying one or any, but less emphatically.
- "At a birth"; "In a word"; "At a blow". Shak. Note: It is placed before nouns of the singular number denoting an individual object, or a quality individualized, before collective nouns, and also before plural nouns when the adjective few or the phrase great many or good many is interposed; as, a dog, a house, a man; a color; a sweetness; a hundred, a fleet, a regiment; a few persons, a great many days. It is used for an, for the sake of euphony, before words beginning with a consonant sound [for exception of certain words beginning with h, see An]; as, a table, a woman, a year, a unit, a eulogy, a ewe, a oneness, such a one, etc. Formally an was used both before vowels and consonants.
- Etym: [Originally the preposition a (an, on).] In each; to or for each; as, "twenty leagues a day", "a hundred pounds a year", "a dollar a yard", etc.
136 | if !generatorMatched {
137 | if m := generator3KoboEnOrDictutilRe.FindSubmatch(buf); len(m) != 0 {
138 | headwordIndex, headwordDisplay, headerInfo, contentHTML := m[1], m[2], m[3], m[4]
139 | if bytes.Contains(headwordDisplay, []byte("
dfkgjdlfjglkdfjg
162 | if !generatorMatched {
163 | entry.NoHeader = true
164 | entry.RawHTML = true
165 | entry.Definition = string(headFallbackIndexWordRe.ReplaceAllFunc(buf, func(src []byte) []byte {
166 | if entry.Headword != "" {
167 | return src // don't continue after the first headword has been found
168 | }
169 | entry.Headword = string(headFallbackIndexWordRe.FindSubmatch(src)[1])
170 | return nil // remove the entire a tag
171 | }))
172 | if entry.Headword == "" {
173 | return nil, fmt.Errorf("no headword found in %#v", string(buf))
174 | }
175 | generatorMatched = true
176 | }
177 |
178 | // Add any additional headwords (then remove) (which really shouldn't be there in the first place) as variants.
179 | // i.e. stray <a name="..."></a> tags (but not if the link has text, because then it's not a headword anymore)
180 | entry.Definition = string(headFallbackIndexWordRe.ReplaceAllFunc([]byte(entry.Definition), func(src []byte) []byte {
181 | entry.Variant = append(entry.Variant, string(headFallbackIndexWordRe.FindSubmatch(src)[1]))
182 | return nil // remove the entire a tag
183 | }))
184 |
185 | // Append (then remove) any variants found in the raw html.
186 | // i.e. <variant> tags inside <var> ones
187 | entry.Definition = string(variantsRe.ReplaceAllFunc([]byte(entry.Definition), func(src []byte) []byte {
188 | for _, m := range variantsItemRe.FindAllSubmatch(src, -1) {
189 | entry.Variant = append(entry.Variant, string(m[1]))
190 | }
191 | return nil // remove the entire variant tag
192 | }))
193 |
194 | return &entry, nil
195 | }
196 |
197 | // The regexps/vars used by extractEntries.
198 | var (
199 | htmlStart = []byte("")
200 | htmlEnd = []byte("")
201 | entryRe = regexp.MustCompile(`(?s)\s*(.+?)\s*<\/w>`)
202 | )
203 |
204 | // extractEntries gets the trimmed body of each entry in the dicthtml file.
205 | func extractEntries(buf []byte) ([][]byte, error) {
206 | if idx := bytes.Index(buf, htmlStart); idx < 0 {
207 | return nil, fmt.Errorf("missing %s tag", string(htmlStart))
208 | } else {
209 | buf = buf[idx+len(htmlStart):]
210 | }
211 |
212 | if idx := bytes.LastIndex(buf, htmlEnd); idx < 0 {
213 | return nil, fmt.Errorf("missing %s tag", string(htmlStart))
214 | } else {
215 | buf = buf[:idx]
216 | }
217 |
218 | var entries [][]byte
219 |
220 | var cur, prev, body []int
221 | prev = []int{0, 0}
222 | for _, m := range entryRe.FindAllSubmatchIndex(buf, -1) {
223 | cur, body = m[0:2][:], m[2:4]
224 | for _, b := range buf[prev[1]:cur[0]] {
225 | // note: even though we might split up multi-byte utf-8 chars
226 | // here, it's fine, as the whitespace should be ascii if any,
227 | // and if there is anything else, it's an issue.
228 | if !unicode.IsSpace(rune(b)) {
229 | return nil, fmt.Errorf("non-whitespace between word entries (%#v in %#v before %#v)", string(rune(b)), string(buf[prev[1]:cur[0]]), string(buf[cur[0]:cur[1]]))
230 | }
231 | }
232 | prev = cur
233 | entries = append(entries, buf[body[0]:body[1]])
234 | }
235 | for _, b := range buf[prev[1]:] {
236 | if !unicode.IsSpace(rune(b)) {
237 | return nil, fmt.Errorf("non-whitespace after last word entry (%#v in %#v)", string(rune(b)), string(buf[prev[1]:]))
238 | }
239 | }
240 |
241 | return entries, nil
242 | }
243 |
--------------------------------------------------------------------------------
/examples/gotdict-convert/gotdict/parser.go:
--------------------------------------------------------------------------------
1 | // Package gotdict parses GOTDict (https://github.com/wjdp/gotdict).
2 | package gotdict
3 |
4 | import (
5 | "bytes"
6 | "fmt"
7 | "io/ioutil"
8 | "os"
9 | "path/filepath"
10 | "regexp"
11 | "sort"
12 | "strings"
13 | "unicode"
14 |
15 | "gopkg.in/yaml.v2"
16 | )
17 |
18 | // Dict represents the parsed GOTDict dictionary.
19 | type Dict []*Def
20 |
21 | // Def represents a definition.
22 | type Def struct {
23 | // Title is the main title of the definition (it may contain spaces) (e.g. Tyrion Lannister).
24 | Title string
25 | // Terms are other forms of the title which should be recognized.
26 | Terms []string
27 | // Type is the record type. Currently, not many entries have one.
28 | Type Type
29 | // Images contains referenced image files.
30 | Images map[string][]byte
31 | // Definition contains the Markdown definition.
32 | Definition string
33 | }
34 |
35 | // Type is a Dict record type.
36 | type Type string
37 |
38 | const (
39 | // TypeUnknown is used for definitions without a type set (i.e. before types were used).
40 | TypeUnknown Type = ""
41 | // TypeCharacter is a character (e.g. Jon, Tyrion).
42 | TypeCharacter Type = "character"
43 | // TypeHouse is a house (e.g. Lannister, Stark).
44 | TypeHouse Type = "house"
45 | // TypeEvent is an event in time.
46 | TypeEvent Type = "event"
47 | // TypeCity is a city.
48 | TypeCity Type = "city"
49 | // TypeLocation is a location (e.g. King's Landing).
50 | TypeLocation Type = "location"
51 | // TypeRiver is a river.
52 | TypeRiver Type = "river"
53 | // TypeShip is a ship.
54 | TypeShip Type = "ship"
55 | // TypeWord is an uncommon or ASOIAF-specific word.
56 | TypeWord Type = "word"
57 | )
58 |
59 | // Parse parses the GOTDict definitions in defdir. If imgdir is an empty string,
60 | // images are removed. If imgref is true, image paths are set to the full filepath
61 | // rather than reading the images into memory.
62 | func Parse(defdir, imgdir string, imgref bool) (Dict, error) {
63 | var dict Dict
64 |
65 | fis, err := ioutil.ReadDir(defdir)
66 | if err != nil {
67 | return nil, err
68 | }
69 |
70 | seen := map[string]*Def{}
71 | for _, fi := range fis {
72 | if filepath.Ext(fi.Name()) != ".mdd" {
73 | continue
74 | }
75 |
76 | buf, err := ioutil.ReadFile(filepath.Join(defdir, fi.Name()))
77 | if err != nil {
78 | return nil, err
79 | }
80 |
81 | var obj struct {
82 | Title string `yaml:"title"`
83 | Terms []string `yaml:"terms"`
84 | Type Type `yaml:"type"`
85 | }
86 |
87 | md, err := unmarshalStrictFrontMatter(buf, &obj)
88 | if err != nil {
89 | return nil, fmt.Errorf("parse %s frontmatter: %w", fi.Name(), err)
90 | } else if obj.Title == "" {
91 | return nil, fmt.Errorf("parse %s frontmatter: title not set", fi.Name())
92 | }
93 |
94 | def := &Def{}
95 |
96 | obj.Title = strings.TrimSpace(obj.Title)
97 | if odef, ok := seen[obj.Title]; ok {
98 | return nil, fmt.Errorf("parse %s: already seen %#v in other def %#v", fi.Name(), def.Title, odef)
99 | }
100 | seen[obj.Title] = def
101 | def.Title = obj.Title
102 |
103 | for _, term := range obj.Terms {
104 | term = strings.TrimSpace(term)
105 | if odef, ok := seen[term]; ok && term != "Jon Umber" { // it's usually a mistake to have duplicate terms (but remember that dictgen will handle them fine)
106 | return nil, fmt.Errorf("parse %s: already seen term %#v in other def %#v", fi.Name(), term, odef)
107 | }
108 | seen[term] = def
109 | def.Terms = append(def.Terms, term)
110 | }
111 |
112 | def.Type = Type(strings.TrimSpace(string(obj.Type)))
113 | def.Images = map[string][]byte{}
114 | def.Definition = string(md)
115 |
116 | if imgdir == "" {
117 | def.Definition = regexp.MustCompile(`(\s*Map on [Nn]ext [Pp]age\.?)|(\s*\(Map on [Nn]ext [Pp]age\.?\))|(!\[[^]]*\]\([^)]+\))`).ReplaceAllLiteralString(def.Definition, "")
118 | } else {
119 | var repl []string
120 | for _, img := range regexp.MustCompile(`!\[[^]]*\]\((images/)?([^)]+)\)`).FindAllStringSubmatch(def.Definition, -1) {
121 | if img[1] == "" {
122 | return nil, fmt.Errorf("parse %s: unknown image path %#v", fi.Name(), img[1])
123 | }
124 | fn, err := filepath.Abs(filepath.Join(imgdir, img[2]))
125 | if err != nil {
126 | return nil, fmt.Errorf("parse %s: resolve image %#v: %w", fi.Name(), img[1], err)
127 | }
128 | if imgref {
129 | if _, err := os.Stat(fn); err != nil {
130 | return nil, fmt.Errorf("parse %s: stat image %#v: %w", fi.Name(), img[1], err)
131 | }
132 | repl = append(repl, "("+img[1]+img[2]+")", "("+fn+")")
133 | } else {
134 | imgbuf, err := ioutil.ReadFile(fn)
135 | if err != nil {
136 | return nil, fmt.Errorf("parse %s: read image %#v: %w", fi.Name(), img[1], err)
137 | }
138 | def.Images[img[2]] = imgbuf
139 | repl = append(repl, "("+img[1]+img[2]+")", "("+img[2]+")")
140 | }
141 | }
142 | def.Definition = strings.NewReplacer(repl...).Replace(def.Definition)
143 | }
144 |
145 | def.Definition = strings.TrimSpace(def.Definition)
146 |
147 | dict = append(dict, def)
148 | }
149 |
150 | sort.Slice(dict, func(i, j int) bool {
151 | return dict[i].Title < dict[j].Title
152 | })
153 |
154 | return dict, nil
155 | }
156 |
157 | func unmarshalStrictFrontMatter(buf []byte, v interface{}) (content []byte, err error) {
158 | spl := bytes.SplitN(buf, []byte{'-', '-', '-'}, 3)
159 | for _, b := range spl[0] {
160 | if !unicode.IsSpace(rune(b)) {
161 | return buf, nil
162 | }
163 | }
164 | return spl[2], yaml.UnmarshalStrict(spl[1], v)
165 | }
166 |
--------------------------------------------------------------------------------
/examples/gotdict-convert/main.go:
--------------------------------------------------------------------------------
1 | // Command gotdict-convert converts GOTDict (https://github.com/wjdp/gotdict) to
2 | // a dictgen dictfile.
3 | package main
4 |
5 | import (
6 | "fmt"
7 | "os"
8 | "path/filepath"
9 |
10 | "github.com/spf13/pflag"
11 |
12 | "github.com/pgaskin/dictutil/dictgen"
13 | "github.com/pgaskin/dictutil/examples/gotdict-convert/gotdict"
14 | )
15 |
16 | var version = "dev"
17 |
18 | func main() {
19 | pflag.CommandLine.SortFlags = false
20 | gotdictp := pflag.StringP("gotdict", "g", "."+string(os.PathSeparator)+"gotdict", "The path to the local copy of github.com/wjdp/gotdict.")
21 | output := pflag.StringP("output", "o", "."+string(os.PathSeparator)+"gotdict.df", "The output filename (will be overwritten if it exists) (- is stdout)")
22 | images := pflag.BoolP("images", "I", false, "Include images in the generated dictfile")
23 | help := pflag.BoolP("help", "h", false, "Show this help text")
24 | pflag.Parse()
25 |
26 | if *help || pflag.NArg() != 0 {
27 | fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\nVersion: gotdict-convert %s\n\nOptions:\n%s\nTo convert the resulting dictfile into a dictzip, use dictgen.\n", os.Args[0], version, pflag.CommandLine.FlagUsages())
28 | os.Exit(0)
29 | return
30 | }
31 |
32 | var img string
33 | if *images {
34 | fmt.Fprintf(os.Stderr, "Parsing gotdict (with images).\n")
35 | img = filepath.Join(*gotdictp, "images")
36 | } else {
37 | fmt.Fprintf(os.Stderr, "Parsing gotdict (no images).\n")
38 | }
39 |
40 | gd, err := gotdict.Parse(filepath.Join(*gotdictp, "_definitions"), img, true)
41 | if err != nil {
42 | fmt.Fprintf(os.Stderr, "Error: parse gotdict: %v\n", err)
43 | os.Exit(1)
44 | return
45 | }
46 |
47 | fmt.Fprintf(os.Stderr, "Transforming definitions.\n")
48 | var df dictgen.DictFile
49 | for _, d := range gd {
50 | var hwi string
51 | if d.Type != "" {
52 | hwi = "-" + string(d.Type)
53 | }
54 |
55 | df = append(df, &dictgen.DictFileEntry{
56 | Headword: d.Title,
57 | HeaderInfo: hwi,
58 | Variant: d.Terms,
59 | Definition: d.Definition,
60 | })
61 | }
62 |
63 | fmt.Fprintf(os.Stderr, "Writing dictfile.\n")
64 | switch *output {
65 | case "-":
66 | if err := df.WriteDictFile(os.Stdout); err != nil {
67 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
68 | os.Exit(1)
69 | return
70 | }
71 | default:
72 | f, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
73 | if err != nil {
74 | fmt.Fprintf(os.Stderr, "Error: create dictfile: %v\n", err)
75 | os.Exit(1)
76 | return
77 | }
78 |
79 | if err := df.WriteDictFile(f); err != nil {
80 | f.Close()
81 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
82 | os.Exit(1)
83 | return
84 | }
85 |
86 | if err := f.Close(); err != nil {
87 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
88 | os.Exit(1)
89 | return
90 | }
91 | }
92 |
93 | fmt.Fprintf(os.Stderr, "Successfully converted %d entries from gotdict %s to dictfile %s.\n", len(df), *gotdictp, *output)
94 | os.Exit(0)
95 | }
96 |
--------------------------------------------------------------------------------
/examples/webster1913-convert/main.go:
--------------------------------------------------------------------------------
1 | // Command webster1913-convert converts Project Gutenberg's Webster's 1913
2 | // Unabridged Dictionary to a dictgen dictfile.
3 | package main
4 |
5 | import (
6 | "bytes"
7 | "encoding/json"
8 | "fmt"
9 | "html/template"
10 | "io"
11 | "os"
12 |
13 | "github.com/spf13/pflag"
14 |
15 | "github.com/pgaskin/dictutil/dictgen"
16 | "github.com/pgaskin/dictutil/examples/webster1913-convert/webster1913"
17 | )
18 |
19 | var version = "dev"
20 |
21 | var deftmpl = template.Must(template.New("").Funcs(template.FuncMap{
22 | "spldc": func(s string) []string {
23 | for i, c := range s {
24 | if c == '.' || c == ',' || c == '(' {
25 | return []string{s[:i], s[i:]}
26 | }
27 | }
28 | return []string{"", s}
29 | },
30 | }).Parse(`
31 | {{- with .Etymology}}{{.}}
{{end -}}
32 | {{- with .Meanings}}{{range .}}- {{.Text}}{{with .Example}}
{{.}}{{end}} {{end}}
{{end -}}
33 | {{- with .PhraseDefns}}{{range $n, $v := .}}{{if $n}} {{end}}{{range $x, $y := (spldc $v)}}{{if $x}}{{$y}}{{else}}{{$y}}{{end}}{{end}}{{end}}
{{end -}}
34 | {{- with .Synonyms}}{{range $n, $v := .}}{{if $n}} {{end}}{{$v}}{{end}}
{{end -}}
35 | {{- with .Extra}}{{.}}
{{end -}}
36 | `))
37 |
38 | func main() {
39 | pflag.CommandLine.SortFlags = false
40 | output := pflag.StringP("output", "o", "."+string(os.PathSeparator)+"webster1913.df", "The output filename (will be overwritten if it exists) (- is stdout)")
41 | dump := pflag.Bool("dump", false, "Instead of converting, dump the parsed dictionary to stdout as JSON (for debugging)")
42 | help := pflag.BoolP("help", "h", false, "Show this help text")
43 | pflag.Parse()
44 |
45 | if *help || pflag.NArg() != 1 {
46 | fmt.Fprintf(os.Stderr, "Usage: %s [options] gutenberg_webster1913_path\n\nVersion: webster1913-convert %s\n\nOptions:\n%s\nArguments:\n gutenberg_webster1913_path is the path to Project Gutenberg's Webster's 1913 dictionary. Use - to read from stdin.\n\nTo convert the resulting dictfile into a dictzip, use dictgen.\n", os.Args[0], version, pflag.CommandLine.FlagUsages())
47 | os.Exit(0)
48 | return
49 | }
50 |
51 | fmt.Fprintf(os.Stderr, "Opening input file.\n")
52 | var r io.Reader
53 | switch v := pflag.Args()[0]; v {
54 | case "-":
55 | r = os.Stdin
56 | default:
57 | f, err := os.Open(v)
58 | if err != nil {
59 | fmt.Fprintf(os.Stderr, "Error: open input %#v: %v\n", v, err)
60 | os.Exit(1)
61 | return
62 | }
63 | defer f.Close()
64 | r = f
65 | }
66 |
67 | fmt.Fprintf(os.Stderr, "Parsing dictionary.\n")
68 | wd, err := webster1913.Parse(r, func(i int, word string) {
69 | if i%1000 == 0 {
70 | fmt.Fprintf(os.Stderr, "[% 5d] %s\n", i, word)
71 | }
72 | })
73 | if err != nil {
74 | fmt.Fprintf(os.Stderr, "Error: parse webster1913: %v\n", err)
75 | os.Exit(1)
76 | return
77 | }
78 |
79 | if *dump {
80 | fmt.Fprintf(os.Stderr, "Dumping JSON to stdout.\n")
81 | enc := json.NewEncoder(os.Stdout)
82 | enc.SetIndent("", " ")
83 | enc.Encode(wd)
84 | os.Exit(0)
85 | return
86 | }
87 |
88 | fmt.Fprintf(os.Stderr, "Transforming definitions.\n")
89 | var df dictgen.DictFile
90 | dbuf := bytes.NewBuffer(nil)
91 | for _, d := range wd {
92 | dbuf.Reset()
93 | if err := deftmpl.Execute(dbuf, d); err != nil {
94 | fmt.Fprintf(os.Stderr, "Error: render definition %#v: %v\n", d, err)
95 | os.Exit(1)
96 | return
97 | }
98 | df = append(df, &dictgen.DictFileEntry{
99 | Headword: d.Headword,
100 | Variant: d.Variant,
101 | RawHTML: true,
102 | HeaderInfo: d.Info,
103 | Definition: dbuf.String(),
104 | })
105 | }
106 |
107 | fmt.Fprintf(os.Stderr, "Writing dictfile.\n")
108 | switch *output {
109 | case "-":
110 | if err := df.WriteDictFile(os.Stdout); err != nil {
111 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
112 | os.Exit(1)
113 | return
114 | }
115 | default:
116 | f, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
117 | if err != nil {
118 | fmt.Fprintf(os.Stderr, "Error: create dictfile: %v\n", err)
119 | os.Exit(1)
120 | return
121 | }
122 |
123 | if err := df.WriteDictFile(f); err != nil {
124 | f.Close()
125 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
126 | os.Exit(1)
127 | return
128 | }
129 |
130 | if err := f.Close(); err != nil {
131 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
132 | os.Exit(1)
133 | return
134 | }
135 | }
136 |
137 | fmt.Fprintf(os.Stderr, "Successfully converted %d entries from Webster's 1913 dictionary %#v to dictfile %s.\n", len(df), pflag.Args()[0], *output)
138 | os.Exit(0)
139 | }
140 |
--------------------------------------------------------------------------------
/examples/webster1913-convert/webster1913/parser.go:
--------------------------------------------------------------------------------
1 | // Package webster1913 parses Project Gutenberg's Webster's 1913 Unabridged
2 | // Dictionary (http://www.gutenberg.org/ebooks/29765.txt.utf-8).
3 | package webster1913
4 |
5 | import (
6 | "bufio"
7 | "bytes"
8 | "io"
9 | "regexp"
10 | "runtime/debug"
11 | "strings"
12 | )
13 |
14 | // Dict represents the parsed dictionary.
15 | type Dict []*Entry
16 |
17 | // Entry is a single dictionary entry.
18 | type Entry struct {
19 | Headword string
20 | Variant []string
21 | Info string
22 | Etymology string
23 | Meanings []*EntryMeaning
24 | Synonyms []string
25 | PhraseDefns []string
26 | Extra string // unparseable text
27 | }
28 |
29 | // EntryMeaning is a meaning for a dictionary entry.
30 | type EntryMeaning struct {
31 | Text string
32 | Example string
33 | }
34 |
35 | var (
36 | entryWordRe = regexp.MustCompile(`^[A-Z_ ;-]+$`)
37 | numberedDefnStartRe = regexp.MustCompile(`^[0-9]+\.\s*`)
38 | singleDefnStartRe = regexp.MustCompile(`^Defn:\s+`)
39 | noteStartRe = regexp.MustCompile(`^\s*Note:\s+`)
40 | synStartRe = regexp.MustCompile(`^Syn.\s*$`)
41 | synItemStartRe = regexp.MustCompile(`^\s+--\s+`)
42 | phraseDefnStartRe = regexp.MustCompile(`^\s+--\s+([A-Za-z ]+?[A-Za-z])\s*(\([^)]+\))?[,.]\s*`)
43 | wordInfoFormRe = regexp.MustCompile(`(?:p\. p\.|vb\. n\.|p\. pr\.) +([A-Z][a-z]+)[:;.,]`)
44 | )
45 |
46 | type state int
47 |
48 | const (
49 | // StateNone is before the first entry.
50 | StateNone state = iota
51 | // StateEntryInfo is at the beginning of the entry.
52 | StateEntryInfo
53 | // StateEntryExtra is unclassified text in the entry.
54 | StateEntryExtra
55 | // StateEntryMeaningText is inside an entry's meaning's text.
56 | StateEntryMeaningText
57 | // StateEntryMeaningExample is inside an entry's meaning's example.
58 | StateEntryMeaningExample
59 | // StateEntrySynonym is inside an entry's synonym list.
60 | StateEntrySynonym
61 | // StateEntryPhraseDefn is inside an entry's phrase definition list.
62 | StateEntryPhraseDefn
63 | )
64 |
65 | // Parse parses Project Gutenberg's Webster's Unabridged Dictionary.
66 | func Parse(r io.Reader, progress func(i int, w string)) (Dict, error) {
67 | var wd Dict
68 | var perr error
69 | sc := bufio.NewScanner(r)
70 |
71 | var state state
72 | var entry *Entry
73 | var meaning *EntryMeaning
74 | var i int
75 | for sc.Scan() {
76 | ln := sc.Bytes()
77 | lnt := bytes.TrimSpace(ln)
78 | blankLine := len(lnt) == 0
79 |
80 | if bytes.HasPrefix(lnt, []byte("*** END")) {
81 | break
82 | }
83 |
84 | if entryWordRe.Match(ln) {
85 | if state == StateNone {
86 | // skip the file header (up to the word "A")
87 | if !bytes.Equal(lnt, []byte{'A'}) {
88 | continue
89 | }
90 | }
91 | if bytes.Count(lnt, []byte{'-'}) != len(lnt) {
92 | // ^ if all dashes, it is a false positive
93 | if entry != nil {
94 | progress(len(wd), entry.Headword)
95 | }
96 | spl := strings.Split(string(bytes.ToLower(ln)), ";")
97 | entry = &Entry{Headword: strings.TrimSpace(spl[0])}
98 | if len(spl) > 1 {
99 | for _, v := range spl[1:] {
100 | if w := strings.TrimSpace(v); w != "" {
101 | entry.Variant = append(entry.Variant, w)
102 | }
103 | }
104 | }
105 | meaning = nil
106 | wd = append(wd, entry)
107 | state = StateEntryInfo
108 | continue
109 | }
110 | }
111 |
112 | switch state {
113 | case StateNone:
114 | // ignore any text before the first entry
115 | case StateEntryInfo:
116 | switch {
117 | case blankLine:
118 | for _, m := range wordInfoFormRe.FindAllStringSubmatch(entry.Info, -1) {
119 | entry.Variant = append(entry.Variant, strings.ToLower(m[1]))
120 | }
121 | // attempt to split into etymology
122 | if spl := strings.SplitN(entry.Info, " Etym: ", 2); len(spl) == 2 {
123 | entry.Info = strings.TrimSpace(spl[0])
124 | entry.Etymology = strings.TrimSpace(spl[1])
125 | }
126 | state = StateEntryExtra
127 | default:
128 | entry.Info += " " + string(lnt)
129 | }
130 | case StateEntryExtra:
131 | switch {
132 | case singleDefnStartRe.Match(ln):
133 | meaning = &EntryMeaning{Text: string(singleDefnStartRe.ReplaceAllLiteral(ln, nil))}
134 | entry.Meanings = append(entry.Meanings, meaning)
135 | state = StateEntryMeaningText
136 | case numberedDefnStartRe.Match(ln):
137 | meaning = &EntryMeaning{Text: string(numberedDefnStartRe.ReplaceAllLiteral(ln, nil))}
138 | entry.Meanings = append(entry.Meanings, meaning)
139 | state = StateEntryMeaningText
140 | case phraseDefnStartRe.Match(ln):
141 | meaning = nil
142 | entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1))))
143 | entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1])))
144 | state = StateEntryPhraseDefn
145 | case blankLine:
146 | // ignore
147 | default:
148 | entry.Extra += " " + string(lnt)
149 | }
150 | case StateEntryMeaningText:
151 | switch {
152 | case synStartRe.Match(ln):
153 | meaning = nil
154 | state = StateEntrySynonym
155 | case singleDefnStartRe.Match(ln):
156 | // if it is in any kind of definition (single/numbered), it is part of it.
157 | meaning.Text += " " + string(singleDefnStartRe.ReplaceAllLiteral(lnt, nil))
158 | case numberedDefnStartRe.Match(ln):
159 | meaning = &EntryMeaning{Text: string(numberedDefnStartRe.ReplaceAllLiteral(ln, nil))}
160 | entry.Meanings = append(entry.Meanings, meaning)
161 | state = StateEntryMeaningText
162 | case phraseDefnStartRe.Match(ln):
163 | meaning = nil
164 | entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1))))
165 | entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1])))
166 | state = StateEntryPhraseDefn
167 | case len(meaning.Text) > 5 && len(lnt) < 55 && bytes.HasSuffix(lnt, []byte{'.'}) && !noteStartRe.Match(ln):
168 | // if there is already some body text, it is not a hard-wrapped
169 | // line, and it ends with a period, and is not a note, then it's
170 | // the last line of the text before the example.
171 | meaning.Text += " " + string(lnt)
172 | state = StateEntryMeaningExample
173 | case blankLine:
174 | // ignore
175 | default:
176 | meaning.Text += " " + string(lnt)
177 | }
178 | case StateEntryMeaningExample:
179 | switch {
180 | case synStartRe.Match(ln):
181 | meaning = nil
182 | state = StateEntrySynonym
183 | case singleDefnStartRe.Match(ln):
184 | meaning = &EntryMeaning{Text: string(singleDefnStartRe.ReplaceAllLiteral(ln, nil))}
185 | entry.Meanings = append(entry.Meanings, meaning)
186 | state = StateEntryMeaningText
187 | case numberedDefnStartRe.Match(ln):
188 | meaning = &EntryMeaning{Text: string(numberedDefnStartRe.ReplaceAllLiteral(ln, nil))}
189 | entry.Meanings = append(entry.Meanings, meaning)
190 | state = StateEntryMeaningText
191 | case phraseDefnStartRe.Match(ln):
192 | meaning = nil
193 | entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1))))
194 | entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1])))
195 | state = StateEntryPhraseDefn
196 | case blankLine:
197 | // ignore
198 | default:
199 | if meaning.Example != "" {
200 | meaning.Example += " "
201 | }
202 | meaning.Example += string(lnt)
203 | }
204 | case StateEntrySynonym:
205 | switch {
206 | case blankLine:
207 | state = StateEntryExtra
208 | case synItemStartRe.Match(ln):
209 | entry.Synonyms = append(entry.Synonyms, string(synItemStartRe.ReplaceAllLiteral(ln, nil)))
210 | case len(entry.Synonyms) == 0:
211 | // there was a "Syn." without any valid synonyms under it
212 | state = StateEntryExtra
213 | case phraseDefnStartRe.Match(ln):
214 | meaning = nil
215 | entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1))))
216 | entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1])))
217 | state = StateEntryPhraseDefn
218 | default:
219 | entry.Synonyms[len(entry.Synonyms)-1] += " " + string(lnt)
220 | }
221 | case StateEntryPhraseDefn:
222 | switch {
223 | case phraseDefnStartRe.Match(ln):
224 | meaning = nil
225 | entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1))))
226 | entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1])))
227 | state = StateEntryPhraseDefn
228 | case blankLine:
229 | // allow a blank line to end it for reducing the chance of bugs.
230 | state = StateEntryExtra
231 | default:
232 | // phrase definitions are always last, so no need for checking
233 | // for any other state changes (e.g. the start of a numbered
234 | // definition) (and the previous case should deal with any
235 | // edge-cases).
236 | entry.PhraseDefns[len(entry.PhraseDefns)-1] += " " + string(lnt)
237 | }
238 | }
239 |
240 | if i%10000 == 0 {
241 | debug.FreeOSMemory() // hack to try and limit memory usage
242 | }
243 | i++
244 | }
245 |
246 | if serr := sc.Err(); serr != nil {
247 | return nil, serr
248 | }
249 | if perr != nil {
250 | return nil, perr
251 | }
252 | return wd, nil
253 | }
254 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/pgaskin/dictutil
2 |
3 | go 1.14
4 |
5 | require (
6 | github.com/disintegration/imaging v1.6.2
7 | github.com/mattn/go-sqlite3 v2.0.3+incompatible
8 | github.com/pgaskin/koboutils/v2 v2.1.0
9 | github.com/pmezard/go-difflib v1.0.0 // indirect
10 | github.com/russross/blackfriday/v2 v2.0.1
11 | github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect
12 | github.com/spf13/pflag v1.0.5
13 | gopkg.in/yaml.v2 v2.2.8
14 | )
15 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/disintegration/imaging v1.6.2 h1:w1LecBlG2Lnp8B3jk5zSuNqd7b4DXhcjwek1ei82L+c=
2 | github.com/disintegration/imaging v1.6.2/go.mod h1:44/5580QXChDfwIclfc/PCwrr44amcmDAg8hxG0Ewe4=
3 | github.com/mattn/go-sqlite3 v2.0.3+incompatible h1:gXHsfypPkaMZrKbD5209QV9jbUTJKjyR5WD3HYQSd+U=
4 | github.com/mattn/go-sqlite3 v2.0.3+incompatible/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
5 | github.com/pgaskin/koboutils/v2 v2.1.0 h1:J5KzLWvj0zDvoP5aJ7RxWuzFA32CcnD+hqH6tw/3uRE=
6 | github.com/pgaskin/koboutils/v2 v2.1.0/go.mod h1:wTzkDIlsxmUyfwfspGcm0Ap+HOxSUYV0S8kMYrf+0gM=
7 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
8 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
9 | github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
10 | github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
11 | github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
12 | github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
13 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
14 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
15 | golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 h1:hVwzHzIUGRjiF7EcUjqNxk3NCfkPxbDKRdnNE1Rpg0U=
16 | golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
17 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
18 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
19 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
20 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
21 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
22 |
--------------------------------------------------------------------------------
/kobodict/crypt.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 |
3 | import (
4 | "bytes"
5 | "crypto/aes"
6 | "crypto/cipher"
7 | "fmt"
8 | )
9 |
10 | // Crypter represents a symmetric dictionary encryption method.
11 | type Crypter interface {
12 | Encrypter
13 | Decrypter
14 | }
15 |
16 | // CryptMethodAES represents AES-128-ECB encryption with PKCS#7 padding.
17 | const CryptMethodAES string = "aes"
18 |
19 | // NewCrypter creates the specified type of Crypter with the specified key.
20 | func NewCrypter(method string, key []byte) (Crypter, error) {
21 | switch method {
22 | case CryptMethodAES:
23 | c, err := newCryptAES(key)
24 | return c, err
25 | default:
26 | return nil, fmt.Errorf("unknown encryption method %#v", method)
27 | }
28 | }
29 |
30 | type cryptAES struct {
31 | b cipher.Block
32 | }
33 |
34 | func newCryptAES(key []byte) (*cryptAES, error) {
35 | if b, err := aes.NewCipher(key); err != nil {
36 | return nil, err
37 | } else {
38 | return &cryptAES{b}, nil
39 | }
40 | }
41 |
42 | // Encrypt implements Encrypter.
43 | func (c *cryptAES) Encrypt(buf []byte) ([]byte, error) {
44 | if dst, err := cryptPKCS7Pad(buf, aes.BlockSize); err != nil {
45 | return nil, err
46 | } else if dst, err = cryptAES128ECBEncrypt(c.b, dst); err != nil {
47 | return nil, err
48 | } else {
49 | return dst, nil
50 | }
51 | }
52 |
53 | // Decrypt implements Decrypter.
54 | func (c *cryptAES) Decrypt(buf []byte) ([]byte, error) {
55 | if dst, err := cryptAES128ECBDecrypt(c.b, buf); err != nil {
56 | return nil, err
57 | } else if dst, err := cryptPKCS7Unpad(dst, aes.BlockSize); err != nil {
58 | return nil, err
59 | } else {
60 | return dst, nil
61 | }
62 | }
63 |
64 | func cryptPKCS7Unpad(src []byte, blockSize int) ([]byte, error) {
65 | if blockSize > 0xFF || blockSize < 0x00 {
66 | return nil, fmt.Errorf("block size %d out of bounds", blockSize)
67 | } else if len(src)%blockSize != 0 || len(src) == 0 {
68 | return nil, fmt.Errorf("data length %d is empty or not a multiple of block size %d", len(src), blockSize)
69 | }
70 | plen := int(src[len(src)-1])
71 | if len(src) <= plen {
72 | return nil, fmt.Errorf("invalid padding: padding length %d out of bounds", plen)
73 | }
74 | for _, v := range src[len(src)-plen:] {
75 | if int(v) != plen {
76 | return nil, fmt.Errorf("invalid padding: expected %d, got %d", plen, v)
77 | }
78 | }
79 | return src[:len(src)-plen], nil
80 | }
81 |
82 | func cryptPKCS7Pad(src []byte, blockSize int) ([]byte, error) {
83 | if blockSize > 0xFF || blockSize < 0x00 {
84 | return nil, fmt.Errorf("block size %d out of bounds", blockSize)
85 | }
86 | plen := blockSize - len(src)%blockSize
87 | return append(src, bytes.Repeat([]byte{byte(plen)}, plen)...), nil
88 | }
89 |
90 | func cryptAES128ECBDecrypt(cb cipher.Block, src []byte) ([]byte, error) {
91 | if len(src)%aes.BlockSize != 0 {
92 | return nil, fmt.Errorf("src not a multiple of block size %d", aes.BlockSize)
93 | }
94 | dst := make([]byte, len(src))
95 | for i := aes.BlockSize; i <= len(src); i += aes.BlockSize {
96 | cb.Decrypt(dst[i-aes.BlockSize:i], src[i-aes.BlockSize:i])
97 | }
98 | return dst, nil
99 | }
100 |
101 | func cryptAES128ECBEncrypt(cb cipher.Block, src []byte) ([]byte, error) {
102 | if len(src)%aes.BlockSize != 0 {
103 | return nil, fmt.Errorf("src not a multiple of block size %d", aes.BlockSize)
104 | }
105 | dst := make([]byte, len(src))
106 | for i := aes.BlockSize; i <= len(src); i += aes.BlockSize {
107 | cb.Encrypt(dst[i-aes.BlockSize:i], src[i-aes.BlockSize:i])
108 | }
109 | return dst, nil
110 | }
111 |
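As a rough usage sketch (the key and plaintext below are placeholders, and error handling is minimal), the Crypter returned by NewCrypter round-trips data like this:

```go
package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/pgaskin/dictutil/kobodict"
)

func main() {
	key := make([]byte, 16) // AES-128 key (all zeros here, just for illustration)

	c, err := kobodict.NewCrypter(kobodict.CryptMethodAES, key)
	if err != nil {
		log.Fatal(err)
	}

	plaintext := []byte("<html><w><p>example</p></w></html>")

	enc, err := c.Encrypt(plaintext) // PKCS#7-pads, then encrypts with AES-128-ECB
	if err != nil {
		log.Fatal(err)
	}

	dec, err := c.Decrypt(enc) // decrypts, then strips the padding
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(bytes.Equal(dec, plaintext)) // true
}
```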
--------------------------------------------------------------------------------
/kobodict/crypt_test.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 |
3 | // TODO(v1)
4 |
--------------------------------------------------------------------------------
/kobodict/fs.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 |
3 | import (
4 | "bufio"
5 | "fmt"
6 | "io"
7 | "io/ioutil"
8 | "os"
9 | "path/filepath"
10 | "strings"
11 | "unicode/utf8"
12 | )
13 |
14 | // Unpack is a helper function to unpack the contents of a Reader to a folder
15 | // on-disk. The provided dir must be non-existent. Unpack will not close the
16 | // reader.
17 | func Unpack(r *Reader, dir string) error {
18 | if _, err := os.Stat(dir); !os.IsNotExist(err) {
19 | return fmt.Errorf("dir %#v already exists", dir)
20 | }
21 | if err := os.Mkdir(dir, 0755); err != nil {
22 | return fmt.Errorf("create dir %#v: %w", dir, err)
23 | }
24 | for _, f := range r.File {
25 | if err := unpackFile(dir, f.Open, f.Name); err != nil {
26 | return fmt.Errorf("unpack file %#v: %w", f.Name, err)
27 | }
28 | }
29 | for _, f := range r.Dicthtml {
30 | if err := unpackFile(dir, f.Open, f.Name); err != nil {
31 | return fmt.Errorf("unpack dicthtml %#v (prefix: %s): %w", f.Name, f.Prefix, err)
32 | }
33 | }
34 | if err := ioutil.WriteFile(filepath.Join(dir, "words"), []byte(strings.Join(r.Word, "\n")), 0644); err != nil {
35 | return fmt.Errorf("write words file: %w", err)
36 | }
37 | return nil
38 | }
39 |
40 | func unpackFile(dir string, open func() (io.ReadCloser, error), name string) error {
41 | fr, err := open()
42 | if err != nil {
43 | return fmt.Errorf("read contents: %w", err)
44 | }
45 | defer fr.Close()
46 |
47 | fw, err := os.OpenFile(filepath.Join(dir, name), os.O_CREATE|os.O_WRONLY|os.O_EXCL, 0644)
48 | if err != nil {
49 | return fmt.Errorf("create output file: %w", err)
50 | }
51 | defer fw.Close()
52 |
53 | if _, err := io.Copy(fw, fr); err != nil {
54 | return fmt.Errorf("write output file: %w", err)
55 | }
56 |
57 | if err := fw.Close(); err != nil {
58 | return fmt.Errorf("write output file: %w", err)
59 | }
60 |
61 | return nil
62 | }
63 |
64 | // Pack is a helper function to pack the contents of a folder unpacked using Unpack
65 | // into a Writer. It is assumed that the writer has not been used. The provided
66 | // file will be overwritten if it exists and is a regular file, or created if it
67 | // doesn't exist. Pack will not close the writer.
68 | func Pack(w *Writer, dir string) error {
69 | if fi, err := os.Stat(filepath.Join(dir, "words")); os.IsNotExist(err) || (err == nil && fi.IsDir()) {
70 | return fmt.Errorf("dir %#v is not an unpacked dictzip (no words file)", dir)
71 | }
72 |
73 | fis, err := ioutil.ReadDir(dir) // note: this is sorted
74 | if err != nil {
75 | return fmt.Errorf("read dir %#v: %w", dir, err)
76 | }
77 |
78 | for _, fi := range fis {
79 | switch {
80 | case fi.IsDir():
81 | return fmt.Errorf("invalid dir %#v: dirs are not supported", fi.Name())
82 | case fi.Name() == "words":
83 | continue
84 | case strings.HasSuffix(fi.Name(), ".html"):
85 | if err := func() error {
86 | fr, err := os.OpenFile(filepath.Join(dir, fi.Name()), os.O_RDONLY, 0)
87 | if err != nil {
88 | return fmt.Errorf("open file: %w", err)
89 | }
90 | defer fr.Close()
91 |
92 | tmp := make([]byte, 2)
93 | if _, err := fr.Read(tmp); err != nil {
94 | return fmt.Errorf("read file: %w", err)
95 | } else if tmp[0] == 0x1F && tmp[1] == 0x8B {
96 | return fmt.Errorf("invalid unpacked dicthtml file: already compressed")
97 | } else if _, err := fr.Seek(0, os.SEEK_SET); err != nil {
98 | return fmt.Errorf("read file: %w", err)
99 | }
100 |
101 | fw, err := w.CreateDicthtml(strings.TrimSuffix(fi.Name(), ".html"))
102 | if err != nil {
103 | return fmt.Errorf("create dictzip entry: %w", err)
104 | }
105 |
106 | if _, err := io.Copy(fw, fr); err != nil {
107 | return fmt.Errorf("write file: %w", err)
108 | }
109 |
110 | return nil
111 | }(); err != nil {
112 | return fmt.Errorf("add dicthtml %#v: %w", fi.Name(), err)
113 | }
114 | default:
115 | if err := func() error {
116 | fr, err := os.OpenFile(filepath.Join(dir, fi.Name()), os.O_RDONLY, 0)
117 | if err != nil {
118 | return fmt.Errorf("open file: %w", err)
119 | }
120 | defer fr.Close()
121 |
122 | fw, err := w.CreateFile(strings.TrimSuffix(fi.Name(), ".html"))
123 | if err != nil {
124 | return fmt.Errorf("create dictzip entry: %w", err)
125 | }
126 |
127 | if _, err := io.Copy(fw, fr); err != nil {
128 | return fmt.Errorf("write file: %w", err)
129 | }
130 |
131 | return nil
132 | }(); err != nil {
133 | return fmt.Errorf("add file %#v: %w", fi.Name(), err)
134 | }
135 | }
136 | }
137 |
138 | if err := func() error {
139 | fr, err := os.OpenFile(filepath.Join(dir, "words"), os.O_RDONLY, 0)
140 | if err != nil {
141 | return fmt.Errorf("open words file: %w", err)
142 | }
143 | defer fr.Close()
144 |
145 | sc := bufio.NewScanner(fr)
146 | for sc.Scan() {
147 | if !utf8.Valid(sc.Bytes()) {
148 | return fmt.Errorf("invalid word: %#v", sc.Text())
149 | }
150 | if word := strings.TrimSpace(sc.Text()); len(word) != 0 {
151 | if err := w.AddWord(word); err != nil {
152 | return fmt.Errorf("add word %#v: %s", word, err)
153 | }
154 | }
155 | }
156 | if sc.Err() != nil {
157 | return fmt.Errorf("read words file: %w", err)
158 | }
159 |
160 | return nil
161 | }(); err != nil {
162 | return fmt.Errorf("add words index: %w", err)
163 | }
164 |
165 | return nil
166 | }
167 |
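A rough sketch of using these helpers to unpack a dictzip (the path is a placeholder; the blank marisa import is needed so NewReader can decode the words index):

```go
package main

import (
	"log"
	"os"

	"github.com/pgaskin/dictutil/kobodict"
	_ "github.com/pgaskin/dictutil/kobodict/marisa" // provides kobodict.Marisa
)

func main() {
	f, err := os.Open("dicthtml-example.zip") // placeholder path
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		log.Fatal(err)
	}

	r, err := kobodict.NewReader(f, fi.Size())
	if err != nil {
		log.Fatal(err)
	}

	// The target directory must not exist yet; Unpack creates it.
	if err := kobodict.Unpack(r, "./unpacked"); err != nil {
		log.Fatal(err)
	}
}
```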
--------------------------------------------------------------------------------
/kobodict/fs_test.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 |
3 | // TODO(v1)
4 |
--------------------------------------------------------------------------------
/kobodict/marisa.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 |
3 | import "io"
4 |
5 | // Marisa is used by Reader and Writer for reading/writing Marisa tries. It is
6 | // automatically set on supported platforms if
7 | // github.com/pgaskin/dictutil/kobodict/marisa is imported, but can be
8 | // overridden manually.
9 | var Marisa interface {
10 | MarisaReader
11 | MarisaWriter
12 | }
13 |
14 | // MarisaReader represents a simplified abstraction for reading Marisa tries.
15 | type MarisaReader interface {
16 | ReadAll(io.Reader) ([]string, error)
17 | }
18 |
19 | // MarisaWriter represents a simplified abstraction for writing Marisa tries.
20 | type MarisaWriter interface {
21 | WriteAll(io.Writer, []string) error
22 | }
23 |
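Most callers just blank-import github.com/pgaskin/dictutil/kobodict/marisa, but the variable can also be set manually; a minimal sketch with a stand-in implementation (illustrative only, a real one must read and write actual marisa trie data):

```go
package main

import (
	"io"

	"github.com/pgaskin/dictutil/kobodict"
)

// fakeMarisa is a stand-in trie implementation for illustration only.
type fakeMarisa struct{ words []string }

func (f *fakeMarisa) ReadAll(r io.Reader) ([]string, error)   { return f.words, nil }
func (f *fakeMarisa) WriteAll(w io.Writer, wd []string) error { f.words = wd; return nil }

func main() {
	kobodict.Marisa = &fakeMarisa{}
	// kobodict's Reader and Writer will now use fakeMarisa for the words index.
}
```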
--------------------------------------------------------------------------------
/kobodict/marisa/marisa.go:
--------------------------------------------------------------------------------
1 | // Package marisa is imported with _ to enable marisa for the kobodict, if
2 | // supported. It is in a separate package so functions in kobodict which don't
3 | // require marisa can be used without compiling it. As an alternative to
4 | // importing this package, you can provide your own implementation of marisa in
5 | // kobodict.Marisa. If imported, this package will fail to compile unless marisa
6 | // is available for your GOOS/GOARCH.
7 | package marisa
8 |
9 | import "github.com/pgaskin/dictutil/kobodict"
10 |
11 | // This is done so it can still be instantiated even if not implemented for the
12 | // current platform (it will be caught when assigning it to kobodict.Marisa). The
13 | // type is named platform for better error messages.
14 |
15 | type platform struct{}
16 |
17 | func init() {
18 | kobodict.Marisa = new(platform) // platform-specific implementation
19 | }
20 |
--------------------------------------------------------------------------------
/kobodict/marisa/marisa_cgo.go:
--------------------------------------------------------------------------------
1 | //+build cgo
2 |
3 | package marisa
4 |
5 | import (
6 | "io"
7 |
8 | "github.com/pgaskin/dictutil/marisa"
9 | )
10 |
11 | func (*platform) ReadAll(r io.Reader) (wd []string, err error) {
12 | return marisa.ReadAll(r)
13 | }
14 |
15 | func (*platform) WriteAll(w io.Writer, wd []string) (err error) {
16 | return marisa.WriteAll(w, wd)
17 | }
18 |
--------------------------------------------------------------------------------
/kobodict/marisa/marisa_test.go:
--------------------------------------------------------------------------------
1 | package marisa
2 |
3 | import (
4 | "bytes"
5 | "crypto/sha1"
6 | "encoding/hex"
7 | "io"
8 | "reflect"
9 | "runtime"
10 | "testing"
11 |
12 | "github.com/pgaskin/dictutil/kobodict"
13 | )
14 |
15 | func TestMarisa(t *testing.T) {
16 | impl, ok := (interface{})(new(platform)).(interface {
17 | kobodict.MarisaReader
18 | kobodict.MarisaWriter
19 | })
20 | if !ok {
21 | t.Skipf("warning: Marisa not supported on platform GOOS=%s GOARCH=%s and must be provided externally", runtime.GOOS, runtime.GOARCH)
22 | }
23 |
24 | w := []string{
25 | "asd",
26 | "dfg",
27 | "sdf",
28 | }
29 |
30 | buf := bytes.NewBuffer(nil)
31 | if err := impl.WriteAll(buf, w); err != nil {
32 | t.Fatalf("unexpected error when writing trie: %v", err)
33 | } else if buf.Len() == 0 {
34 | t.Errorf("written trie is empty")
35 | }
36 |
37 | ss := sha1.New()
38 |
39 | nw, err := impl.ReadAll(io.TeeReader(buf, ss))
40 | if err != nil {
41 | t.Fatalf("unexpected error when reading written trie: %v", err)
42 | } else if len(nw) == 0 {
43 | t.Errorf("read trie is empty")
44 | } else if !reflect.DeepEqual(nw, w) {
45 | t.Errorf("read trie: expected %+s, got %+s", w, nw)
46 | }
47 |
48 | if runtime.GOARCH == "amd64" {
49 | if x, y := hex.EncodeToString(ss.Sum(nil)), "ea7252fc4e86585dea884e4bcb5ce7be90676474"; x != y {
50 | t.Errorf("trie output is incorrect or non-deterministic, expected sha1 %s, got %s", y, x)
51 | }
52 | } else {
53 | t.Logf("skipping sha1 check on non-amd64 architecture, as the correct file differs slightly on each one (usually by ~4 bytes)")
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/kobodict/reader.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 |
3 | import (
4 | "archive/zip"
5 | "bytes"
6 | "compress/gzip"
7 | "fmt"
8 | "io"
9 | "io/ioutil"
10 | "strings"
11 | )
12 |
13 | // Reader provides access to the contents of a dictzip file.
14 | type Reader struct {
15 | Word []string
16 | Dicthtml []*ReaderDicthtml
17 | File []*ReaderFile
18 | z *zip.Reader
19 | d Decrypter
20 | }
21 |
22 | // ReaderDicthtml represents a dicthtml file from a Reader.
23 | type ReaderDicthtml struct {
24 | Name string
25 | Prefix string
26 | f *zip.File
27 | r *Reader
28 | }
29 |
30 | // ReaderFile represents a raw file from a Reader (e.g. images).
31 | type ReaderFile struct {
32 | Name string
33 | f *zip.File
34 | r *Reader
35 | }
36 |
37 | // Decrypter decrypts dicthtml files.
38 | type Decrypter interface {
39 | // Decrypt decrypts the dicthtml bytes. It will only be called if the
40 | // dicthtml is not otherwise readable. An error should be returned if the
41 | // decryption itself encounters an error; the decryptor should not try to
42 | // judge if the resulting bytes are valid.
43 | Decrypt([]byte) ([]byte, error)
44 | }
45 |
46 | // NewReader returns a new dictzip reader which reads from r, with the given
47 | // file size.
48 | func NewReader(r io.ReaderAt, size int64) (*Reader, error) {
49 | zr, err := zip.NewReader(r, size)
50 | if err != nil {
51 | return nil, fmt.Errorf("open zip: %w", err)
52 | }
53 |
54 | kr := &Reader{
55 | z: zr,
56 | }
57 |
58 | var found bool
59 | for _, zf := range zr.File {
60 | if zf.Name == "words" {
61 | if fr, err := zf.Open(); err != nil {
62 | return nil, fmt.Errorf("open words index: %w", err)
63 | } else if Marisa == nil {
64 | return nil, fmt.Errorf("no marisa bindings found")
65 | } else if kr.Word, err = Marisa.ReadAll(fr); err != nil {
66 | return nil, fmt.Errorf("read words index: %w", err)
67 | }
68 | found = true
69 | break
70 | }
71 | }
72 | if !found {
73 | return nil, fmt.Errorf("not a dictzip: no words index found")
74 | }
75 |
76 | for _, f := range zr.File {
77 | switch {
78 | case !f.Mode().IsRegular():
79 | continue
80 | case f.Name == "words":
81 | continue
82 | case strings.Contains(f.Name, "/"):
83 | return nil, fmt.Errorf("read zip: illegal file %#v: contains slash (not in root dir)", f.Name)
84 | case strings.HasSuffix(f.Name, ".html"):
85 | kr.Dicthtml = append(kr.Dicthtml, &ReaderDicthtml{
86 | Name: f.Name,
87 | Prefix: strings.TrimSuffix(f.Name, ".html"),
88 | f: f,
89 | r: kr,
90 | })
91 | default:
92 | kr.File = append(kr.File, &ReaderFile{
93 | Name: f.Name,
94 | f: f,
95 | r: kr,
96 | })
97 | }
98 | }
99 |
100 | return kr, nil
101 | }
102 |
103 | // SetDecrypter sets the Decrypter used to decrypt encrypted dicthtml files.
104 | func (r *Reader) SetDecrypter(d Decrypter) {
105 | r.d = d
106 | }
107 |
108 | // Open returns an io.ReadCloser which reads the decoded dicthtml file. Multiple
109 | // files can be read at once.
110 | func (f *ReaderDicthtml) Open() (io.ReadCloser, error) {
111 | enc, err := func() (bool, error) {
112 | fr, err := f.f.Open()
113 | if err != nil {
114 | return false, fmt.Errorf("open zip entry: %v", err)
115 | }
116 | defer fr.Close()
117 |
118 | tmp := make([]byte, 2)
119 | if n, err := fr.Read(tmp); err != nil {
120 | return false, fmt.Errorf("read zip entry: %v", err)
121 | } else if n != len(tmp) {
122 | return false, fmt.Errorf("corrupt dicthtml: too short (%d)", n)
123 | }
124 |
125 | if tmp[0] == 0x1F && tmp[1] == 0x8B {
126 | return false, nil
127 | }
128 |
129 | if f.r.d == nil {
130 | return true, fmt.Errorf("corrupt or encrypted dicthtml: invalid header")
131 | }
132 |
133 | // maybe optimize this later?
134 | if buf, err := ioutil.ReadAll(io.MultiReader(bytes.NewReader(tmp), fr)); err != nil {
135 | return true, fmt.Errorf("read zip entry: %v", err)
136 | } else if dec, err := f.r.d.Decrypt(buf); err != nil {
137 | return true, fmt.Errorf("decrypt dicthtml: %v", err)
138 | } else if dec[0] != 0x1F || dec[1] != 0x8B {
139 | return true, fmt.Errorf("corrupt dicthtml or invalid encryption key: invalid header")
140 | }
141 | return true, nil
142 | }()
143 | if err != nil {
144 | return nil, err
145 | }
146 |
147 | fr, err := f.f.Open()
148 | if err != nil {
149 | return nil, fmt.Errorf("open zip entry: %v", err)
150 | }
151 |
152 | var dr io.Reader
153 | if enc {
154 | if buf, err := ioutil.ReadAll(fr); err != nil {
155 | return nil, fmt.Errorf("read zip entry: %v", err)
156 | } else if dec, err := f.r.d.Decrypt(buf); err != nil {
157 | return nil, fmt.Errorf("decrypt dicthtml: %v", err)
158 | } else if dec[0] != 0x1F || dec[1] != 0x8B {
159 | return nil, fmt.Errorf("corrupt dicthtml or invalid encryption key: invalid header")
160 | } else {
161 | dr = bytes.NewReader(dec)
162 | }
163 | } else {
164 | dr = fr
165 | }
166 |
167 | zr, err := gzip.NewReader(dr)
168 | if err != nil {
169 | return nil, fmt.Errorf("decompress dicthtml: %v", err)
170 | }
171 |
172 | return &funcReadCloser{
173 | Reader: zr,
174 | Closer: func() error {
175 | if err := zr.Close(); err != nil {
176 | fr.Close()
177 | return err
178 | }
179 | return fr.Close()
180 | },
181 | }, nil
182 | }
183 |
184 | // Open returns an io.ReadCloser which reads the contents of the file. Multiple
185 | // files can be read at once.
186 | func (f *ReaderFile) Open() (io.ReadCloser, error) {
187 | return f.f.Open()
188 | }
189 |
190 | type funcReadCloser struct {
191 | io.Reader
192 | Closer func() error
193 | }
194 |
195 | func (f *funcReadCloser) Close() error {
196 | if f.Closer != nil {
197 | return f.Closer()
198 | }
199 | return nil
200 | }
201 |
--------------------------------------------------------------------------------
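A rough usage sketch for the reader follows (the file name is a placeholder and error handling is reduced to panics); the blank import provides kobodict.Marisa on supported platforms.

package main

import (
	"fmt"
	"io/ioutil"
	"os"

	"github.com/pgaskin/dictutil/kobodict"
	_ "github.com/pgaskin/dictutil/kobodict/marisa" // provides kobodict.Marisa
)

func main() {
	f, err := os.Open("dicthtml-xx.zip") // hypothetical path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		panic(err)
	}

	r, err := kobodict.NewReader(f, fi.Size())
	if err != nil {
		panic(err)
	}
	fmt.Printf("%d words, %d dicthtml shards, %d extra files\n", len(r.Word), len(r.Dicthtml), len(r.File))

	for _, d := range r.Dicthtml {
		rc, err := d.Open() // transparently gunzips (and decrypts if a Decrypter is set)
		if err != nil {
			panic(err)
		}
		buf, err := ioutil.ReadAll(rc)
		rc.Close()
		if err != nil {
			panic(err)
		}
		fmt.Printf("%s (prefix %q): %d bytes of HTML\n", d.Name, d.Prefix, len(buf))
	}
}
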
/kobodict/reader_test.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 |
3 | // TODO(v1)
4 |
--------------------------------------------------------------------------------
/kobodict/util.go:
--------------------------------------------------------------------------------
1 | // Package kobodict implements reading, writing, and other utilities for Kobo
2 | // dictionaries (v2).
3 | //
4 | // A marisa implementation must be provided by importing
5 | // github.com/pgaskin/dictutil/kobodict/marisa, or by setting a custom one, if
6 | // Writer or Reader is used.
7 | package kobodict
8 |
9 | import (
10 | "strings"
11 | "unicode"
12 | )
13 |
14 | // NormalizeWordReference normalizes a word for use in a dicthtml headword
15 | // (<a name="...">) or variant (<variant name="...">) so Kobo can match it
16 | // when looking up a word: spaces are trimmed from both ends, and if the word
17 | // is a variant, it is also lowercased (this includes accented characters).
18 | func NormalizeWordReference(w string, variant bool) string {
19 | w = strings.TrimSpace(w) // trim leading and trailing whitespace
20 | if variant {
21 | w = strings.ToLower(w) // variants are matched case-insensitively
22 | }
23 | return w
24 | }
25 | 
26 | // WordPrefix gets the prefix of a word for sharding dicthtml files. It is a
27 | // simplified version of wordPrefix below and is intended to behave exactly
28 | // like it (the tests check both against the same cases).
29 | //
30 | // This is not to be used with Kanji, as those are handled by a separate
31 | // function for Japanese dictionaries.
32 | //
33 | // The logic is reversed from DictionaryParser::htmlForWord in libnickel: the
34 | // word is lowercased and limited to at most two characters (stopping at a
35 | // null byte to match C string handling) and surrounding spaces are trimmed;
36 | // an empty prefix becomes "11"; and unless the prefix starts with a Cyrillic
37 | // character, it is padded to two characters with 'a' and replaced with "11"
38 | // if either of the first two characters is not a letter (wordPrefix is kept
39 | // below as a direct port of the original for comparison).
40 | func WordPrefix(word string) string {
41 | pfx := []rune(word) // work on runes so multi-byte characters count as one
42 | 
43 | // Lowercase and cut the prefix to at most two characters in a single pass,
44 | // stopping early at a null byte: a null byte is a valid rune in Go, but it
45 | // terminates the string in the C code this logic is ported from, so it has
46 | // to be treated the same way here.
47 | for i, c := range pfx {
48 | if i >= 2 || c == '\x00' { // limit to 2 chars, also cut at null
49 | pfx = pfx[:i] // trim up to current char
50 | break
51 | }
52 | pfx[i] = unicode.ToLower(c) // this includes accented chars
53 | }
54 |
55 | for len(pfx) != 0 {
56 | if unicode.IsSpace(pfx[0]) {
57 | pfx = pfx[1:] // trim left space
58 | } else {
59 | break
60 | }
61 | }
62 |
63 | for len(pfx) != 0 {
64 | if unicode.IsSpace(pfx[len(pfx)-1]) {
65 | pfx = pfx[:len(pfx)-1] // trim right space
66 | } else {
67 | break
68 | }
69 | }
70 |
71 | if len(pfx) == 0 {
72 | return "11" // if empty, return "11"
73 | }
74 |
75 | if !unicode.Is(unicode.Cyrillic, pfx[0]) {
76 | for len(pfx) < 2 {
77 | pfx = append(pfx, 'a') // pad right with 'a's to 2 chars
78 | }
79 | if !unicode.IsLetter(pfx[0]) || !unicode.IsLetter(pfx[1]) {
80 | return "11" // if either of the first 2 chars is not a letter, return "11"
81 | }
82 | }
83 |
84 | return string(pfx)
85 | }
86 |
87 | // wordPrefix gets the prefix of a word for sharding dicthtml files.
88 | //
89 | // This is not to be used with Kanji, as those are handled by a separate
90 | // function for Japanese dictionaries.
91 | //
92 | // The logic is reversed from DictionaryParser::htmlForWord in libnickel. It
93 | // matches it as closely as possible.
94 | func wordPrefix(w string) string {
95 | // w
96 | // QString::toLower()
97 | w = strings.ToLower(w)
98 |
99 | // QString::leftRef(2)
100 | if len(w) > 2 {
101 | w = string([]rune(w)[:2])
102 | }
103 |
104 | // QString::trimmed()
105 | w = strings.TrimSpace(w)
106 |
107 | // simplify the following code by converting to rune slice
108 | r := []rune(w)
109 |
110 | // A null byte is a valid Unicode character, but in C, it's treated as
111 | // the end of a string. To keep compatibility with libnickel, we need to
112 | // end a string there if necessary.
113 | for i, c := range r {
114 | if c == '\x00' {
115 | r = r[:i]
116 | break
117 | }
118 | }
119 |
120 | // DictionaryParser::isCyrillic(w[0])
121 | // skip if true
122 | if !(len(r) != 0 && unicode.Is(unicode.Cyrillic, r[0])) {
123 | // add an 'a' for right padding if not 2 chars
124 | if len(r) != 2 {
125 | r = append(r, 'a')
126 | }
127 | }
128 |
129 | // DictionaryParser::isCyrillic(w[0])
130 | // skip if != false
131 | switch {
132 | case !(len(r) != 0 && unicode.Is(unicode.Cyrillic, r[0])):
133 | // inlined QChar::isLetter(w[0]), QChar::isLetter(w[1]), unnecessary length check
134 | // skip if both true
135 | if (len(r) >= 1 && unicode.IsLetter(r[0])) && (len(r) >= 2 && unicode.IsLetter(r[1])) {
136 | break
137 | }
138 | fallthrough
139 | case len(r) == 0:
140 | // w = QString::fromLatin1_helper("11"..., 2)
141 | return "11"
142 | }
143 |
144 | return string(r)
145 | }
146 |
--------------------------------------------------------------------------------
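As a small sketch of how the exported helpers fit together (the words are borrowed from the test cases in util_test.go below), the shard a headword lives in and the normalized forms used to reference it can be computed like this:

package main

import (
	"fmt"

	"github.com/pgaskin/dictutil/kobodict"
)

func main() {
	for _, w := range []string{"test", "ébahir", "GB", " Ȃsd \n"} {
		// The dicthtml shard inside the dictzip is named after the prefix, e.g. "te.html".
		fmt.Printf("%q -> shard %q, headword %q, variant %q\n",
			w,
			kobodict.WordPrefix(w)+".html",
			kobodict.NormalizeWordReference(w, false),
			kobodict.NormalizeWordReference(w, true))
	}
}
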
/kobodict/util_test.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 |
3 | import (
4 | "strconv"
5 | "testing"
6 | )
7 |
8 | func TestNormalizeWordReference(t *testing.T) {
9 | for _, tc := range []struct {
10 | v bool
11 | i, o string
12 | }{
13 | {true, "Asd", "asd"},
14 | {false, "Asd", "Asd"},
15 | {true, " Asd", "asd"},
16 | {false, " Asd", "Asd"},
17 | {true, " Asd ", "asd"},
18 | {false, " Asd ", "Asd"},
19 | {true, " Asd \n", "asd"},
20 | {false, " Asd \n", "Asd"},
21 | {true, " Ȃsd \n", "ȃsd"},
22 | {false, " Ȃsd \n", "Ȃsd"},
23 | } {
24 | t.Logf("word %#v [variant:%t] (%#v)", tc.i, tc.v, tc.o)
25 | if o := NormalizeWordReference(tc.i, tc.v); o != tc.o {
26 | t.Errorf(" got %#v", o)
27 | }
28 | }
29 | }
30 |
31 | var tcs = []struct{ w, p string }{
32 | // dicthtml-en
33 | {"test", "te"},
34 | {"a-", "11"},
35 | {"-an", "11"},
36 | {"GB", "gb"},
37 |
38 | // dicthtml-fr
39 | {"ébahir", "éb"},
40 | {"à", "àa"},
41 | {"a1", "11"},
42 | {"ô", "ôa"},
43 | {"kébab", "ké"},
44 | {"aérer", "aé"},
45 | {"living-room", "li"},
46 |
47 | // dicthtml-ja
48 | // Note, Kanji not currently implemented, so not testing (note, the logic
49 | // is in a separate function, anyways).
50 | // {"あ", "あ"},
51 | // {"アークとう", "アー"},
52 |
53 | // generated by dictword-test: spaces
54 | {" x", "xa"},
55 | {" ", "11"},
56 | {"x ", "xa"},
57 | {" ", "11"},
58 | {" ", "11"},
59 | {"\t\t", "11"},
60 | {"\t\f\t", "11"},
61 | {"x ", "xa"},
62 | {" xx", "xa"},
63 |
64 | // generated by dictword-test: spaces where trim/prefix order matters
65 | {" x", "11"},
66 | {" xy", "11"},
67 | {" xyz", "11"},
68 | {"x z", "xa"},
69 | {"x z", "xa"},
70 |
71 | // generated by dictword-test: cyrillic
72 | {" д", "д"},
73 | {"д ", "д"},
74 | {" ", "11"},
75 | {" ", "11"},
76 | {" ", "11"},
77 | {" дд", "д"},
78 | {"д ", "д"},
79 | {"д", "д"},
80 | {"aд", "aд"},
81 | {"дa", "дa"},
82 | {"aдa", "aд"},
83 | {"дaд", "дa"},
84 |
85 | // generated by dictword-test: uppercase accented letters
86 | {"Ȅe", "ȅe"},
87 | {"eȄ", "eȅ"},
88 | {"Ȅ", "ȅa"},
89 | {"Ȅ!", "11"},
90 |
91 | // generated by dictword-test: cjk
92 | {" 未", "未a"},
93 | {" 未", "11"},
94 | {"未", "未a"},
95 | {"未未", "未未"},
96 | {"x未", "x未"},
97 | {"未x", "未x"},
98 | {"xy未", "xy"},
99 | {"还没", "还没"},
100 |
101 | // generated by dictword-test: misc
102 | {"!", "11"},
103 | {"!!", "11"},
104 | {"!!!", "11"},
105 | {"x!", "11"},
106 | {"x!!", "11"},
107 | {"xx!", "xx"},
108 | {"xxx!", "xx"},
109 | {" !", "11"},
110 | {" !!", "11"},
111 | {" !!!", "11"},
112 | {" !", "11"},
113 | {" !!", "11"},
114 | {" !!!", "11"},
115 | {" x!", "xa"},
116 | {" x!!", "xa"},
117 | {" xx!", "xa"},
118 | {" xxx!", "xa"},
119 |
120 | // synthetic
121 | {"x\x00y", "xa"},
122 | {"\x00xy", "11"},
123 | }
124 |
125 | func TestWordPrefix(t *testing.T) {
126 | for _, tc := range tcs {
127 | t.Logf("word %#v (%#v)", tc.w, tc.p)
128 | if p := wordPrefix(tc.w); p != tc.p {
129 | t.Errorf(" got (original version) %#v", p)
130 | }
131 | if p := WordPrefix(tc.w); p != tc.p {
132 | t.Errorf(" got (simplified version) %#v", p)
133 | }
134 | }
135 | }
136 |
137 | func BenchmarkWordPrefix(b *testing.B) {
138 | for _, tcf := range []struct {
139 | n string
140 | fn func(string) string
141 | }{
142 | {"Orig/", wordPrefix},
143 | {"Smpl/", WordPrefix},
144 | } {
145 | // all test cases
146 | b.Run(tcf.n+"All"+strconv.Itoa(len(tcs)), func(b *testing.B) {
147 | for i := 0; i < b.N; i++ {
148 | for _, tc := range tcs {
149 | tcf.fn(tc.w)
150 | }
151 | }
152 | })
153 |
154 | // near-worst possible case
155 | b.Run(tcf.n+"Worst", func(b *testing.B) {
156 | for i := 0; i < b.N; i++ {
157 | tcf.fn(" 还д 没")
158 | }
159 | })
160 |
161 | // normal case
162 | b.Run(tcf.n+"Normal", func(b *testing.B) {
163 | for i := 0; i < b.N; i++ {
164 | tcf.fn("Test")
165 | }
166 | })
167 |
168 | // best case
169 | b.Run(tcf.n+"Best", func(b *testing.B) {
170 | for i := 0; i < b.N; i++ {
171 | tcf.fn("aa")
172 | }
173 | })
174 | }
175 | }
176 |
--------------------------------------------------------------------------------
/kobodict/writer.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 |
3 | import (
4 | "archive/zip"
5 | "bytes"
6 | "compress/gzip"
7 | "fmt"
8 | "io"
9 | "sort"
10 | "strings"
11 | )
12 |
13 | // Writer creates dictzips. It does not do any validation; it only does what it
14 | // is told. It is up to the user to ensure the input is valid.
15 | type Writer struct {
16 | z *zip.Writer
17 | e Encrypter
18 | words map[string]struct{} // doesn't take up space for values
19 | used map[string]struct{}
20 | closed bool
21 | last io.WriteCloser
22 | }
23 |
24 | // Encrypter encrypts dicthtml files.
25 | type Encrypter interface {
26 | // Encrypt encrypts the provided bytes.
27 | Encrypt([]byte) ([]byte, error)
28 | }
29 |
30 | // NewWriter creates a dictzip writer writing to w.
31 | func NewWriter(w io.Writer) *Writer {
32 | return &Writer{
33 | z: zip.NewWriter(w),
34 | words: map[string]struct{}{},
35 | used: map[string]struct{}{},
36 | }
37 | }
38 |
39 | // AddWord normalizes and adds a word to the index. If the word has already been
40 | // added, it does nothing.
41 | func (w *Writer) AddWord(word string) error {
42 | if w.closed {
43 | return fmt.Errorf("write to closed writer")
44 | }
45 | w.words[strings.TrimSpace(word)] = struct{}{} // index words aren't normalized except for trimming spaces
46 | return nil
47 | }
48 |
49 | // CreateDicthtml adds a dicthtml file for the specified prefix and returns a
50 | // writer which is valid until the next file is created.
51 | func (w *Writer) CreateDicthtml(prefix string) (io.Writer, error) {
52 | if strings.Contains(prefix, "/") {
53 | return nil, fmt.Errorf("invalid prefix: must not contain slashes")
54 | }
55 | if w.closed {
56 | return nil, fmt.Errorf("writer already closed")
57 | }
58 | if w.last != nil {
59 | if err := w.last.Close(); err != nil {
60 | return nil, fmt.Errorf("close last file writer: %w", err)
61 | }
62 | w.last = nil
63 | }
64 |
65 | filename := prefix + ".html"
66 | if _, ok := w.used[filename]; ok {
67 | return nil, fmt.Errorf("file %#v already exists in dictzip", filename)
68 | }
69 |
70 | fw, err := w.z.Create(filename)
71 | if err != nil {
72 | return nil, fmt.Errorf("create zip entry: %w", err)
73 | }
74 |
75 | if w.e != nil {
76 | ew := newEncryptWriter(w.e, fw)
77 | zw := gzip.NewWriter(ew)
78 |
79 | w.last = &funcWriteCloser{
80 | Writer: zw,
81 | Closer: func() error {
82 | if err := zw.Close(); err != nil {
83 | return err
84 | }
85 | return ew.Close()
86 | },
87 | }
88 | } else {
89 | w.last = gzip.NewWriter(fw)
90 | }
91 |
92 | w.used[filename] = struct{}{}
93 | return w.last, nil
94 | }
95 |
96 | // CreateFile adds a raw file with the specified name. Note that Kobo only
97 | // supports GIF and JPEG files starting with the "GIF" and "JFIF" magic, and the
98 | // treatment of other files is undefined. In addition, subdirectories are not
99 | // supported. The behaviour is undefined if a dicthtml file is added this way.
100 | func (w *Writer) CreateFile(filename string) (io.Writer, error) {
101 | if strings.Contains(filename, "/") || strings.Contains(filename, "\\") {
102 | return nil, fmt.Errorf("invalid filename: must not contain slashes")
103 | } else if strings.Contains(filename, "words") {
104 | return nil, fmt.Errorf("invalid filename: must not contain 'words'")
105 | } else if _, ok := w.used[filename]; ok {
106 | return nil, fmt.Errorf("file %#v already exists in dictzip", filename)
107 | }
108 | if w.last != nil {
109 | if err := w.last.Close(); err != nil {
110 | return nil, fmt.Errorf("close last file writer: %w", err)
111 | }
112 | w.last = nil
113 | }
114 |
115 | fw, err := w.z.Create(filename)
116 | if err != nil {
117 | return nil, fmt.Errorf("create zip entry: %w", err)
118 | }
119 |
120 | w.last = &funcWriteCloser{
121 | Writer: fw,
122 | Closer: nil,
123 | }
124 | w.used[filename] = struct{}{}
125 | return w.last, nil
126 | }
127 |
128 | // Exists checks if a file already exists in the dictzip with the specified name.
129 | func (w *Writer) Exists(fn string) bool {
130 | _, ok := w.used[fn]
131 | return ok
132 | }
133 |
134 | // Close writes the marisa index and the zip footer. The error should not be
135 | // ignored. It does not close the underlying writer.
136 | func (w *Writer) Close() error {
137 | if w.closed {
138 | return fmt.Errorf("writer already closed")
139 | }
140 | if w.last != nil {
141 | if err := w.last.Close(); err != nil {
142 | return fmt.Errorf("close last file writer: %w", err)
143 | }
144 | w.last = nil
145 | }
146 |
147 | var words []string
148 | for word := range w.words {
149 | words = append(words, word)
150 | }
151 | sort.Strings(words)
152 |
153 | if fw, err := w.z.Create("words"); err != nil {
154 | return fmt.Errorf("create index zip entry: %w", err)
155 | } else if Marisa == nil {
156 | return fmt.Errorf("no marisa bindings found")
157 | } else if err := Marisa.WriteAll(fw, words); err != nil {
158 | return fmt.Errorf("write index: %w", err)
159 | }
160 |
161 | if err := w.z.Close(); err != nil {
162 | return fmt.Errorf("close zip: %w", err)
163 | }
164 | return nil
165 | }
166 |
167 | // SetEncrypter sets the Encrypter used to encrypt dicthtml files. It only
168 | // applies to dicthtml files added after the encrypter is set.
169 | func (w *Writer) SetEncrypter(e Encrypter) {
170 | w.e = e
171 | }
172 |
173 | type encryptWriter struct {
174 | e Encrypter
175 | w io.Writer
176 | b *bytes.Buffer
177 | c bool
178 | }
179 |
180 | func newEncryptWriter(e Encrypter, w io.Writer) io.WriteCloser {
181 | return &encryptWriter{
182 | e: e,
183 | w: w,
184 | b: bytes.NewBuffer(nil),
185 | c: false,
186 | }
187 | }
188 |
189 | func (e *encryptWriter) Write(buf []byte) (n int, err error) {
190 | if e.c {
191 | return 0, fmt.Errorf("write to closed writer")
192 | }
193 | return e.b.Write(buf)
194 | }
195 | 
196 | // Close encrypts the buffered bytes, writes them to the underlying writer, and marks the writer as closed. The error should be checked.
197 | func (e *encryptWriter) Close() error {
198 | if e.c {
199 | return fmt.Errorf("writer already closed")
200 | }
201 | e.c = true // reject any further writes or closes (requires the pointer receiver)
202 | if buf, err := e.e.Encrypt(e.b.Bytes()); err != nil {
203 | return fmt.Errorf("encrypt bytes: %w", err)
204 | } else if _, err := e.w.Write(buf); err != nil {
205 | return fmt.Errorf("write encrypted bytes: %w", err)
206 | }
207 | return nil
208 | }
209 |
210 | type funcWriteCloser struct {
211 | io.Writer
212 | Closer func() error
213 | }
214 |
215 | func (f *funcWriteCloser) Close() error {
216 | if f.Closer != nil {
217 | return f.Closer()
218 | }
219 | return nil
220 | }
221 |
--------------------------------------------------------------------------------
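A condensed sketch of writing a one-word dictzip follows. The output path and the dicthtml markup are illustrative placeholders (see the dicthtml format documentation for the real entry structure); the blank import provides kobodict.Marisa for the words index written by Close.

package main

import (
	"io"
	"os"

	"github.com/pgaskin/dictutil/kobodict"
	_ "github.com/pgaskin/dictutil/kobodict/marisa" // provides kobodict.Marisa
)

func main() {
	f, err := os.Create("dicthtml-xx.zip") // hypothetical output path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	w := kobodict.NewWriter(f)

	// Add the headword to the index and write the shard it belongs to.
	if err := w.AddWord("test"); err != nil {
		panic(err)
	}
	dw, err := w.CreateDicthtml(kobodict.WordPrefix("test")) // "te" -> te.html
	if err != nil {
		panic(err)
	}
	if _, err := io.WriteString(dw, `<w><a name="test"/><div><b>test</b> an example entry</div></w>`); err != nil {
		panic(err)
	}

	// Close writes the words index and the zip central directory (but not f itself).
	if err := w.Close(); err != nil {
		panic(err)
	}
}
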
/kobodict/writer_test.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 |
3 | // TODO(v1)
4 |
--------------------------------------------------------------------------------
/marisa/libmarisa_generate.go:
--------------------------------------------------------------------------------
1 | //+build libmarisa_generate
2 |
3 | package main
4 |
5 | import (
6 | "archive/tar"
7 | "bytes"
8 | "compress/gzip"
9 | "fmt"
10 | "io"
11 | "io/ioutil"
12 | "net/http"
13 | "os"
14 | "path"
15 | "regexp"
16 | "strings"
17 | )
18 |
19 | func main() {
20 | url := "https://github.com/s-yata/marisa-trie/archive/970b20c141f11d9d7572a6bb8d0488f2e0520e22.tar.gz"
21 | version := "970b20c"
22 |
23 | if files, err := tarball(url); err != nil {
24 | fmt.Fprintf(os.Stderr, "Error: download tarball %#v: %v\n", url, err)
25 | os.Exit(1)
26 | return
27 | } else if err := func() error {
28 | if mr, err := libmarisa(files, version); err != nil {
29 | return err
30 | } else if mf, err := os.OpenFile("libmarisa.cc", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644); err != nil {
31 | return err
32 | } else if _, err := io.Copy(mf, mr); err != nil {
33 | mf.Close()
34 | return err
35 | } else {
36 | return mf.Close()
37 | }
38 | }(); err != nil {
39 | fmt.Fprintf(os.Stderr, "Error: generate libmarisa.cc: %v\n", err)
40 | os.Exit(1)
41 | return
42 | } else if err := func() error {
43 | if mr, err := hmarisa(files, version); err != nil {
44 | return err
45 | } else if mf, err := os.OpenFile("libmarisa.h", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644); err != nil {
46 | return err
47 | } else if _, err := io.Copy(mf, mr); err != nil {
48 | mf.Close()
49 | return err
50 | } else {
51 | return mf.Close()
52 | }
53 | }(); err != nil {
54 | fmt.Fprintf(os.Stderr, "Error: generate libmarisa.h: %v\n", err)
55 | os.Exit(1)
56 | return
57 | }
58 | }
59 | func hmarisa(files map[string][]byte, version string) (io.Reader, error) {
60 | marisaH, err := resolve(files, []string{
61 | "include/marisa.h",
62 | }, "include", "lib")
63 | if err != nil {
64 | return nil, err
65 | }
66 |
67 | fmt.Printf("Generating libmarisa.h\n")
68 | return io.MultiReader(
69 | // A custom header.
70 | strings.NewReader("// AUTOMATICALLY GENERATED, DO NOT EDIT!\n"),
71 | strings.NewReader("// merged from marisa-trie "+version+".\n"),
72 | // Include the license info.
73 | bytes.NewReader([]byte{'\n', '/', '/', ' '}),
74 | bytes.NewReader(bytes.ReplaceAll(files["COPYING.md"], []byte{'\n'}, []byte{'\n', '/', '/', ' '})),
75 | bytes.NewReader([]byte{'\n', '\n'}),
76 | // Include the header.
77 | bytes.NewReader(marisaH),
78 | ), nil
79 | }
80 |
81 | func libmarisa(files map[string][]byte, version string) (io.Reader, error) {
82 | marisaGrimoireIOLib, err := resolve(files, []string{
83 | "lib/marisa/grimoire/io/mapper.cc",
84 | "lib/marisa/grimoire/io/reader.cc",
85 | "lib/marisa/grimoire/io/writer.cc",
86 | }, "include", "lib")
87 | if err != nil {
88 | return nil, err
89 | }
90 |
91 | marisaGrimoireTrieLib, err := resolve(files, []string{
92 | "lib/marisa/grimoire/trie/tail.cc",
93 | "lib/marisa/grimoire/trie/louds-trie.cc",
94 | }, "include", "lib")
95 | if err != nil {
96 | return nil, err
97 | }
98 |
99 | marisaGrimoireVectorLib, err := resolve(files, []string{
100 | "lib/marisa/grimoire/vector/bit-vector.cc",
101 | }, "include", "lib")
102 | if err != nil {
103 | return nil, err
104 | }
105 |
106 | marisaLib, err := resolve(files, []string{
107 | "lib/marisa/agent.cc",
108 | "lib/marisa/keyset.cc",
109 | "lib/marisa/trie.cc",
110 | }, "include", "lib")
111 | if err != nil {
112 | return nil, err
113 | }
114 |
115 | fmt.Printf("Generating libmarisa.cc\n")
116 | return io.MultiReader(
117 | // A custom header.
118 | strings.NewReader("// AUTOMATICALLY GENERATED, DO NOT EDIT!\n"),
119 | strings.NewReader("// merged from marisa-trie "+version+".\n"),
120 | // Include the license info.
121 | bytes.NewReader([]byte{'\n', '/', '/', ' '}),
122 | bytes.NewReader(bytes.ReplaceAll(files["COPYING.md"], []byte{'\n'}, []byte{'\n', '/', '/', ' '})),
123 | bytes.NewReader([]byte{'\n', '\n'}),
124 | // Include the warnings from the Makefile.am CXXFLAGS.
125 | // - Note that Clang also recognizes the GCC pragmas.
126 | strings.NewReader("#pragma GCC diagnostic warning \"-Wall\"\n"),
127 | strings.NewReader("#pragma GCC diagnostic warning \"-Weffc++\"\n"),
128 | strings.NewReader("#pragma GCC diagnostic warning \"-Wextra\"\n"),
129 | strings.NewReader("#pragma GCC diagnostic warning \"-Wconversion\"\n"),
130 | // Silence a warning.
131 | strings.NewReader("#pragma GCC diagnostic ignored \"-Wimplicit-fallthrough=\"\n"),
132 | // Include the libs themselves.
133 | bytes.NewReader(marisaGrimoireIOLib),
134 | bytes.NewReader(marisaGrimoireTrieLib),
135 | bytes.NewReader(marisaGrimoireVectorLib),
136 | bytes.NewReader(marisaLib),
137 | // Show info about the generated file.
138 | strings.NewReader("#line 1 \"libmarisa_generate.go\"\n"),
139 | strings.NewReader("#pragma GCC warning \"Using generated built-in marisa-trie "+version+".\"\n"),
140 | ), nil
141 | }
142 |
143 | func tarball(url string) (map[string][]byte, error) {
144 | fmt.Printf("Downloading tarball from %s\n", url)
145 |
146 | resp, err := http.Get(url)
147 | if err != nil {
148 | return nil, err
149 | }
150 | defer resp.Body.Close()
151 |
152 | zr, err := gzip.NewReader(resp.Body)
153 | if err != nil {
154 | return nil, err
155 | }
156 |
157 | var pfx string
158 | files := map[string][]byte{}
159 |
160 | tr := tar.NewReader(zr)
161 | for {
162 | fh, err := tr.Next()
163 | if err == io.EOF {
164 | break
165 | } else if err != nil {
166 | return nil, err
167 | }
168 |
169 | if fh.Name == "pax_global_header" || fh.FileInfo().IsDir() {
170 | continue
171 | }
172 |
173 | if pfx == "" {
174 | if strings.HasPrefix(fh.Name, "./") {
175 | pfx = "./" + strings.Split(fh.Name, "/")[1] + "/"
176 | } else {
177 | pfx = strings.Split(fh.Name, "/")[0] + "/"
178 | }
179 | }
180 |
181 | if !strings.HasPrefix(fh.Name, pfx) {
182 | return nil, fmt.Errorf("extract file %#v: doesn't have common prefix %#v", fh.Name, pfx)
183 | }
184 |
185 | buf, err := ioutil.ReadAll(tr)
186 | if err != nil {
187 | return nil, fmt.Errorf("extract file %#v: %w", fh.Name, err)
188 | }
189 |
190 | fn := strings.TrimPrefix(fh.Name, pfx)
191 | files[fn] = buf
192 |
193 | fmt.Printf(" [D] %s\n", fn) // downloaded
194 | }
195 |
196 | return files, nil
197 | }
198 |
199 | func resolve(files map[string][]byte, filenames []string, includePath ...string) (resolvedFile []byte, err error) {
200 | fmt.Printf("Resolving C* source files %s (against:%s) (I = included, S = preserved because not found, R = skipped because already included)\n", filenames, includePath)
201 |
202 | var resolveFn func(indent string, files map[string][]byte, filename string, buf []byte, done []string, includePath []string) (resolvedFile []byte, err error)
203 | resolveFn = func(indent string, files map[string][]byte, filename string, buf []byte, done []string, includePath []string) (resolvedFile []byte, err error) {
204 | defer func() {
205 | if rerr := recover(); rerr != nil {
206 | resolvedFile, err = nil, rerr.(error)
207 | }
208 | }()
209 |
210 | resolvedFile = regexp.MustCompile(`(?m)^\s*#\s*include\s+["'<][^"'>]+["'>]$`).ReplaceAllFunc(buf, func(importBuf []byte) []byte {
211 | fn := string(regexp.MustCompile(`["'<]([^"'>]+)["'>]`).FindSubmatch(importBuf)[1])
212 |
213 | for _, ip := range includePath {
214 | ifn := path.Join(ip, fn)
215 | for _, dfn := range done {
216 | if m, _ := path.Match(dfn, ifn); m {
217 | fmt.Printf("%s[R] %s\n", indent, fn) // already included
218 | return nil
219 | }
220 | }
221 |
222 | ibuf, ok := files[ifn]
223 | if ok {
224 | fmt.Printf("%s[I] %s => %s\n", indent, fn, ifn) // include
225 | ibuf, err := resolveFn(indent+" ", files, ifn, ibuf, append(done, ifn), append(includePath, path.Dir(ifn)))
226 | if err != nil {
227 | panic(fmt.Errorf("resolve %#v: %w", ifn, err))
228 | }
229 | return append(append([]byte{'\n', '\n'}, ibuf...), '\n', '\n')
230 | }
231 | }
232 |
233 | fmt.Printf("%s[S] %s\n", indent, fn) // preserve
234 | return importBuf
235 | })
236 |
237 | return
238 | }
239 |
240 | for _, fn := range filenames {
241 | if buf, ok := files[fn]; !ok {
242 | return nil, fmt.Errorf("file %#v: not found", fn)
243 | } else if buf, err := resolveFn(" ", files, fn, buf, []string{fn}, append(includePath, path.Dir(fn))); err != nil {
244 | return nil, fmt.Errorf("file %v: %w", fn, err)
245 | } else {
246 | resolvedFile = append(resolvedFile, buf...)
247 | resolvedFile = append(resolvedFile, '\n', '\n')
248 | }
249 | }
250 |
251 | return resolvedFile, nil
252 | }
253 |
--------------------------------------------------------------------------------
/marisa/marisa.cc:
--------------------------------------------------------------------------------
1 | #include <cstdlib>
2 | #include <cstring>
3 | #include <stdexcept>
4 | #include <string>
5 |
6 | #include "libmarisa.h"
7 | #include "shim.h"
8 |
9 | #define catch_go_ex(t, ctx) \
10 | catch (const t &ex) { \
11 | const char* b = ctx; \
12 | char* err = reinterpret_cast( \
13 | calloc(strlen(b)+strlen(ex.what())+1, sizeof(char))); \
14 | strcpy(err, b); \
15 | strcat(err, ex.what()); \
16 | return err; \
17 | }
18 |
19 | #define catch_go \
20 | catch_go_ex(marisa::Exception, "marisa: ") \
21 | catch_go_ex(go::error, "go shim: ") \
22 | catch_go_ex(std::runtime_error, "c++ runtime: ") \
23 | catch_go_ex(std::exception, "c++ error: ") \
24 | catch (...) { return strdup("marisa: unknown c++ exception"); } \
25 | return NULL;
26 |
27 | #define go_func extern "C" const char*
28 |
29 | go_func marisa_read_all(int iid, char ***out_wd, size_t *out_wd_sz) {
30 | try {
31 | if (!out_wd || !out_wd_sz)
32 | throw std::runtime_error("parameter is null");
33 | go::rstream r(iid);
34 | marisa::Trie t;
35 | marisa::read(r, &t);
36 | marisa::Agent a;
37 | a.set_query("");
38 | *out_wd_sz = 0;
39 | *out_wd = reinterpret_cast(calloc(t.num_keys(), sizeof(char**)));
40 | while (t.predictive_search(a)) {
41 | if (*out_wd_sz == t.num_keys())
42 | throw std::runtime_error("expected " + std::to_string(t.num_keys()) + " keys, got more");
43 | memcpy((*out_wd)[(*out_wd_sz)++] = reinterpret_cast(calloc(a.key().length()+1, sizeof(char))), a.key().ptr(), a.key().length());
44 | }
45 | if (*out_wd_sz != t.num_keys())
46 | throw std::runtime_error("expected " + std::to_string(t.num_keys()) + " keys, got " + std::to_string(*out_wd_sz));
47 | } catch_go
48 | }
49 |
50 | go_func marisa_write_all(int iid, const char** wd, size_t wd_sz) {
51 | try {
52 | if (wd_sz && !wd)
53 | throw std::runtime_error("parameter is null");
54 | marisa::Keyset k;
55 | for (size_t i = 0; i < wd_sz; i++)
56 | k.push_back(wd[i]);
57 | marisa::Trie t;
58 | t.build(k);
59 | go::wstream w(iid);
60 | marisa::write(w, t);
61 | } catch_go
62 | }
63 |
--------------------------------------------------------------------------------
/marisa/marisa.go:
--------------------------------------------------------------------------------
1 | // Package marisa provides a simplified self-contained CGO wrapper for
2 | // marisa-trie (https://github.com/s-yata/marisa-trie).
3 | package marisa
4 |
5 | //go:generate go run -tags libmarisa_generate libmarisa_generate.go
6 |
7 | //#cgo CPPFLAGS: -Wall
8 | //#cgo LDFLAGS:
9 | //#include <stdlib.h>
10 | //#include <stddef.h>
11 | //const char* marisa_read_all(int iid, char ***out_wd, size_t *out_wd_sz);
12 | //const char* marisa_write_all(int iid, const char** wd, size_t wd_sz);
13 | import "C"
14 |
15 | import (
16 | "errors"
17 | "io"
18 | "unsafe"
19 | )
20 |
21 | func ReadAll(r io.Reader) ([]string, error) {
22 | iid := iopPut(r)
23 | var out_wd **C.char
24 | var out_wd_sz C.size_t
25 | err := C.marisa_read_all(
26 | (C.int)(iid),
27 | (***C.char)(unsafe.Pointer(&out_wd)),
28 | (*C.size_t)(unsafe.Pointer(&out_wd_sz)),
29 | )
30 | iopDel(iid)
31 | return gostrs(out_wd, out_wd_sz), goerr(err)
32 | }
33 |
34 | func WriteAll(w io.Writer, wd []string) error {
35 | iid := iopPut(w)
36 | wd_ptr, wd_sz, wd_free := cstrs(wd)
37 | err := C.marisa_write_all(
38 | (C.int)(iid),
39 | (**C.char)(wd_ptr),
40 | (C.size_t)(wd_sz),
41 | )
42 | wd_free()
43 | iopDel(iid)
44 | return goerr(err)
45 | }
46 |
47 | func goerr(p *C.char) (err error) {
48 | if p != nil {
49 | err = errors.New(C.GoString(p))
50 | C.free(unsafe.Pointer(p))
51 | }
52 | return
53 | }
54 |
55 | func gostrs(p **C.char, n C.size_t) (s []string) {
56 | if p != nil {
57 | s = make([]string, int(n))
58 | for i, v := range (*[1 << 28]*C.char)(unsafe.Pointer(p))[:int(n):int(n)] {
59 | s[i] = C.GoString(v)
60 | C.free(unsafe.Pointer(v))
61 | }
62 | C.free(unsafe.Pointer(p))
63 | }
64 | return
65 | }
66 |
67 | func cstrs(s []string) (p **C.char, n C.size_t, free func()) {
68 | n = (C.size_t)(len(s))
69 | if len(s) == 0 {
70 | free = func() {}
71 | return
72 | }
73 | c := make([]*C.char, len(s))
74 | for i, v := range s {
75 | c[i] = C.CString(v)
76 | }
77 | p = (**C.char)(unsafe.Pointer(&c[0]))
78 | free = func() {
79 | for _, v := range c {
80 | C.free(unsafe.Pointer(v))
81 | }
82 | }
83 | return
84 | }
85 |
--------------------------------------------------------------------------------
/marisa/marisa_test.go:
--------------------------------------------------------------------------------
1 | package marisa
2 |
3 | import (
4 | "bytes"
5 | "crypto/sha1"
6 | "encoding/hex"
7 | "errors"
8 | "io"
9 | "reflect"
10 | "runtime"
11 | "strings"
12 | "testing"
13 | )
14 |
15 | func TestTrieIO(t *testing.T) {
16 | emptyBuf := bytes.NewBuffer(nil)
17 | emptyS := "1aa6c451104c2c1b24ecb66ecb84bde2403c49b1" // marisa-build
--------------------------------------------------------------------------------
/marisa/shim.go:
--------------------------------------------------------------------------------
1 | package marisa
2 | 
3 | //#include <stdlib.h>
4 | //#include <string.h>
5 | //#include <stddef.h>
6 | import "C"
7 |
8 | import (
9 | "fmt"
10 | "io"
11 | "sync"
12 | "unsafe"
13 | )
14 |
15 | // shim.go and shim.h (plus _cgo_export.h implicitly), implement a shim to
16 | // access Go I/O interfaces efficiently, concurrently, cleanly, and safely from
17 | // C/C++ code. Note that if any C strings are returned by the Go side, they must
18 | // be freed on the C side.
19 |
20 | // https://golang.org/issue/13656#issuecomment-253600758
21 | // https://golang.org/cmd/cgo/#hdr-C_references_to_Go
22 | // https://stackoverflow.com/a/49879469
23 |
24 | var (
25 | iopMu sync.RWMutex // for controlling access to the slice header (i.e. https://stackoverflow.com/a/49879469)
26 | iop = []interface{}{nil} // the 0th element is reserved to prevent mistakes
27 | )
28 |
29 | // iopPut adds the io.Reader and/or io.Writer, and returns its new iid. The iid
30 | // will be valid until iopDel is called, but will never be reused.
31 | func iopPut(rw interface{}) int {
32 | switch rw.(type) {
33 | case io.Reader, io.Writer:
34 | iopMu.Lock()
35 | iop = append(iop, rw)
36 | iid := len(iop) - 1
37 | iopMu.Unlock()
38 | return iid
39 | default:
40 | panic("not a reader, writer, or both")
41 | }
42 | }
43 |
44 | // iopGet gets the interface referenced by iid. It will panic if iid has never
45 | // been issued by iopPut, and will return nil if it has been deleted by iopDel.
46 | func iopGet(iid int) interface{} {
47 | iopMu.RLock()
48 | if iid <= 0 || iid >= len(iop) {
49 | panic("invalid iid")
50 | }
51 | r := iop[iid]
52 | iopMu.RUnlock()
53 | return r
54 | }
55 |
56 | // iopDel sets the interface referenced by iid to nil to prevent future usage.
57 | // It will panic if iid has never been issued by iopPut.
58 | func iopDel(iid int) {
59 | iopMu.RLock()
60 | if iid <= 0 || iid >= len(iop) {
61 | panic("invalid iid")
62 | }
63 | iop[iid] = nil
64 | iopMu.RUnlock()
65 | }
66 |
67 | //export go_iop_check
68 | func go_iop_check(iid C.int, t C.int, out_err **C.char) bool /*C.bool*/ {
69 | var n []string
70 | i := iopGet(int(iid))
71 | if t&(1<<0) != 0 { // go_iop_type::reader
72 | if _, ok := iopGet(int(iid)).(io.Reader); !ok {
73 | n = append(n, "io.Reader")
74 | }
75 | }
76 | if t&(1<<1) != 0 { // go_iop_type::writer
77 | if _, ok := iopGet(int(iid)).(io.Writer); !ok {
78 | n = append(n, "io.Writer")
79 | }
80 | }
81 | if out_err != nil {
82 | if len(n) != 0 {
83 | *out_err = C.CString(fmt.Sprintf("iid %d: underlying type %T does not implement types %s", int(iid), i, n))
84 | } else {
85 | *out_err = nil
86 | }
87 | }
88 | return len(n) == 0
89 | }
90 |
91 | //export go_iop_read
92 | func go_iop_read(iid C.int, buf *C.char, buf_n C.size_t, out_err **C.char) C.ptrdiff_t {
93 | *out_err = nil
94 | switch i := iopGet(int(iid)).(type) {
95 | case io.Reader:
96 | n, err := i.Read((*[1 << 28]byte)(unsafe.Pointer(buf))[:int(buf_n):int(buf_n)])
97 | if err == io.EOF {
98 | if n == 0 {
99 | return C.ptrdiff_t(-1)
100 | }
101 | } else if err != nil {
102 | *out_err = C.CString(fmt.Sprintf("go_iop_read: read up to %d bytes from iid %d: %v", buf_n, int(iid), err))
103 | }
104 | return C.ptrdiff_t(n)
105 | case nil:
106 | *out_err = C.CString(fmt.Sprintf("go_iop_read: iid %d has been deleted", int(iid)))
107 | return C.ptrdiff_t(0)
108 | default:
109 | *out_err = C.CString(fmt.Sprintf("go_iop_read: iid %d is a %T, not an io.Reader", int(iid), i))
110 | return C.ptrdiff_t(0)
111 | }
112 | }
113 |
114 | //export go_iop_write
115 | func go_iop_write(iid C.int, buf *C.char, buf_n C.size_t, out_err **C.char) C.ptrdiff_t {
116 | *out_err = nil
117 | switch i := iopGet(int(iid)).(type) {
118 | case io.Writer:
119 | n, err := i.Write((*[1 << 28]byte)(unsafe.Pointer(buf))[:int(buf_n):int(buf_n)])
120 | if err == io.EOF {
121 | if n == 0 {
122 | return C.ptrdiff_t(-1)
123 | }
124 | } else if err != nil {
125 | *out_err = C.CString(fmt.Sprintf("go_iop_write: write up to %d bytes to iid %d: %v", buf_n, int(iid), err))
126 | }
127 | return C.ptrdiff_t(n)
128 | case nil:
129 | *out_err = C.CString(fmt.Sprintf("go_iop_write: iid %d has been deleted", int(iid)))
130 | return C.ptrdiff_t(0)
131 | default:
132 | *out_err = C.CString(fmt.Sprintf("go_iop_write: iid %d is a %T, not an io.Writer", int(iid), i))
133 | return C.ptrdiff_t(0)
134 | }
135 | }
136 |
--------------------------------------------------------------------------------
/marisa/shim.h:
--------------------------------------------------------------------------------
1 | #ifndef GO_SHIM_H
2 | #define GO_SHIM_H
3 |
4 | #ifdef __cplusplus
5 | #include <cstddef>
6 | extern "C" {
7 | #else
8 | #include <stddef.h>
9 | #include <stdbool.h>
10 | #endif
11 |
12 | // go_iop_type represents interfaces an iid may implement.
13 | enum go_iop_type {
14 | reader = 1 << 0, // io.Reader
15 | writer = 1 << 1, // io.Writer
16 | };
17 |
18 | // go_iop_check checks whether the specified iid implements the ORed
19 | // go_iop_type flags in t. Calling it is optional, since the go_iop_*
20 | // functions below return an error themselves if the iid doesn't implement
21 | // the necessary interfaces. If the return value is false and out_err is
22 | // not NULL, out_err is set to an error message which must be freed by the
23 | // caller.
24 | bool go_iop_check(int iid, int t, char **out_err);
25 |
26 | // Note: we use ptrdiff_t over ssize_t for portability (and not size_t because
27 | // it will return -1 for EOF). Also, note that C++'s std::streamsize uses this
28 | // internally too, which is a nice advantage.
29 |
30 | // go_iop_read reads from the iid's underlying io.Reader. It has the same
31 | // semantics as the Go one, but io.EOF is returned as -1. out_err must be a
32 | // valid pointer to a char pointer. If an error occurred, it is set and must be
33 | // freed by the caller.
34 | ptrdiff_t go_iop_read(int iid, char *p, size_t n, char **out_err);
35 | // go_iop_write writes to the iid's underlying io.Writer. It has the same
36 | // semantics as the Go one, but io.EOF is returned as -1. out_err must be a
37 | // valid pointer to a char pointer. If an error occurred, it is set and must be
38 | // freed by the caller.
39 | ptrdiff_t go_iop_write(int iid, const char *p, size_t n, char **out_err);
40 |
41 | #ifdef __cplusplus
42 | }
43 |
44 | #include <cstdarg>
45 | #include <cstdio>
46 | #include <cstdlib>
47 | #include <iostream>
48 |
49 | // https://golang.org/cmd/cgo/#hdr-C_references_to_Go
50 | // https://en.cppreference.com/w/cpp/io/basic_streambuf <- this describes it better than many of the other sites I found
51 |
52 | namespace go {
53 |
54 | bool dbg(const char* format, ...) {
55 | static bool _dbg = getenv("GOSHIMDEBUG") ? getenv("GOSHIMDEBUG")[0] == '1' && getenv("GOSHIMDEBUG")[1] == '\0' : false;
56 | if (!_dbg)
57 | return false;
58 | fprintf(stderr, "GOSHIMDEBUG: ");
59 | va_list arg;
60 | va_start(arg, format);
61 | vfprintf(stderr, format, arg);
62 | va_end(arg);
63 | fflush(stderr);
64 | return true;
65 | }
66 |
67 | class error : public std::runtime_error {
68 | public:
69 | error(const char* what) : std::runtime_error(what) {
70 | go::dbg("new go::error(%s)\n", what);
71 | };
72 |
73 | // check checks an output err pointer and frees+throws it if set.
74 | static void check(char* err) {
75 | if (!err)
76 | return;
77 | go::error ex = go::error(err);
78 | free(err);
79 | throw ex;
80 | }
81 | };
82 |
83 | class iopbuf : public std::basic_streambuf {
84 | int iid_;
85 | char rbuf_; // single-byte read buffer (i.e. direct access to the io.Reader)
86 | public:
87 | static_assert((std::is_same::value && std::is_same::value), "Go shim only supports char"); // just to be safe
88 | #ifndef __clang__
89 | static_assert(iopbuf::traits_type::eof() != iopbuf::traits_type::to_int_type((char) 0xFF), "EOF not distinct from 0xFF"); // this is already specified in the spec, but just to make sure
90 | #endif
91 |
92 | iopbuf(int iid) : iid_(iid) {
93 | this->setg(&this->rbuf_, &this->rbuf_ + 1, &this->rbuf_ + 1); // set the buffer, but at the end to force the next read to underflow
94 | }
95 |
96 | iopbuf(int iid, int t) : iopbuf(iid) {
97 | char* err = NULL;
98 | go_iop_check(iid, t, &err);
99 | go::error::check(err);
100 | }
101 |
102 | iopbuf::int_type underflow() override {
103 | // This is all that's strictly needed for reading. Note that we can't
104 | // just return the char, and we must set the buffer to point to it to
105 | // conform to the expected postconditions and prevent unusual bugs from
106 | // popping up.
107 |
108 | char* err = NULL;
109 | ptrdiff_t n = go_iop_read(this->iid_, &this->rbuf_, 1, &err);
110 | go::dbg("underflow: go_iop_read(%d, 1) = %td %02x err=%s\n", this->iid_, n, this->rbuf_, err); fflush(stdout);
111 | go::error::check(err);
112 |
113 | this->setg(&this->rbuf_, &this->rbuf_, &this->rbuf_ + (n>0 ? n : 0)); // Update the current byte.
114 | return this->gptr() == this->egptr() // If the new current pos == past end of buffer, no byte was read (n<=0).
115 | ? iopbuf::traits_type::eof() // If no byte was read (and no error was thrown earlier), it's an EOF.
116 | : iopbuf::traits_type::to_int_type(this->rbuf_); // Otherwise, return the byte we just read (note: without to_int_type, 0xFF would be sign extended to -1/eof).
117 | }
118 |
119 | std::streamsize xsgetn(iopbuf::char_type* buf, std::streamsize buf_n) override {
120 | // We can provide a more efficient bulk read implementation than the
121 | // default one which gets each byte one-by-one in a loop.
122 | // Note: Remember to test ::underflow by forcing it to use the default
123 | // implementation: return std::streambuf::xsgetn(buf, buf_n);
124 |
125 | std::streamsize t = 0;
126 |
127 | ptrdiff_t n = 0;
128 | char* err = NULL;
129 | while (t != buf_n && n != -1) {
130 | n = go_iop_read(this->iid_, buf+t, buf_n-t, &err);
131 | go::dbg("xsgetn: go_iop_read(%d, %zu) = %td (%td/%td) err=%s\n", this->iid_, buf_n-t, n, t+(n>0 ? n : 0), buf_n, err); fflush(stdout);
132 | t += n>0 ? n : 0;
133 | if (t > buf_n)
134 | throw go::error("read returned too many bytes!");
135 | go::error::check(err);
136 | }
137 |
138 | this->rbuf_ = t>0 ? buf[t-1] : 0; // Set the current byte to the last one read, if any.
139 | this->setg(&this->rbuf_, &this->rbuf_, &this->rbuf_ + (t>0 ? 1 : 0)); // Update the current byte.
140 | return this->gptr() == this->egptr() // If the new current pos == past end of buffer, no byte was read (n<=0).
141 | ? iopbuf::traits_type::eof() // If no byte was read (and no error was thrown earlier), it's an EOF
142 | : t; // Otherwise, return the number of bytes read.
143 | }
144 |
145 | iopbuf::int_type overflow(iopbuf::int_type c = iopbuf::traits_type::eof()) override {
146 | // Unlike for reading, we don't have to use a buffer (you can read a
147 | // byte advancing, but you can't do that kind of thing when writing),
148 | // so we'll just write it directly. This makes the implementation much
149 | // simpler, as we're basically just passing the calls to the Go funcs
150 | // directly.
151 |
152 | // Usually, we would flush the buffer if given an EOF instead of a char,
153 | // but we're not using one, so it's a no-op.
154 | if (iopbuf::traits_type::eq_int_type(c, iopbuf::traits_type::eof()))
155 | return 0;
156 |
157 | // Since the logic is basically a simplified version of xsputn, just
158 | // with a single char, it's easier just to call it and implement the
159 | // bulk of the logic there.
160 | if (this->xsputn(reinterpret_cast(&c), 1) != 1)
161 | throw go::error("short write"); // we still need to check for a short write
162 | return c;
163 | }
164 |
165 | std::streamsize xsputn(const iopbuf::char_type* buf, std::streamsize buf_n) override {
166 | char* err = NULL;
167 | ptrdiff_t n = go_iop_write(this->iid_, buf, buf_n, &err);
168 | go::error::check(err);
169 | if (n == -1)
170 | throw go::error("EOF while writing to Go writer");
171 | return n;
172 | }
173 | };
174 |
175 | class rwstream : private iopbuf, public std::iostream {
176 | public: rwstream(int iid) : iopbuf(iid, go_iop_type::reader|go_iop_type::writer), std::iostream(this) {}
177 | };
178 |
179 | class wstream : private iopbuf, public std::ostream {
180 | public: wstream(int iid) : iopbuf(iid, go_iop_type::writer), std::ostream(this) {}
181 | };
182 |
183 | class rstream : private iopbuf, public std::istream {
184 | public: rstream(int iid) : iopbuf(iid, go_iop_type::reader), std::istream(this) {}
185 | };
186 |
187 | }
188 |
189 | #endif
190 | #endif
--------------------------------------------------------------------------------