├── .appveyor.yml
├── .drone.yml
├── .gitattributes
├── .travis.yml
├── LICENSE.md
├── README.md
├── cmd
    ├── dictgen
    │   └── main.go
    └── dictutil
    │   ├── install.go
    │   ├── main.go
    │   ├── pack.go
    │   ├── prefix.go
    │   ├── uninstall.go
    │   └── unpack.go
├── dictgen
    ├── dictfile.go
    ├── dictfile_test.go
    ├── dictgen.go
    ├── image.go
    └── image_test.go
├── docs
    ├── _config.yml
    ├── _includes
    │   └── head_custom.html
    ├── dictgen
    │   └── index.md
    ├── dicthtml
    │   ├── format.md
    │   ├── index.md
    │   ├── install.md
    │   ├── matching.md
    │   ├── prefixes.md
    │   ├── v1v2-1.png
    │   ├── v1v2-2.png
    │   └── v1v2.md
    ├── dictutil
    │   ├── index.md
    │   ├── install.md
    │   ├── pack.md
    │   ├── prefix.md
    │   ├── uninstall.md
    │   └── unpack.md
    ├── examples
    │   ├── bgl-convert.md
    │   ├── dictzip-decompile.md
    │   ├── gotdict-convert.md
    │   ├── index.md
    │   └── webster1913-convert.md
    └── index.md
├── examples
    ├── bgl-convert
    │   └── index.html
    ├── dictzip-decompile
    │   ├── main.go
    │   └── parse.go
    ├── gotdict-convert
    │   ├── gotdict
    │   │   └── parser.go
    │   └── main.go
    └── webster1913-convert
    │   ├── main.go
    │   └── webster1913
    │       └── parser.go
├── go.mod
├── go.sum
├── kobodict
    ├── crypt.go
    ├── crypt_test.go
    ├── fs.go
    ├── fs_test.go
    ├── marisa.go
    ├── marisa
    │   ├── marisa.go
    │   ├── marisa_cgo.go
    │   └── marisa_test.go
    ├── reader.go
    ├── reader_test.go
    ├── util.go
    ├── util_test.go
    ├── writer.go
    └── writer_test.go
└── marisa
    ├── libmarisa.cc
    ├── libmarisa.h
    ├── libmarisa_generate.go
    ├── marisa.cc
    ├── marisa.go
    ├── marisa_test.go
    ├── shim.go
    └── shim.h


/.appveyor.yml:
--------------------------------------------------------------------------------
 1 | image: ubuntu
 2 | version: "{build}"
 3 | 
 4 | environment:
 5 |   GO111MODULE: on
 6 | 
 7 | install:
 8 | - go mod download
 9 | 
10 | build_script:
11 | - mkdir bin gotdict webster1913
12 | - CGO_ENABLED=1 go build -o ./bin/dictgen             ./cmd/dictgen
13 | - CGO_ENABLED=0 go build -o ./bin/gotdict-convert     ./examples/gotdict-convert
14 | - CGO_ENABLED=0 go build -o ./bin/webster1913-convert ./examples/webster1913-convert
15 | - curl -#Lo ./webster1913/webster1913.txt http://www.gutenberg.org/ebooks/29765.txt.utf-8
16 | - curl -#Lo - https://github.com/wjdp/gotdict/archive/6b4d6cdbb1f5d899d418783ab842f487aafa79ec.tar.gz | tar -xzf - --strip-components=1 -C ./gotdict
17 | - ./bin/gotdict-convert     -o ./gotdict/gotdict.df         -g ./gotdict --images
18 | - ./bin/gotdict-convert     -o ./gotdict/gotdict.noimg.df   -g ./gotdict
19 | - ./bin/webster1913-convert -o ./webster1913/webster1913.df ./webster1913/webster1913.txt
20 | - ./bin/dictgen -Ibase64 -o ./gotdict/dicthtml-gt.zip       ./gotdict/gotdict.df
21 | - ./bin/dictgen -Iremove -o ./gotdict/dicthtml-gt.noimg.zip ./gotdict/gotdict.noimg.df
22 | - ./bin/dictgen -Iremove -o ./webster1913/dicthtml-wb.zip   ./webster1913/webster1913.df
23 | 
24 | test_script:
25 | - go test -v -cover ./...
26 | - mkdir tmp
27 | - CGO_ENABLED=1 go build -o ./bin/dictutil ./cmd/dictutil
28 | - ./bin/dictutil u -o ./tmp/1 ./gotdict/dicthtml-gt.zip
29 | - ./bin/dictutil u -o ./tmp/2 ./gotdict/dicthtml-gt.noimg.zip
30 | - ./bin/dictutil u -o ./tmp/3 ./webster1913/dicthtml-wb.zip
31 | - ./bin/dictutil p -o ./tmp/1.zip ./tmp/1
32 | - ./bin/dictutil p -o ./tmp/2.zip ./tmp/2
33 | - ./bin/dictutil p -o ./tmp/3.zip ./tmp/3
34 | - sha1sum ./gotdict/dicthtml-gt.zip ./gotdict/dicthtml-gt.noimg.zip ./webster1913/dicthtml-wb.zip
35 | - sha1sum ./tmp/1.zip               ./tmp/2.zip                     ./tmp/3.zip
36 | - cmp ./tmp/1.zip ./gotdict/dicthtml-gt.zip
37 | - cmp ./tmp/2.zip ./gotdict/dicthtml-gt.noimg.zip
38 | - cmp ./tmp/3.zip ./webster1913/dicthtml-wb.zip
39 | 
40 | artifacts:
41 | - path: gotdict/gotdict.df
42 | - path: gotdict/gotdict.noimg.df
43 | - path: gotdict/dicthtml-gt.zip
44 | - path: gotdict/dicthtml-gt.noimg.zip
45 | - path: webster1913/webster1913.df
46 | - path: webster1913/dicthtml-wb.zip
47 | 
48 | deploy: off
49 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | marisa/libmarisa.cc linguist-generated=true
2 | marisa/libmarisa.h  linguist-generated=true


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | os:
 2 | - osx
 3 | 
 4 | language: go
 5 | 
 6 | go:
 7 | - 1.14.x
 8 | 
 9 | env:
10 |   GO111MODULE: "on"
11 | 
12 | script:
13 | - go run -mod=readonly ./cmd/dictutil --help
14 | - go run -mod=readonly ./cmd/dictgen --help
15 | - go run -mod=readonly ./examples/dictzip-decompile --help
16 | - go run -mod=readonly ./examples/gotdict-convert --help
17 | - go run -mod=readonly ./examples/webster1913-convert --help
18 | - go test -mod=readonly -v ./...
19 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Patrick Gaskin
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | 
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <h1 align="center">dictutil</h1>
  2 | 
  3 | [![](https://img.shields.io/github/v/release/pgaskin/dictutil?include_prereleases)](https://github.com/pgaskin/dictutil/releases) [![](https://img.shields.io/drone/build/pgaskin/dictutil/master)](https://cloud.drone.io/pgaskin/dictutil) [![](https://img.shields.io/badge/godoc-reference-blue.svg)](https://pkg.go.dev/mod/github.com/pgaskin/dictutil?tab=versions) [![](https://goreportcard.com/badge/github.com/pgaskin/dictutil)](https://goreportcard.com/report/github.com/pgaskin/dictutil)
  4 | 
  5 | This repository contains a collection of tools and libraries to work with Kobo dictionaries, plus comprehensive documentation of Kobo's dictionary format.
  6 | 
  7 | Unlike previous attempts at working with Kobo dictionaries, dictutil has full support for all features supported by nickel (word prefixes, unicode, variants, images, etc), with a focus on simplicity, correctness (prefix generation and other features are directly tested against libnickel's code and regexps, v1/v2 dictionaries are differentiated), and completeness (most of the research was done by reverse-engineering libnickel).
  8 | 
  9 | Dictutil consists of multiple tools and libraries:
 10 | - [**dictutil**](https://pgaskin.net/dictutil/dictutil/) provides commands for installing, removing, unpacking, packing, and performing low-level modifications and tests on Kobo dictionaries. All operations are intended to be correct, lossless, and deterministic.
 11 | - [**dictgen**](https://pgaskin.net/dictutil/dictgen/) simplifies creating full-featured dictionaries for Kobo eReaders, with support for images, unicode prefixes, raw html, markdown, and more.
 12 | - [**dicthtml**](https://pgaskin.net/dictutil/dicthtml/) documents Kobo's dictionary format and how it works.
 13 | - [**examples/gotdict-convert**](https://pgaskin.net/dictutil/examples/gotdict-convert.html) is a working example of using dictutil to convert [GOTDict](https://github.com/wjdp/gotdict) into a Kobo dictionary.
 14 | - [**examples/webster1913-convert**](https://pgaskin.net/dictutil/examples/webster1913-convert.html) is a working example of using dictutil to convert [Project Gutenberg's Webster's Unabridged Dictionary](http://www.gutenberg.org/ebooks/29765.txt.utf-8) into a Kobo dictionary.
 15 | - [**examples/dictzip-decompile**](https://pgaskin.net/dictutil/examples/dictzip-decompile.html) is an **experimental** tool to convert a dictzip into a dictfile.
 16 | - [**examples/bgl-convert**](https://pgaskin.net/dictutil/examples/bgl-convert.html) is a simple tool to convert Babylon BGL dictionaries to a dictfile.
 17 | - *Library:* [**kobodict**](https://pkg.go.dev/github.com/pgaskin/dictutil/kobodict) provides support for reading, writing, encrypting, and decrypting Kobo dictionaries.
 18 | - *Library:* [**dictgen**](https://pkg.go.dev/github.com/pgaskin/dictutil/dictgen) provides the functionality of dictgen as a library.
 19 | - *Library:* [**marisa**](./marisa) provides a simplified self-contained CGO wrapper for [marisa-trie](https://github.com/s-yata/marisa-trie).
 20 | 
 21 | Dictutil implements [version 2](https://pgaskin.net/dictutil/dicthtml/v1v2.html) of the Kobo dictionary format, which supports firmware versions 4.7.10364+.
 22 | 
 23 | For more information, see the [documentation](https://pgaskin.net/dictutil/). If you just want a quick overview of the utilities provided, continue reading below.
 24 | 
 25 | ## Download
 26 | - **Documentation** can be found on the [website](https://pgaskin.net/dictutil/).
 27 | - **Tools** (dictutil, dictgen, gotdict-convert, webster1913-convert) can be downloaded from the [releases](https://github.com/pgaskin/dictutil/releases) page.
 28 | - **Pre-built dictionaries** from gotdict-convert and webster1913-convert can be downloaded from [AppVeyor](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts) or from the links below:
 29 |   - GOTDict *(with images, firmware 4.20.14601+)*: [dictzip (dicthtml-gt.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.df?branch=master&all=false&pr=false)
 30 |   - GOTDict *(without images)*: [dictzip (dicthtml-gt.noimg.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.noimg.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.noimg.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.noimg.df?branch=master&all=false&pr=false)
 31 |   - Webster's 1913 Dictionary: [dictzip (dicthtml-wb.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/dicthtml-wb.zip?branch=master&all=false&pr=false), [source dictfile (webster1913.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/webster1913.df?branch=master&all=false&pr=false)
 32 | - **API documentation** for the Go libraries can be found on [pkg.go.dev](https://pkg.go.dev/github.com/pgaskin/dictutil).
 33 | 
 34 | ## Usage
 35 | See the [documentation](https://pgaskin.net/dictutil/) for more detailed information and examples.
 36 | 
 37 | ### dictutil
 38 | 
 39 | ```
 40 | Usage: dictutil command [options] [arguments]
 41 | 
 42 | Dictutil provides low-level utilities to manipulate Kobo dictionaries (v2).
 43 | 
 44 | Commands:
 45 |   install (I)          Install a dictzip file
 46 |   pack (p)             Pack a dictzip file
 47 |   prefix (x)           Calculate the prefix for a word
 48 |   uninstall (U)        Uninstall a dictzip file
 49 |   unpack (u)           Unpack a dictzip file
 50 |   help                 Show help for all commands
 51 | 
 52 | Options:
 53 |   -h, --help   Show this help text
 54 | ```
 55 | 
 56 | ```
 57 | Usage: dictutil install [options] dictzip
 58 | 
 59 | Options:
 60 |   -k, --kobo string         KOBOeReader path (default: automatically detected)
 61 |   -l, --locale string       Locale name to use (format: ALPHANUMERIC{2}[-ALPHANUMERIC{2}]) (default: detected from filename if in format dicthtml-**.zip)
 62 |   -n, --name string         Custom additional label for dictionary (ignored when replacing built-in dictionaries) (doesn't have any effect on 4.20.14601+)
 63 |   -b, --builtin string      How to handle built-in locales [replace = replace and prevent from syncing] [ignore = replace and leave syncing as-is] (doesn't have any effect on 4.24.15672+) (default "replace")
 64 |   -B, --no-custom           Whether to force installation to .kobo/dict instead of .kobo/custom-dict (4.24.15672+ only)
 65 |       --use-extra-locales   Whether to use ExtraLocales on 4.24.15672+ if not a built-in dictionary (this is not required anymore since 4.24.15672) (4.24.15672+ only)
 66 |   -h, --help                Show this help text
 67 | 
 68 | Note:
 69 |   If you are not replacing a built-in dictionary and are using a firmware
 70 |   version before 4.24.15672, the 'Enable searches on extra dictionaries patch'
 71 |   must be installed or you will not be able to select your custom dictionary.
 72 | ```
 73 | 
 74 | ```
 75 | Usage: dictutil uninstall [options] locale
 76 | 
 77 | Options:
 78 |   -k, --kobo string      KOBOeReader path (default: automatically detected)
 79 |   -b, --builtin string   How to handle built-in locales [normal = uninstall the same way as the UI] [delete = completely delete the entry (doesn't have any effect on 4.20.14601+)] [restore = download the original dictionary from Kobo again] (doesn't have any effect on 4.24.15672+) (default "normal")
 80 |   -B, --no-custom        Uninstall built-in dictionaries instead of custom ones on 4.24.15672+
 81 |   -h, --help             Show this help text
 82 | ```
 83 | 
 84 | ```
 85 | Usage: dictutil pack [options] dictdir
 86 | 
 87 | Options:
 88 |   -o, --output string   The output dictzip filename (will be overwritten if it exists) (default "dicthtml.zip")
 89 |   -c, --crypt string    Encrypt the dictzip using the specified encryption method (format: method:keyhex)
 90 |   -h, --help            Show this help text
 91 | ```
 92 | 
 93 | ```
 94 | Usage: dictutil unpack [options] dictzip
 95 | 
 96 | Options:
 97 |   -o, --output string   The output directory (must not exist) (default: the basename of the input without the extension)
 98 |   -c, --crypt string    Decrypt the dictzip (if needed) using the specified encryption method (format: method:keyhex)
 99 |   -h, --help            Show this help text
100 | ```
101 | 
102 | ```
103 | Usage: dictutil prefix [options] word...
104 | 
105 | Options:
106 |   -f, --format string   The output format (go-slice, go-map, csv, tsv, json-array, json-object) (default "json-array")
107 |   -h, --help            Show this help text
108 | ```
109 | 
110 | ### dictgen
111 | 
112 | ```
113 | Usage: dictgen [options] dictfile...
114 | 
115 | Options:
116 |   -o, --output string         The output filename (will be overwritten if it exists) (- is stdout) (default "dicthtml.zip")
117 |   -c, --crypt string          Encrypt the dictzip using the specified encryption method (format: method:keyhex)
118 |   -I, --image-method string   How to handle images (if an image path is relative, it is loaded from the current dir) (base64 - optimize and encode as base64, embed - add to dictzip, remove) (default "base64")
119 |       --remove-footer         Add code to prevent the non-applicable dictionary source footer for certain locales from being added after the entry (e.g. if replacing the French dictionary)
120 |   -h, --help                  Show this help text
121 | 
122 | If multiple dictfiles (*.df) are provided, they will be merged (duplicate entries are fine; they will be shown in sequential order). To read from stdin, use - as the filename.
123 | 
124 | Note that currently, the only usable image method is removing them or using base64-encoding (for firmware 4.20.14601+; older versions segfault in the in-book dictionary if images are enabled), as embedded dict:/// image URLs cause the webviews to appear blank (this is a nickel bug). See https://github.com/pgaskin/dictutil/issues/1 for more details.
125 | 
126 | See https://pgaskin.net/dictutil/dictgen for more information about the dictfile format.
127 | ```
128 | 
129 | **See [here](https://pgaskin.net/dictutil/dictgen/) for information and examples of the dictfile format.**
130 | 
131 | ### gotdict-convert
132 | 
133 | ```
134 | Usage: gotdict-convert [options]
135 | 
136 | Options:
137 |   -g, --gotdict string   The path to the local copy of github.com/wjdp/gotdict. (default "./gotdict")
138 |   -o, --output string    The output filename (will be overwritten if it exists) (- is stdout) (default "./gotdict.df")
139 |   -I, --images           Include images in dictfile
140 |   -h, --help             Show this help text
141 | 
142 | To convert the resulting dictfile into a dictzip, use dictgen.
143 | ```
144 | 
145 | ### webster1913-convert
146 | 
147 | ```
148 | Usage: webster1913-convert [options] gutenberg_webster1913_path
149 | 
150 | Options:
151 |   -o, --output string   The output filename (will be overwritten if it exists) (- is stdout) (default "./webster1913.df")
152 |       --dump            Instead of converting, dump the parsed dictionary to stdout as JSON (for debugging)
153 |   -h, --help            Show this help text
154 | 
155 | Arguments:
156 |   gutenberg_webster1913_path is the path to Project Gutenberg's Webster's 1913 dictionary. Use - to read from stdin.
157 | 
158 | To convert the resulting dictfile into a dictzip, use dictgen.
159 | ```
160 | 
161 | The original dictionary can be downloaded [here](http://www.gutenberg.org/ebooks/29765.txt.utf-8) or [here](https://github.com/pgaskin/dictserver/raw/master/data/dictionary.txt).
162 | 
163 | ### dictzip-decompile
164 | 
165 | ```
166 | Usage: dictzip-decompile [options] dictzip
167 | 
168 | Options:
169 |   -o, --output string   The output filename (will be overwritten if it exists) (- is stdout) (default "./decompiled.df")
170 |   -r, --resources       Also extract referenced resources to the current directory (warning: any existing files will be overwritten, so it is recommended to run in an empty directory if enabled)
171 |   -h, --help            Show this help text
172 | 
173 | Arguments:
174 |   dictzip is the path to the dictzip to decompile.
175 | 
176 | To convert the resulting dictfile into a dictzip, use dictgen.
177 | 
178 | Note: The regenerated dictzip from the dictfile may not match exactly, but it will look the same, and certain bugs with prefixes and variants will be implicitly fixed by the conversion process (i.e. variant in wrong file, incorrect prefix, missing words in index file). All output is in raw HTML, not Markdown.
179 | 
180 | This is an experimental tool, and the output may not be perfect on complex dictionaries.
181 | ```
182 | 


--------------------------------------------------------------------------------
/cmd/dictgen/main.go:
--------------------------------------------------------------------------------
  1 | // Command dictgen is a CLI wrapper around package dictgen.
  2 | package main
  3 | 
  4 | import (
  5 | 	"encoding/hex"
  6 | 	"fmt"
  7 | 	"io"
  8 | 	"os"
  9 | 	"strings"
 10 | 
 11 | 	_ "image/gif"
 12 | 	_ "image/jpeg"
 13 | 	_ "image/png"
 14 | 
 15 | 	"github.com/pgaskin/dictutil/dictgen"
 16 | 	"github.com/pgaskin/dictutil/kobodict"
 17 | 	"github.com/spf13/pflag"
 18 | 
 19 | 	_ "github.com/pgaskin/dictutil/kobodict/marisa"
 20 | )
 21 | 
 22 | var version = "dev"
 23 | // main parses the command-line flags, merges every dictfile argument into one DictFile, and writes it as a dictzip to --output.
 24 | func main() {
 25 | 	pflag.CommandLine.SortFlags = false
 26 | 	output := pflag.StringP("output", "o", "dicthtml.zip", "The output filename (will be overwritten if it exists) (- is stdout)")
 27 | 	crypt := pflag.StringP("crypt", "c", "", "Encrypt the dictzip using the specified encryption method (format: method:keyhex)")
 28 | 	imageMethod := pflag.StringP("image-method", "I", "base64", "How to handle images (if an image path is relative, it is loaded from the current dir) (base64 - optimize and encode as base64, embed - add to dictzip, remove)")
 29 | 	removeFooter := pflag.Bool("remove-footer", false, "Add code to prevent the non-applicable dictionary source footer for certain locales from being added after the entry (e.g. if replacing the French dictionary)")
 30 | 	help := pflag.BoolP("help", "h", false, "Show this help text")
 31 | 	pflag.Parse()
 32 | 	// Without any dictfile arguments there is nothing to do, so the usage text is shown instead.
 33 | 	if *help || pflag.NArg() == 0 {
 34 | 		fmt.Fprintf(os.Stderr, "Usage: %s [options] dictfile...\n\nVersion: dictgen %s\n\nOptions:\n%s\nIf multiple dictfiles (*.df) are provided, they will be merged (duplicate entries are fine; they will be shown in sequential order). To read from stdin, use - as the filename.\n\nNote that currently, the only usable image method is removing them or using base64-encoding (for firmware 4.20.14601+; older versions segfault in the in-book dictionary if images are enabled), as embedded dict:/// image URLs cause the webviews to appear blank (this is a nickel bug). See https://github.com/pgaskin/dictutil/issues/1 for more details.\n\nSee https://pgaskin.net/dictutil/dictgen for more information about the dictfile format.\n", os.Args[0], version, pflag.CommandLine.FlagUsages())
 35 | 		os.Exit(0)
 36 | 		return
 37 | 	}
 38 | 	// Build the optional Crypter from --crypt, which is split at the first ':' into a method and a hex-encoded key.
 39 | 	var e kobodict.Crypter
 40 | 	if *crypt != "" {
 41 | 		if spl := strings.SplitN(*crypt, ":", 2); len(spl) < 2 {
 42 | 			fmt.Fprintf(os.Stderr, "Error: invalid format for --crypt: no ':' found.\n")
 43 | 			os.Exit(2)
 44 | 			return
 45 | 		} else if key, err := hex.DecodeString(spl[1]); err != nil {
 46 | 			fmt.Fprintf(os.Stderr, "Error: invalid format for --crypt: decode hex: %v.\n", err)
 47 | 			os.Exit(2)
 48 | 			return
 49 | 		} else if enc, err := kobodict.NewCrypter(spl[0], key); err != nil {
 50 | 			fmt.Fprintf(os.Stderr, "Error: invalid format for --crypt: initialize encrypter: %v.\n", err)
 51 | 			os.Exit(2)
 52 | 			return
 53 | 		} else {
 54 | 			e = enc
 55 | 		}
 56 | 	}
 57 | 	// Select the image handler; any other --image-method value is rejected with exit status 2.
 58 | 	var ih dictgen.ImageHandler
 59 | 	switch *imageMethod {
 60 | 	case "base64":
 61 | 		ih = new(dictgen.ImageHandlerBase64)
 62 | 	case "embed":
 63 | 		ih = new(dictgen.ImageHandlerEmbed)
 64 | 	case "remove":
 65 | 		ih = new(dictgen.ImageHandlerRemove)
 66 | 	default:
 67 | 		fmt.Fprintf(os.Stderr, "Error: invalid value for --image-method, see --help for details.\n")
 68 | 		os.Exit(2)
 69 | 		return
 70 | 	}
 71 | 	// tdf accumulates the entries of all inputs, in argument order.
 72 | 	var tdf dictgen.DictFile
 73 | 
 74 | 	fmt.Fprintf(os.Stderr, "Parsing dictfiles.\n")
 75 | 	var seenStdin bool
 76 | 	for _, fn := range pflag.Args() {
 77 | 		if fn == "-" {
 78 | 			if seenStdin {
 79 | 				fmt.Fprintf(os.Stderr, "Error: stdin can only be specified once.\n")
 80 | 				os.Exit(1)
 81 | 				return
 82 | 			}
 83 | 			seenStdin = true
 84 | 		}
 85 | 		// The closure scopes the deferred Close to this one input instead of to the whole of main.
 86 | 		if err := func() error {
 87 | 			var fr io.Reader
 88 | 			if fn == "-" {
 89 | 				fr = os.Stdin
 90 | 			} else {
 91 | 				f, err := os.OpenFile(fn, os.O_RDONLY, 0)
 92 | 				if err != nil {
 93 | 					return err
 94 | 				}
 95 | 				defer f.Close()
 96 | 				fr = f
 97 | 			}
 98 | 
 99 | 			if df, err := dictgen.ParseDictFile(fr); err != nil {
100 | 				return err
101 | 			} else if err := df.Validate(); err != nil {
102 | 				return err
103 | 			} else {
104 | 				tdf = append(tdf, df...)
105 | 			}
106 | 
107 | 			return nil
108 | 		}(); err != nil {
109 | 			fmt.Fprintf(os.Stderr, "Error: input %#v: %v.\n", fn, err)
110 | 			os.Exit(1)
111 | 			return
112 | 		}
113 | 	}
114 | 	// NOTE(review): the loop below only takes effect if tdf's elements are pointers (dfe is a copy of the element) — confirm.
115 | 	if *removeFooter {
116 | 		fmt.Fprintf(os.Stderr, "Appending HTML code to remove entry footers (note: you don't need this and should not use it unless you are replacing a dictionary which adds it, such as the French one).\n")
117 | 		for _, dfe := range tdf {
118 | 			dfe.PostRawHTML += `<span class="end"><style>.end,.end+*{display: none !important;}</style></span>`
119 | 		}
120 | 	}
121 | 	// An output of "-" writes the dictzip to stdout; anything else is created or truncated as a file.
122 | 	fmt.Fprintf(os.Stderr, "Opening output.\n")
123 | 	var f io.WriteCloser
124 | 	switch *output {
125 | 	case "-":
126 | 		f = os.Stdout
127 | 	default:
128 | 		ff, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
129 | 		if err != nil {
130 | 			fmt.Fprintf(os.Stderr, "Error: create dictzip: %v\n", err)
131 | 			os.Exit(1)
132 | 			return
133 | 		}
134 | 		f = ff
135 | 	}
136 | 	// f is closed explicitly on every path below because os.Exit does not run deferred calls.
137 | 	fmt.Fprintf(os.Stderr, "Generating dictzip.\n")
138 | 	dw := kobodict.NewWriter(f)
139 | 	dw.SetEncrypter(e)
140 | 	if e != nil {
141 | 		fmt.Fprintf(os.Stderr, "  Using encryption.\n")
142 | 	}
143 | 	if ih != nil {
144 | 		fmt.Fprintf(os.Stderr, "  Using image method: %s.\n", ih.Description())
145 | 	}
146 | 	if err := tdf.WriteDictzip(dw, ih, dictgen.ImageFuncFilesystem); err != nil {
147 | 		f.Close()
148 | 		fmt.Fprintf(os.Stderr, "Error: write dictzip: %v\n", err)
149 | 		os.Exit(1)
150 | 		return
151 | 	} else if err := dw.Close(); err != nil {
152 | 		f.Close()
153 | 		fmt.Fprintf(os.Stderr, "Error: write dictzip: %v\n", err)
154 | 		os.Exit(1)
155 | 		return
156 | 	} else if err := f.Close(); err != nil {
157 | 		fmt.Fprintf(os.Stderr, "Error: write dictzip: %v\n", err)
158 | 		os.Exit(1)
159 | 		return
160 | 	}
161 | 
162 | 	fmt.Fprintf(os.Stderr, "Successfully wrote %d entries from %d dictfile(s) to dictzip %s.\n", len(tdf), pflag.NArg(), *output)
163 | 	os.Exit(0)
164 | }
165 | 


--------------------------------------------------------------------------------
/cmd/dictutil/main.go:
--------------------------------------------------------------------------------
 1 | // Command dictutil provides commands for installing, removing, unpacking,
 2 | // packing, and performing low-level modifications and tests on Kobo
 3 | // dictionaries.
 4 | package main
 5 | 
 6 | import (
 7 | 	"fmt"
 8 | 	"os"
 9 | 	"sort"
10 | 
11 | 	"github.com/spf13/pflag"
12 | 
13 | 	_ "github.com/pgaskin/dictutil/kobodict/marisa"
14 | )
15 | 
16 | var version = "dev"
17 | 
18 | var commands []*command
19 | // command describes one dictutil subcommand; Main receives the display name as args[0] and its result is used as the process exit status.
20 | type command struct {
21 | 	Name        string // full name, matched against os.Args[1] and shown in the help
22 | 	Short       string // alias accepted in place of Name
23 | 	Description string // one-line summary shown in the global help
24 | 	Main        func(args []string, fs *pflag.FlagSet) int
25 | }
26 | // main dispatches os.Args[1] to the matching subcommand (by name or short alias) and exits with the status it returns.
27 | func main() {
28 | 	sort.Slice(commands, func(i, j int) bool {
29 | 		return commands[i].Name < commands[j].Name
30 | 	})
31 | 	// Index the commands by both name and alias; a collision is a programming error, hence the panic.
32 | 	cmdMap := map[string]*command{}
33 | 	for _, cmd := range commands {
34 | 		for _, v := range []string{cmd.Name, cmd.Short} {
35 | 			if _, seen := cmdMap[v]; seen {
36 | 				panic("command already set: " + v)
37 | 			}
38 | 			cmdMap[v] = cmd
39 | 		}
40 | 	}
41 | 	// With no arguments at all, show the global help.
42 | 	if len(os.Args) < 2 {
43 | 		globalHelp()
44 | 		os.Exit(0)
45 | 	}
46 | 	// "help" prints the global help followed by the --help output of every command; unknown commands only get the global help.
47 | 	if os.Args[1] == "help" {
48 | 		globalHelp()
49 | 		for _, cmd := range commands {
50 | 			fmt.Fprintf(os.Stderr, "\n### Help for %s:\n\n", cmd.Name) // stderr, like globalHelp, so the headings stay with the help text
51 | 			z := os.Args[0] + " " + cmd.Name
52 | 			cmd.Main([]string{z, "--help"}, pflag.NewFlagSet(z, pflag.ExitOnError))
53 | 		}
54 | 	} else if cmd, ok := cmdMap[os.Args[1]]; !ok {
55 | 		globalHelp()
56 | 		os.Exit(0)
57 | 	} else {
58 | 		args := append([]string{os.Args[0] + " " + os.Args[1]}, os.Args[2:]...)
59 | 		fs := pflag.NewFlagSet(args[0], pflag.ExitOnError)
60 | 		os.Exit(cmd.Main(args, fs))
61 | 	}
62 | }
63 | // globalHelp writes the usage line, the version, and the list of registered commands (plus the built-in help command) to stderr.
64 | func globalHelp() {
65 | 	fmt.Fprintf(os.Stderr, "Usage: %s command [options] [arguments]\n\nDictutil provides low-level utilities to manipulate Kobo dictionaries (v2).\n\nVersion: dictutil %s\n\nCommands:\n", os.Args[0], version)
66 | 	for _, cmd := range commands {
67 | 		fmt.Fprintf(os.Stderr, "  %-20s %s\n", fmt.Sprintf("%s (%s)", cmd.Name, cmd.Short), cmd.Description)
68 | 	}
69 | 	fmt.Fprintf(os.Stderr, "  %-20s %s\n", "help", "Show help for all commands")
70 | 	fmt.Fprintf(os.Stderr, "\nOptions:\n  -h, --help   Show this help text\n")
71 | }
72 | 


--------------------------------------------------------------------------------
/cmd/dictutil/pack.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"encoding/hex"
  5 | 	"fmt"
  6 | 	"io/ioutil"
  7 | 	"os"
  8 | 	"path/filepath"
  9 | 	"runtime"
 10 | 	"strings"
 11 | 
 12 | 	"github.com/pgaskin/dictutil/kobodict"
 13 | 	"github.com/spf13/pflag"
 14 | )
 15 | // init registers the pack subcommand (alias "p") in the package-level commands list.
 16 | func init() {
 17 | 	commands = append(commands, &command{Name: "pack", Short: "p", Description: "Pack a dictzip file", Main: packMain})
 18 | }
 19 | // packMain implements the pack subcommand: it packs the dictdir given as the only argument into a dictzip at --output and returns the exit status.
 20 | func packMain(args []string, fs *pflag.FlagSet) int {
 21 | 	fs.SortFlags = false
 22 | 	output := fs.StringP("output", "o", "dicthtml.zip", "The output dictzip filename (will be overwritten if it exists)")
 23 | 	crypt := fs.StringP("crypt", "c", "", "Encrypt the dictzip using the specified encryption method (format: method:keyhex)")
 24 | 	help := fs.BoolP("help", "h", false, "Show this help text")
 25 | 	fs.Parse(args[1:])
 26 | 	// Exactly one dictdir argument is required; otherwise the usage text is shown.
 27 | 	if *help || fs.NArg() != 1 {
 28 | 		fmt.Fprintf(os.Stderr, "Usage: %s [options] dictdir\n\nOptions:\n%s", args[0], fs.FlagUsages())
 29 | 		return 0
 30 | 	}
 31 | 	// Build the optional Crypter from --crypt, which is split at the first ':' into a method and a hex-encoded key.
 32 | 	var c kobodict.Crypter
 33 | 	if *crypt != "" {
 34 | 		if spl := strings.SplitN(*crypt, ":", 2); len(spl) < 2 {
 35 | 			fmt.Fprintf(os.Stderr, "Error: invalid format for --crypt: no ':' found.\n")
 36 | 			return 2
 37 | 		} else if key, err := hex.DecodeString(spl[1]); err != nil {
 38 | 			fmt.Fprintf(os.Stderr, "Error: invalid format for --crypt: decode hex: %v.\n", err)
 39 | 			return 2
 40 | 		} else if enc, err := kobodict.NewCrypter(spl[0], key); err != nil {
 41 | 			fmt.Fprintf(os.Stderr, "Error: invalid format for --crypt: initialize encrypter: %v.\n", err)
 42 | 			return 2
 43 | 		} else {
 44 | 			c = enc
 45 | 		}
 46 | 	}
 47 | 
 48 | 	fn, err := filepath.Abs(fs.Args()[0])
 49 | 	if err != nil {
 50 | 		fmt.Fprintf(os.Stderr, "Error: resolve input path %#v: %v.\n", fs.Args()[0], err)
 51 | 		return 2
 52 | 	}
 53 | 
 54 | 	ofn, err := filepath.Abs(*output)
 55 | 	if err != nil {
 56 | 		fmt.Fprintf(os.Stderr, "Error: resolve output path %#v: %v.\n", *output, err)
 57 | 		return 2
 58 | 	}
 59 | 
 60 | 	if fi, err := os.Stat(fn); err != nil {
 61 | 		fmt.Fprintf(os.Stderr, "Error: inaccessible input dir %#v: %v.\n", fn, err)
 62 | 		return 2
 63 | 	} else if !fi.IsDir() {
 64 | 		fmt.Fprintf(os.Stderr, "Error: input %#v is not a dir.\n", fn)
 65 | 		return 2
 66 | 	}
 67 | 	// Write to a temp file in the destination directory and rename it into place only once it is complete.
 68 | 	fmt.Printf("Creating output temp file\n")
 69 | 	f, err := ioutil.TempFile(filepath.Dir(ofn), "tmp_dicthtml.*.zip")
 70 | 	if err != nil {
 71 | 		fmt.Fprintf(os.Stderr, "Error: create output temp file: %v.\n", err)
 72 | 		return 2
 73 | 	}
 74 | 	defer os.Remove(f.Name())
 75 | 	defer f.Close()
 76 | 
 77 | 	fmt.Printf("Packing dictzip.\n")
 78 | 	dw := kobodict.NewWriter(f)
 79 | 	defer dw.Close()
 80 | 	// NOTE(review): dw and f are also closed explicitly below, so the deferred calls close them a second time on success — confirm that is harmless.
 81 | 	dw.SetEncrypter(c)
 82 | 
 83 | 	if err := kobodict.Pack(dw, fn); err != nil {
 84 | 		fmt.Fprintf(os.Stderr, "Error: pack input dir %#v to %#v: %v.\n", fn, ofn, err)
 85 | 		return 1
 86 | 	}
 87 | 	// Close explicitly so that an error while finishing the dictzip is reported; the deferred Close covers the early returns.
 88 | 	if err := dw.Close(); err != nil {
 89 | 		fmt.Fprintf(os.Stderr, "Error: pack input dir %#v to %#v: %v.\n", fn, ofn, err)
 90 | 		return 1
 91 | 	}
 92 | 
 93 | 	fmt.Printf("Renaming output file.\n")
 94 | 	if err := f.Chmod(0644); err != nil && runtime.GOOS != "windows" {
 95 | 		fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err)
 96 | 		return 2
 97 | 	}
 98 | 	if err := f.Sync(); err != nil {
 99 | 		fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err)
100 | 		return 2
101 | 	}
102 | 	if err := f.Close(); err != nil {
103 | 		fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err)
104 | 		return 2
105 | 	}
106 | 	if err := os.Rename(f.Name(), ofn); err != nil { // this will replace existing files properly on Go1.5+
107 | 		fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err)
108 | 		return 2
109 | 	}
110 | 
111 | 	fmt.Printf("Successfully packed dictdir %#v to dictzip %#v.\n", fn, ofn)
112 | 	return 0
113 | }
114 | 


--------------------------------------------------------------------------------
/cmd/dictutil/prefix.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
import (
	"bytes"
	"encoding/csv"
	"encoding/json"
	"fmt"
	"os"

	"github.com/pgaskin/dictutil/kobodict"
	"github.com/spf13/pflag"
)
10 | 
11 | func init() {
12 | 	commands = append(commands, &command{Name: "prefix", Short: "x", Description: "Calculate the prefix for a word", Main: prefixMain})
13 | }
14 | 
15 | func prefixMain(args []string, fs *pflag.FlagSet) int {
16 | 	fs.SortFlags = false
17 | 	format := fs.StringP("format", "f", "json-array", "The output format (go-slice, go-map, csv, tsv, json-array, json-object)")
18 | 	help := fs.BoolP("help", "h", false, "Show this help text")
19 | 	fs.Parse(args[1:])
20 | 
21 | 	if *help || fs.NArg() == 0 {
22 | 		fmt.Fprintf(os.Stderr, "Usage: %s [options] word...\n\nOptions:\n%s", args[0], fs.FlagUsages())
23 | 		return 0
24 | 	}
25 | 
26 | 	if *format != "go-slice" && *format != "go-map" && *format != "csv" && *format != "tsv" && *format != "json-array" && *format != "json-object" {
27 | 		fmt.Fprintf(os.Stderr, "Error: invalid format %#v, see --help for more details.\n", *format)
28 | 		return 2
29 | 	}
30 | 
31 | 	switch *format {
32 | 	case "go-slice":
33 | 		fmt.Printf("[][]string{\n")
34 | 	case "go-map":
35 | 		fmt.Printf("map[string]string{\n")
36 | 	case "csv", "tsv":
37 | 		break
38 | 	case "json-array":
39 | 		fmt.Printf("[\n")
40 | 	case "json-object":
41 | 		fmt.Printf("{\n")
42 | 	default:
43 | 		panic("invalid output format")
44 | 	}
45 | 
46 | 	for i, word := range fs.Args() {
47 | 		prefix := kobodict.WordPrefix(word)
48 | 		last := i == fs.NArg()-1
49 | 
50 | 		switch *format {
51 | 		case "go-slice":
52 | 			fmt.Printf("\t{%#v, %#v},\n", word, prefix)
53 | 		case "go-map":
54 | 			fmt.Printf("\t%#v: %#v,\n", word, prefix)
55 | 		case "csv":
56 | 			fmt.Printf("%s,%s\n", word, prefix)
57 | 		case "tsv":
58 | 			fmt.Printf("%s\t%s\n", word, prefix)
59 | 		case "json-array":
60 | 			fmt.Printf("    [%#v, %#v]", word, prefix)
61 | 			if last {
62 | 				fmt.Printf("\n")
63 | 			} else {
64 | 				fmt.Printf(",\n")
65 | 			}
66 | 		case "json-object":
67 | 			fmt.Printf("    %#v: %#v", word, prefix)
68 | 			if last {
69 | 				fmt.Printf("\n")
70 | 			} else {
71 | 				fmt.Printf(",\n")
72 | 			}
73 | 		default:
74 | 			panic("invalid output format")
75 | 		}
76 | 	}
77 | 
78 | 	switch *format {
79 | 	case "csv", "tsv":
80 | 		break
81 | 	case "json-array":
82 | 		fmt.Printf("]\n")
83 | 	case "json-object", "go-slice", "go-map":
84 | 		fmt.Printf("}\n")
85 | 	default:
86 | 		panic("invalid output format")
87 | 	}
88 | 
89 | 	return 0
90 | }
91 | 


--------------------------------------------------------------------------------
/cmd/dictutil/uninstall.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"bytes"
  6 | 	"database/sql"
  7 | 	"fmt"
  8 | 	"io"
  9 | 	"net/http"
 10 | 	"os"
 11 | 	"path/filepath"
 12 | 	"regexp"
 13 | 	"sort"
 14 | 	"strings"
 15 | 
 16 | 	"github.com/pgaskin/koboutils/v2/kobo"
 17 | 	"github.com/spf13/pflag"
 18 | )
 19 | 
 20 | func init() {
 21 | 	commands = append(commands, &command{Name: "uninstall", Short: "U", Description: "Uninstall a dictzip file", Main: uninstallMain})
 22 | }
 23 | 
 24 | func uninstallMain(args []string, fs *pflag.FlagSet) int {
 25 | 	fs.SortFlags = false
 26 | 	root := fs.StringP("kobo", "k", "", "KOBOeReader path (default: automatically detected)")
 27 | 	builtin := fs.StringP("builtin", "b", "normal", "How to handle built-in locales [normal = uninstall the same way as the UI] [delete = completely delete the entry (doesn't have any effect on 4.20.14601+)] [restore = download the original dictionary from Kobo again] (doesn't have any effect on 4.24.15672+)")
 28 | 	noCustom := fs.BoolP("no-custom", "B", false, "Uninstall built-in dictionaries instead of custom ones on 4.24.15672+")
 29 | 	help := fs.BoolP("help", "h", false, "Show this help text")
 30 | 	fs.Parse(args[1:])
 31 | 
 32 | 	if *help || fs.NArg() != 1 {
 33 | 		fmt.Fprintf(os.Stderr, "Usage: %s [options] locale|dicthtml-name.zip\n\nOptions:\n%s\n", args[0], fs.FlagUsages())
 34 | 		builtinHelp()
 35 | 		return 0
 36 | 	}
 37 | 
 38 | 	if *builtin != "normal" && *builtin != "delete" && *builtin != "restore" {
 39 | 		fmt.Fprintf(os.Stderr, "Error: invalid built-in dictionary mode %#v, see --help for more details.\n", *builtin)
 40 | 		return 2
 41 | 	}
 42 | 
 43 | 	kobopath, version, err := findDevice(*root)
 44 | 	if err != nil {
 45 | 		fmt.Fprintf(os.Stderr, "Error: could not detect a Kobo eReader (you can specify one manually with --kobo): %v.\n", err)
 46 | 		return 1
 47 | 	}
 48 | 
 49 | 	fmt.Printf("Found Kobo eReader at %s with firmware version %s.\n", kobopath, version)
 50 | 	if kobo.VersionCompare(version, "4.7.10364") < 0 {
 51 | 		fmt.Fprintf(os.Stderr, "Error: firmware version too old (v2 dictionaries were only introduced in 4.7.10364).\n")
 52 | 		return 1
 53 | 	}
 54 | 
 55 | 	fw14601 := kobo.VersionCompare(version, "4.20.14601") >= 0 // https://github.com/pgaskin/kobopatch-patches/issues/49
 56 | 	fw15672 := kobo.VersionCompare(version, "4.24.15672") >= 0 // https://github.com/pgaskin/kobopatch-patches/issues/76
 57 | 
 58 | 	var dictPath, dictLocale string
 59 | 	if dictLocale = strings.TrimLeft(fs.Args()[0], "-"); dictLocale == "en" {
 60 | 		if fw15672 && !*noCustom {
 61 | 			dictPath = filepath.Join(kobopath, ".kobo", "custom-dict", "dicthtml.zip")
 62 | 		} else {
 63 | 			dictPath = filepath.Join(kobopath, ".kobo", "dict", "dicthtml.zip")
 64 | 		}
 65 | 	} else if regexp.MustCompile(`^[a-zA-Z0-9-]+$`).MatchString(dictLocale) {
 66 | 		if fw15672 && !*noCustom {
 67 | 			dictPath = filepath.Join(kobopath, ".kobo", "custom-dict", "dicthtml-"+dictLocale+".zip")
 68 | 		} else {
 69 | 			dictPath = filepath.Join(kobopath, ".kobo", "dict", "dicthtml-"+dictLocale+".zip")
 70 | 		}
 71 | 	} else {
 72 | 		fmt.Fprintf(os.Stderr, "Error: invalid locale name.\n")
 73 | 		return 1
 74 | 	}
 75 | 	dictSuffix := "-" + dictLocale
 76 | 	_, dictBuiltin := builtinDict[dictLocale]
 77 | 
 78 | 	fmt.Printf("Uninstalling dictionary %#v (locale: %s).\n\n", dictPath, dictLocale)
 79 | 
 80 | 	fmt.Printf("Updating database.\n")
 81 | 	if fw15672 {
 82 | 		// We won't bother to check the DB anymore since it's been a while since
 83 | 		// 4.20.14601, and everyone who would be confused by the dictionary
 84 | 		// table probaby would have already seen the message.
 85 | 		fmt.Printf("  No need to update dictionary table on 4.24.15672+, skipping.\n")
 86 | 	} else {
 87 | 		if err := func() error {
 88 | 			db, err := sql.Open("sqlite3", filepath.Join(kobopath, ".kobo", "KoboReader.sqlite"))
 89 | 			if err != nil {
 90 | 				return fmt.Errorf("open database: %w", err)
 91 | 			}
 92 | 			defer db.Close()
 93 | 
 94 | 			if exists, err := func() (bool, error) {
 95 | 				res, err := db.Query(`SELECT name FROM sqlite_master WHERE type="table" AND name="Dictionary";`)
 96 | 				if err != nil {
 97 | 					return false, fmt.Errorf("check dictionary table: %w", err)
 98 | 				}
 99 | 				defer res.Close()
100 | 
101 | 				if !res.Next() { // if no rows are returned, there was an error or the table didn't exist
102 | 					if err := res.Err(); err != nil {
103 | 						return false, fmt.Errorf("check dictionary table: %w", err)
104 | 					}
105 | 					return false, nil
106 | 				}
107 | 				return true, nil
108 | 			}(); err != nil {
109 | 				return fmt.Errorf("check dictionary table: %w", err)
110 | 			} else if exists {
111 | 				if fw14601 {
112 | 					fmt.Printf("  Note: the dictionary table is unnecessary and inconsequential in firmware 4.20.14601+ and can be safely removed.\n")
113 | 				}
114 | 			} else {
115 | 				if fw14601 {
116 | 					// show a message to prevent confusion
117 | 					fmt.Printf("  No need to update dictionary table on 4.20.14601+, skipping.\n")
118 | 					return nil
119 | 				} else {
120 | 					return fmt.Errorf("check dictionary table: not found, and version < 4.20.14123")
121 | 				}
122 | 			}
123 | 
124 | 			if !dictBuiltin || *builtin == "delete" {
125 | 				if res, err := db.Exec("DELETE FROM Dictionary WHERE Suffix = ?", dictSuffix); err != nil {
126 | 					return fmt.Errorf("delete row from database: %w", err)
127 | 				} else if ra, _ := res.RowsAffected(); ra == 0 {
128 | 					fmt.Printf("  Row already removed from database (suffix=%s).\n", dictSuffix)
129 | 				} else {
130 | 					fmt.Printf("  Removed row from database (suffix=%s).\n", dictSuffix)
131 | 				}
132 | 			}
133 | 
134 | 			if dictBuiltin && *builtin == "normal" {
135 | 				if _, err := db.Exec("UPDATE Dictionary SET Installed = ? WHERE Suffix = ?", "false", dictSuffix); err != nil {
136 | 					return fmt.Errorf("update row in database: %w", err)
137 | 				} else {
138 | 					fmt.Printf("  Set IsInstalled to false in database for built-in dictionary (suffix=%s).\n", dictSuffix)
139 | 				}
140 | 			}
141 | 
142 | 			if dictBuiltin && *builtin == "restore" {
143 | 				if _, err := db.Exec("UPDATE Dictionary SET Installed = ? WHERE Suffix = ?", "true", dictSuffix); err != nil {
144 | 					return fmt.Errorf("update row in database: %w", err)
145 | 				} else {
146 | 					fmt.Printf("  Set IsInstalled to true in database for built-in dictionary (suffix=%s).\n", dictSuffix)
147 | 				}
148 | 			}
149 | 
150 | 			if err := db.Close(); err != nil {
151 | 				return fmt.Errorf("close database: %w", err)
152 | 			}
153 | 
154 | 			return nil
155 | 		}(); err != nil {
156 | 			fmt.Fprintf(os.Stderr, "Error: update database: %v.\n", err)
157 | 			return 1
158 | 		}
159 | 	}
160 | 
161 | 	fmt.Printf("Updating ExtraLocales.\n")
162 | 	if dictBuiltin {
163 | 		fmt.Printf("  No need; built-in dictionary.\n")
164 | 	} else {
165 | 		if err := func() error {
166 | 			cfg := filepath.Join(kobopath, ".kobo", "Kobo", "Kobo eReader.conf")
167 | 
168 | 			f, err := os.OpenFile(cfg, os.O_RDONLY, 0)
169 | 			if err != nil {
170 | 				return fmt.Errorf("open config file: %w", err)
171 | 			}
172 | 			defer f.Close()
173 | 
174 | 			var locales []string
175 | 			var filtered bool
176 | 			buf := bytes.NewBuffer(nil)
177 | 
178 | 			fs := bufio.NewScanner(f)
179 | 			for fs.Scan() {
180 | 				if bytes.HasPrefix(fs.Bytes(), []byte("ExtraLocales=")) {
181 | 					for _, loc := range strings.Split(strings.SplitN(fs.Text(), "=", 2)[1], ",") {
182 | 						loc = strings.TrimSpace(loc)
183 | 						if loc == dictLocale {
184 | 							filtered = true
185 | 						} else {
186 | 							locales = append(locales, loc)
187 | 						}
188 | 					}
189 | 					continue
190 | 				}
191 | 				_, _ = buf.Write(fs.Bytes()) // err is always nil
192 | 				buf.WriteRune('\n')
193 | 			}
194 | 
195 | 			if !filtered {
196 | 				fmt.Printf("  Locale %#v already removed from ExtraLocales (or wasn't there to begin with).\n", dictLocale)
197 | 				return nil
198 | 			}
199 | 
200 | 			fmt.Printf("  Removing locale %#v from ExtraLocales.\n", dictLocale)
201 | 			sort.Strings(locales)
202 | 
203 | 			buf.WriteString("\n[ApplicationPreferences]\n") // this will get merged by Qt
204 | 			buf.WriteString("ExtraLocales=" + strings.Join(locales, ","))
205 | 
206 | 			f.Close()
207 | 
208 | 			fo, err := os.OpenFile(cfg+".tmp", os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
209 | 			if err != nil {
210 | 				return fmt.Errorf("open new config file: %w", err)
211 | 			}
212 | 			defer os.Remove(cfg + ".tmp")
213 | 			defer fo.Close()
214 | 
215 | 			if _, err := fo.Write(buf.Bytes()); err != nil {
216 | 				return fmt.Errorf("write new config file: %w", err)
217 | 			}
218 | 
219 | 			if err := fo.Sync(); err != nil {
220 | 				return fmt.Errorf("write new config file: %w", err)
221 | 			}
222 | 
223 | 			if err := fo.Close(); err != nil {
224 | 				return fmt.Errorf("write new config file: %w", err)
225 | 			}
226 | 
227 | 			if err := os.Rename(cfg+".tmp", cfg); err != nil {
228 | 				return fmt.Errorf("rename new config file: %w", err)
229 | 			}
230 | 
231 | 			return nil
232 | 		}(); err != nil {
233 | 			fmt.Fprintf(os.Stderr, "Error: update ExtraLocales: %v.\n", err)
234 | 			return 1
235 | 		}
236 | 	}
237 | 
238 | 	fmt.Printf("Removing dictzip.\n")
239 | 	if err := os.Remove(dictPath); os.IsNotExist(err) { // this will still remove it if it's readonly on Windows (golang/go@2ffb3e5d905b5622204d199128dec06cefd57790)
240 | 		fmt.Printf("  Already removed.\n")
241 | 	} else if err != nil {
242 | 		fmt.Fprintf(os.Stderr, "Error: remove dictzip: %v.\n", err)
243 | 		return 1
244 | 	} else {
245 | 		fmt.Printf("  Removed.\n")
246 | 	}
247 | 
248 | 	if *builtin == "restore" {
249 | 		// TODO: reconsider whether this belongs in uninstall, as:
250 | 		//  - This doesn't update the file size.
251 | 		//  - This doesn't ensure there is actually a DB entry for the restored
252 | 		//    dict.
253 | 		//  - This isn't really uninstalling.
254 | 		//  - It might not even belong in dictutil at all because the URLs may
255 | 		//    change (and it isn't that hard to manually download a dictionary
256 | 		//    to install it with dictutil install)
257 | 
258 | 		url := "https://kbdownload1-a.akamaihd.net/ereader/dictionaries/v2/"
259 | 		if fw15672 {
260 | 			url = "https://kbdownload1-a.akamaihd.net/ereader/dictionaries/v3/"
261 | 		}
262 | 		url += filepath.Base(dictPath)
263 | 
264 | 		fmt.Printf("Restoring original dictionary from %#v.\n", url)
265 | 
266 | 		if err := func() error {
267 | 			resp, err := http.Get(url)
268 | 			if err != nil {
269 | 				return fmt.Errorf("get dictionary: %w", err)
270 | 			}
271 | 			defer resp.Body.Close()
272 | 
273 | 			if resp.StatusCode != http.StatusOK {
274 | 				return fmt.Errorf("get dictionary: response status %s", resp.Status)
275 | 			}
276 | 
277 | 			df, err := os.OpenFile(dictPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
278 | 			if err != nil {
279 | 				return fmt.Errorf("open output dictzip: %w", err)
280 | 			}
281 | 			defer df.Close()
282 | 
283 | 			if _, err := io.Copy(df, resp.Body); err != nil {
284 | 				return fmt.Errorf("write output dictzip: %w", err)
285 | 			}
286 | 
287 | 			if err := df.Close(); err != nil {
288 | 				return fmt.Errorf("write output dictzip: %w", err)
289 | 			}
290 | 
291 | 			return nil
292 | 		}(); err != nil {
293 | 			fmt.Fprintf(os.Stderr, "Error: download dictionary: %v.\n", err)
294 | 			return 1
295 | 		}
296 | 	}
297 | 
298 | 	fmt.Printf("\nSuccessfully uninstalled dictionary for locale %s.\n", dictLocale)
299 | 
300 | 	return 0
301 | }
302 | 


--------------------------------------------------------------------------------
/cmd/dictutil/unpack.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"encoding/hex"
 5 | 	"fmt"
 6 | 	"os"
 7 | 	"path/filepath"
 8 | 	"strings"
 9 | 
10 | 	"github.com/pgaskin/dictutil/kobodict"
11 | 	"github.com/spf13/pflag"
12 | )
13 | 
14 | func init() {
15 | 	commands = append(commands, &command{Name: "unpack", Short: "u", Description: "Unpack a dictzip file", Main: unpackMain})
16 | }
17 | 
18 | func unpackMain(args []string, fs *pflag.FlagSet) int {
19 | 	fs.SortFlags = false
20 | 	output := fs.StringP("output", "o", "", "The output directory (must not exist) (default: the basename of the input without the extension)")
21 | 	crypt := fs.StringP("crypt", "c", "", "Decrypt the dictzip (if needed) using the specified encryption method (format: method:keyhex)")
22 | 	help := fs.BoolP("help", "h", false, "Show this help text")
23 | 	fs.Parse(args[1:])
24 | 
25 | 	if *help || fs.NArg() != 1 {
26 | 		fmt.Fprintf(os.Stderr, "Usage: %s [options] dictzip\n\nOptions:\n%s", args[0], fs.FlagUsages())
27 | 		return 0
28 | 	}
29 | 
30 | 	var c kobodict.Crypter
31 | 	if *crypt != "" {
32 | 		if spl := strings.SplitN(*crypt, ":", 2); len(spl) < 2 {
33 | 			fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: no ':' found.\n")
34 | 			return 2
35 | 		} else if key, err := hex.DecodeString(spl[1]); err != nil {
36 | 			fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: decode hex: %v.\n", err)
37 | 			return 2
38 | 		} else if enc, err := kobodict.NewCrypter(spl[0], key); err != nil {
39 | 			fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: initialize encrypter: %v.\n", err)
40 | 			return 2
41 | 		} else {
42 | 			c = enc
43 | 		}
44 | 	}
45 | 
46 | 	fn, err := filepath.Abs(fs.Args()[0])
47 | 	if err != nil {
48 | 		fmt.Fprintf(os.Stderr, "Error: resolve input path %#v: %v.\n", fs.Args()[0], err)
49 | 		return 2
50 | 	}
51 | 
52 | 	ofn := *output
53 | 	if ofn == "" {
54 | 		ofn = strings.TrimSuffix(filepath.Base(fn), filepath.Ext(fn))
55 | 	}
56 | 
57 | 	fmt.Printf("Opening input dictzip.\n")
58 | 	f, err := os.Open(fn)
59 | 	if err != nil {
60 | 		fmt.Fprintf(os.Stderr, "Error: open input file %#v: %v.\n", fn, err)
61 | 		return 1
62 | 	}
63 | 	defer f.Close()
64 | 
65 | 	s, err := f.Stat()
66 | 	if err != nil {
67 | 		fmt.Fprintf(os.Stderr, "Error: stat input file %#v: %v.\n", fn, err)
68 | 		return 1
69 | 	}
70 | 
71 | 	fmt.Printf("Parsing dictzip.\n")
72 | 	dr, err := kobodict.NewReader(f, s.Size())
73 | 	if err != nil {
74 | 		fmt.Fprintf(os.Stderr, "Error: parse input file %#v: %v.\n", fn, err)
75 | 		return 1
76 | 	}
77 | 	dr.SetDecrypter(c)
78 | 
79 | 	fmt.Printf("Unpacking dictzip.\n")
80 | 	if err := kobodict.Unpack(dr, ofn); err != nil {
81 | 		fmt.Fprintf(os.Stderr, "Error: unpack input file %#v to %#v: %v.\n", fn, ofn, err)
82 | 		return 1
83 | 	}
84 | 
85 | 	fmt.Printf("Successfully unpacked dictzip %#v to dictdir %#v.\n", fn, ofn)
86 | 	return 0
87 | }
88 | 


--------------------------------------------------------------------------------
/dictgen/dictfile.go:
--------------------------------------------------------------------------------
  1 | package dictgen
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"strings"
  8 | 	"text/template"
  9 | )
 10 | 
// A DictFile is a high-level representation of a Kobo dictionary. It is a
// list of entries; ParseDictFile appends them in the order their headwords
// (@ lines) appear in the input.
type DictFile []*DictFileEntry
 13 | 
// DictFileEntry represents a single entry in the DictFile.
type DictFileEntry struct {
	Headword string   // the word being defined (the trimmed text after @ when parsed)
	Variant  []string // alternate forms of the headword (one per & line when parsed)

	NoHeader   bool   // set when the entry contains a :: line
	HeaderInfo string // the trimmed text after a single : (empty if none)

	RawHTML    bool   // set when the parsed definition began with <html>
	Definition string // the definition body, trimmed, without the leading <html> marker

	PostRawHTML string // will not be parsed or saved, only to be used for runtime additions before generating

	line int // for internal use if parsed, zero otherwise
}
 29 | 
 30 | // ParseDictFile parses a DictFile from it's textual representation (usually
 31 | // stored in a file with the extension .df).
 32 | func ParseDictFile(r io.Reader) (DictFile, error) {
 33 | 	var df DictFile
 34 | 	var dfe *DictFileEntry
 35 | 
 36 | 	br := bufio.NewScanner(r)
 37 | 	br.Buffer(make([]byte, 64*1024), 2048*1024) // start with a 64KiB buffer, but allow up to 2MiB (for dictfiles with long lines of raw HTML)
 38 | 	var line int
 39 | 
 40 | 	for br.Scan() {
 41 | 		buf := br.Bytes()
 42 | 		line++
 43 | 
 44 | 		if len(buf) == 0 {
 45 | 			// if in a block and after the metadata (in the definition),
 46 | 			// preserve the blank line
 47 | 			if dfe != nil && len(dfe.Definition) != 0 {
 48 | 				dfe.Definition += "\n"
 49 | 			}
 50 | 			continue
 51 | 		}
 52 | 
 53 | 		switch buf[0] {
 54 | 		case '@':
 55 | 			// start another one
 56 | 			dfe = new(DictFileEntry)
 57 | 
 58 | 			// add the headword and line info
 59 | 			dfe.Headword = strings.TrimSpace(string(buf[1:]))
 60 | 			dfe.line = line
 61 | 
 62 | 			// but error if the headword is blank (note that duplicates are
 63 | 			// acceptable, and encouraged in some cases; Kobo will merge it;
 64 | 			// try looking up 'be' in the English dictionary)
 65 | 			if len(dfe.Headword) == 0 {
 66 | 				return nil, fmt.Errorf("dictfile: line %d: empty headword after @", line)
 67 | 			}
 68 | 
 69 | 			// otherwise, add it to the dictfile (remember it's a pointer, it'll
 70 | 			// still get updated)
 71 | 			df = append(df, dfe)
 72 | 		case ':':
 73 | 			// if not in a block (before the first @), return an error
 74 | 			if dfe == nil {
 75 | 				return nil, fmt.Errorf("dictfile: line %d: header info (: or ::) specified before word (@)", line)
 76 | 			}
 77 | 
 78 | 			// if already after the metadata (in the definition), return an error
 79 | 			if len(dfe.Definition) != 0 {
 80 | 				return nil, fmt.Errorf("dictfile: line %d: header info (: or ::) specified within definition content (prepend a space if this was intended to be part of the definition itself)", line)
 81 | 			}
 82 | 
 83 | 			// if already seen the header info (a line starting with :)
 84 | 			if dfe.NoHeader || len(dfe.HeaderInfo) != 0 {
 85 | 				return nil, fmt.Errorf("dictfile: line %d: multiple header infos (: or ::) specified in definition block", line)
 86 | 			}
 87 | 
 88 | 			// put the trimmed text in the header info, or disable the header if
 89 | 			// it is ::
 90 | 			if len(buf) >= 2 {
 91 | 				if buf[1] == ':' {
 92 | 					if len(strings.TrimSpace(string(buf[2:]))) != 0 {
 93 | 						return nil, fmt.Errorf("dictfile: line %d: extra data after no header specified (::)", line)
 94 | 					}
 95 | 					dfe.NoHeader = true
 96 | 				} else {
 97 | 					dfe.HeaderInfo = strings.TrimSpace(string(buf[1:]))
 98 | 				}
 99 | 			} else {
100 | 				dfe.HeaderInfo = ""
101 | 			}
102 | 		case '&':
103 | 			// if not in a block, error
104 | 			if dfe == nil {
105 | 				return nil, fmt.Errorf("dictfile: line %d: variant (&) specified before word (@)", line)
106 | 			}
107 | 
108 | 			// if already after the metadata (in the definition), error
109 | 			if len(dfe.Definition) != 0 {
110 | 				return nil, fmt.Errorf("dictfile: line %d: variant (&) specified within definition content (prepend a space if this was intended to be part of the definition itself)", line)
111 | 			}
112 | 
113 | 			// trim the rest of the line (error if nothing left)
114 | 			v := strings.TrimSpace(string(buf[1:]))
115 | 			if len(v) == 0 {
116 | 				return nil, fmt.Errorf("dictfile: line %d: no word after variant specifier (&)", line)
117 | 			}
118 | 
119 | 			// and add it to the variant list
120 | 			dfe.Variant = append(dfe.Variant, v)
121 | 		default:
122 | 			// if not in a block, error
123 | 			if dfe == nil {
124 | 				return nil, fmt.Errorf("dictfile: line %d: definition specified before word (@)", line)
125 | 			}
126 | 
127 | 			// append the line to the definition
128 | 			dfe.Definition += string(buf) + "\n"
129 | 		}
130 | 	}
131 | 
132 | 	// check for read errors
133 | 	if err := br.Err(); err != nil {
134 | 		return nil, err
135 | 	}
136 | 
137 | 	// and finally, update the raw html flag and cleanup whitespace
138 | 	for _, dfe := range df {
139 | 		dfe.Definition = strings.TrimSpace(dfe.Definition)
140 | 
141 | 		if v := strings.TrimSpace(strings.TrimPrefix(dfe.Definition, "<html>")); v != dfe.Definition {
142 | 			if strings.HasSuffix(v, "</html>") {
143 | 				return nil, fmt.Errorf("dictfile: entry at line %d: raw HTML definitions are specified with <html>, but SHOULD NOT be a full HTML document ending with </html>", dfe.line)
144 | 			}
145 | 			dfe.RawHTML = true
146 | 			dfe.Definition = v
147 | 		} else if strings.Contains(dfe.Definition, "<html>") {
148 | 			return nil, fmt.Errorf("dictfile: entry at line %d: why does the definition contain a <html> tag ... to make it raw HTML, it should be at the very beginning", dfe.line)
149 | 		}
150 | 	}
151 | 
152 | 	// note: validation is done separately (and always done before generation)
153 | 
154 | 	return df, nil
155 | }
156 | 
157 | // Validate validates the entries in the DictFile. Note that duplicate entries
158 | // are fine, and are encouraged if necessary (Kobo will merge them).
159 | func (df DictFile) Validate() error {
160 | 	illegal := func(s string, word bool) error {
161 | 		if word && strings.Contains(s, "\"") {
162 | 			return fmt.Errorf("must not contain %#v", "\"")
163 | 		}
164 | 		for _, c := range []string{
165 | 			"<w", "</w",
166 | 			"<html", "</html",
167 | 			"<var", "</var",
168 | 			"<a name=",
169 | 		} {
170 | 			// TODO: optimize
171 | 			if strings.Contains(s, c) {
172 | 				return fmt.Errorf("must not contain %#v", c)
173 | 			}
174 | 		}
175 | 		return nil
176 | 	}
177 | 	for i, dfe := range df {
178 | 		if strings.TrimSpace(dfe.Headword) == "" {
179 | 			return fmt.Errorf("word %#v (i:%d, dfe:%#v): headword must not be blank", dfe.Headword, i, dfe)
180 | 		} else if err := illegal(dfe.Headword, true); err != nil {
181 | 			return fmt.Errorf("word %#v (i:%d): headword contains illegal string: %w", dfe.Headword, i, err)
182 | 		}
183 | 		for _, v := range dfe.Variant {
184 | 			if strings.TrimSpace(v) == "" {
185 | 				return fmt.Errorf("word %#v (i:%d): variant %#v must not be blank", dfe.Headword, i, v)
186 | 			} else if err := illegal(v, true); err != nil {
187 | 				return fmt.Errorf("word %#v (i:%d): variant %#v contains illegal string : %w", dfe.Headword, i, v, err)
188 | 			}
189 | 		}
190 | 		if err := illegal(dfe.HeaderInfo, false); err != nil {
191 | 			return fmt.Errorf("word %#v (i:%d): header info %#v contains illegal string : %w", dfe.Headword, i, dfe.HeaderInfo, err)
192 | 		}
193 | 		if err := illegal(dfe.Definition, false); err != nil {
194 | 			return fmt.Errorf("word %#v (i:%d): definition %#v contains illegal string : %w", dfe.Headword, i, dfe.Definition, err)
195 | 		}
196 | 	}
197 | 	return nil
198 | }
199 | 
200 | // WriteDictFile validates the DictFile and writes it to w in the dictfile
201 | // format.
202 | func (df DictFile) WriteDictFile(w io.Writer) error {
203 | 	if err := df.Validate(); err != nil {
204 | 		return err
205 | 	}
206 | 
207 | 	for _, dfe := range df {
208 | 		if err := dfe.writeDictFileEntry(w); err != nil {
209 | 			return err
210 | 		}
211 | 		// for consistency with template if git converted newlines
212 | 		if _, err := w.Write([]byte(`
213 | `)); err != nil {
214 | 			return err
215 | 		}
216 | 	}
217 | 	return nil
218 | }
219 | 
220 | // note: this assumes the entry is valid
221 | var dictFileEntryTmpl = template.Must(template.New("").Funcs(template.FuncMap{
222 | 	"dfesc": func(str string) string {
223 | 		return strings.NewReplacer(
224 | 			"\n@", "\n @",
225 | 			"\n:", "\n :",
226 | 			"\n&", "\n &",
227 | 		).Replace(str)
228 | 	},
229 | }).Parse(`
230 | {{- /* trim leading whitespace from template */ -}}
231 | 
232 | {{with .Headword}}@ {{.}}{{end -}}
233 | 
234 | {{with .NoHeader}}
235 | ::{{else}}{{with .HeaderInfo}}
236 | : {{.}}{{end}}{{end -}}
237 | 
238 | {{range .Variant}}
239 | & {{.}}{{end -}}
240 | 
241 | {{with .RawHTML}}
242 | <html>{{end -}}
243 | 
244 | {{with .Definition}}
245 | {{dfesc .}}{{end -}}
246 | 
247 | {{- /* keep trailing newline at end of template */}}
248 | `))
249 | 
250 | func (d DictFileEntry) writeDictFileEntry(w io.Writer) error {
251 | 	return dictFileEntryTmpl.Execute(w, d)
252 | }
253 | 


--------------------------------------------------------------------------------
/dictgen/dictfile_test.go:
--------------------------------------------------------------------------------
  1 | package dictgen
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"encoding/json"
  6 | 	"fmt"
  7 | 	"reflect"
  8 | 	"sort"
  9 | 	"strings"
 10 | 	"testing"
 11 | )
 12 | 
// testcase describes one input for TestDictFile along with the expected
// results of parsing it and writing it back out.
type testcase struct {
	What string // description of the case, used in log and failure messages

	In  string // the dictfile source passed to ParseDictFile
	Err error  // the expected parse error (compared by message), or nil if none

	Out DictFile // the expected result of parsing In (compared via its JSON encoding)

	OutDictFile string // the expected output of WriteDictFile for the parsed result
	OutKoboHTML string // the expected output of WriteKoboHTML for the parsed result
}
 24 | 
// testcases is the table of cases run by TestDictFile.
//
// TODO(v1): more specific tests
var testcases = []testcase{{
	What: "some of everything",
	// Covers an entry without a definition, header info and variants (with
	// and without a space after the marker), a raw HTML entry with no header,
	// an entry directly following another without a blank line, and a
	// multi-paragraph markdown definition.
	In: `@ blank

@ headword
: info
& variant1
&variant2
test
test

@ custom
& NORMALIZEME
::
<html>
<b>custom word:</b>
<p>test</p>
@ markdown
:-test
1. Definition point 1.
  - Blah
  - Blah
2. Blah blah blah.
3. Blah *blah* **blah**!

Blah blah blah.`,
	// The entries in the order they appear in the input.
	Out: DictFile{
		{Headword: "blank", Variant: []string(nil), NoHeader: false, HeaderInfo: "", RawHTML: false, Definition: "", line: 1},
		{Headword: "headword", Variant: []string{"variant1", "variant2"}, NoHeader: false, HeaderInfo: "info", RawHTML: false, Definition: "test\ntest", line: 3},
		{Headword: "custom", Variant: []string{"NORMALIZEME"}, NoHeader: true, HeaderInfo: "", RawHTML: true, Definition: "<b>custom word:</b>\n<p>test</p>", line: 10},
		{Headword: "markdown", Variant: []string(nil), NoHeader: false, HeaderInfo: "-test", RawHTML: false, Definition: "1. Definition point 1.\n  - Blah\n  - Blah\n2. Blah blah blah.\n3. Blah *blah* **blah**!\n\nBlah blah blah.", line: 16},
	},
	// The normalized form of the input: a space after each marker, the header
	// before the variants, and a blank line after every entry.
	OutDictFile: `@ blank

@ headword
: info
& variant1
& variant2
test
test

@ custom
::
& NORMALIZEME
<html>
<b>custom word:</b>
<p>test</p>

@ markdown
: -test
1. Definition point 1.
  - Blah
  - Blah
2. Blah blah blah.
3. Blah *blah* **blah**!

Blah blah blah.

`,
	// Here the entries appear in the order blank, custom, headword, markdown,
	// and the variant NORMALIZEME is expected in lowercase.
	OutKoboHTML: `<html><w><p><a name="blank" /><b>blank</b></p><var></var></w><w><a name="custom" /><var><variant name="normalizeme"/></var><b>custom word:</b>
<p>test</p></w><w><p><a name="headword" /><b>headword</b> info</p><var><variant name="variant1"/><variant name="variant2"/></var><p>test
test</p></w><w><p><a name="markdown" /><b>markdown</b> -test</p><var></var><ol>
<li>Definition point 1.

<ul>
<li>Blah</li>
<li>Blah</li>
</ul></li>
<li>Blah blah blah.</li>
<li>Blah <em>blah</em> <strong>blah</strong>!</li>
</ol>

<p>Blah blah blah.</p></w></html>`,
}}
100 | 
101 | func TestDictFile(t *testing.T) {
102 | 	for _, tc := range testcases {
103 | 		t.Logf("case %#v", tc.What)
104 | 
105 | 		df, err := ParseDictFile(strings.NewReader(tc.In))
106 | 		if tc.Err == nil && err != nil {
107 | 			t.Fatalf("case %#v: parse dictfile: unexpected error: %v", tc.What, err)
108 | 		} else if tc.Err != nil && err == nil {
109 | 			t.Fatalf("case %#v: parse dictfile: expected error (%v)", tc.What, tc.Err)
110 | 		} else if tc.Err != nil && tc.Err.Error() != err.Error() {
111 | 			t.Fatalf("case %#v: parse dictfile: expected error (%v), got: %v", tc.What, tc.Err, err)
112 | 		}
113 | 
114 | 		exp, err := json.MarshalIndent(tc.Out, "| ", "    ")
115 | 		if err != nil {
116 | 			panic(err)
117 | 		}
118 | 
119 | 		act, err := json.MarshalIndent(df, "| ", "    ")
120 | 		if err != nil {
121 | 			panic(err)
122 | 		}
123 | 
124 | 		if !reflect.DeepEqual(exp, act) {
125 | 			for _, dfe := range df {
126 | 				fmt.Printf("%#v,\n", dfe)
127 | 			}
128 | 			t.Fatalf("case %#v: expected:\n%s\n\ngot:\n%s", tc.What, exp, act)
129 | 		}
130 | 
131 | 		buf := bytes.NewBuffer(nil)
132 | 		if err := df.WriteDictFile(buf); err != nil {
133 | 			t.Fatalf("case %#v: write dictfile: unexpected error: %v", tc.What, err)
134 | 		} else if tc.OutDictFile != buf.String() {
135 | 			fmt.Printf("expected:\n`%s`\n\ngot:\n`%s`", tc.OutDictFile, buf.String())
136 | 			t.Fatalf("case %#v: unexpected dictfile output", tc.What)
137 | 		}
138 | 
139 | 		pdf, err := ParseDictFile(buf)
140 | 		if err != nil {
141 | 			t.Fatalf("case %#v: reparse written dictfile: unexpected error: %v", tc.What, err)
142 | 		}
143 | 		sort.Slice(pdf, func(i, j int) bool {
144 | 			return pdf[i].Headword < pdf[j].Headword
145 | 		})
146 | 		edf := df[:]
147 | 		sort.Slice(edf, func(i, j int) bool {
148 | 			return edf[i].Headword < edf[j].Headword
149 | 		})
150 | 		if jpdf, err := json.Marshal(pdf); err != nil {
151 | 			panic(pdf)
152 | 		} else if jedf, err := json.Marshal(edf); err != nil {
153 | 			panic(pdf)
154 | 		} else if !reflect.DeepEqual(jpdf, jedf) {
155 | 			t.Fatalf("case %#v: reparse written dictfile: differs from original (orig:%s) (reparsed:%s)", tc.What, jedf, jpdf)
156 | 		}
157 | 
158 | 		buf.Reset()
159 | 		if err := df.WriteKoboHTML(buf); err != nil {
160 | 			t.Fatalf("case %#v: write kobo html: unexpected error: %v", tc.What, err)
161 | 		} else if tc.OutKoboHTML != buf.String() {
162 | 			fmt.Printf("expected:\n`%s`\n\ngot:\n`%s`", tc.OutKoboHTML, buf.String())
163 | 			t.Fatalf("case %#v: unexpected kobo html output", tc.What)
164 | 		}
165 | 	}
166 | }
167 | 


--------------------------------------------------------------------------------
/dictgen/dictgen.go:
--------------------------------------------------------------------------------
  1 | // Package dictgen simplifies creating full-featured dictionaries for Kobo
  2 | // eReaders, with support for images, unicode prefixes, raw html, markdown, and
  3 | // more.
  4 | //
  5 | // A marisa implementation must be provided by
  6 | // github.com/pgaskin/kobodict/marisa or a custom one for this package to work.
  7 | package dictgen
  8 | 
  9 | import (
 10 | 	"bytes"
 11 | 	"fmt"
 12 | 	"io"
 13 | 	"sort"
 14 | 	"strings"
 15 | 	"text/template"
 16 | 
 17 | 	"github.com/pgaskin/dictutil/kobodict"
 18 | 	"github.com/russross/blackfriday/v2"
 19 | )
 20 | 
 21 | // WriteDictzip writes the dictfile to a kobodict.Writer, which should not have
 22 | // been used yet. The writer is not closed automatically. If the ImageHandler
 23 | // requires a file to be opened (i.e. not ImageHandlerRemove), the provided
 24 | // ImageFunc will be called.
 25 | func (df DictFile) WriteDictzip(dw *kobodict.Writer, ih ImageHandler, img ImageFunc) error {
 26 | 	var prefixes []string
 27 | 	prefixed := df.Prefixed()
 28 | 	for pfx := range prefixed {
 29 | 		prefixes = append(prefixes, pfx)
 30 | 	}
 31 | 	sort.Strings(prefixes)
 32 | 
 33 | 	hbuf := bytes.NewBuffer(nil)
 34 | 	for _, pfx := range prefixes {
 35 | 		for _, dfe := range prefixed[pfx] {
 36 | 			if err := dw.AddWord(dfe.Headword); err != nil {
 37 | 				return fmt.Errorf("add word %#v: %w", dfe.Headword, err)
 38 | 			}
 39 | 			for _, v := range dfe.Variant {
 40 | 				if err := dw.AddWord(v); err != nil {
 41 | 					return fmt.Errorf("add variant %#v: %w", v, err)
 42 | 				}
 43 | 			}
 44 | 		}
 45 | 		hbuf.Reset()
 46 | 		if err := prefixed[pfx].WriteKoboHTML(hbuf); err != nil {
 47 | 			return fmt.Errorf("generate dicthtml for %s: %w", pfx, err)
 48 | 		} else if buf, err := transformHTMLImages(ih, dw, hbuf.Bytes(), img); err != nil {
 49 | 			return fmt.Errorf("generate dicthtml for %s: transform images: %w", pfx, err)
 50 | 		} else if hw, err := dw.CreateDicthtml(pfx); err != nil {
 51 | 			return fmt.Errorf("write dicthtml for %s: %w", pfx, err)
 52 | 		} else if _, err = hw.Write(buf); err != nil {
 53 | 			return fmt.Errorf("write dicthtml for %s: %w", pfx, err)
 54 | 		}
 55 | 	}
 56 | 
 57 | 	return nil
 58 | }
 59 | 
 60 | // Prefixed shards the DictFile into the different word prefixes. The original
 61 | // DictFile is unchanged, but the entries are still pointers to the originals
 62 | // (i.e. the result will become out of date if you modify the entries).
 63 | //
 64 | // The DictFile is not validated.
 65 | //
 66 | // If a variamt has a different prefix, the entire entry is duplicated as
 67 | // necessary.
 68 | func (df DictFile) Prefixed() map[string]DictFile {
 69 | 	prefixed := map[string]DictFile{}
 70 | 	for _, dfe := range df {
 71 | 		pfx := map[string]bool{}
 72 | 
 73 | 		pfx[kobodict.WordPrefix(dfe.Headword)] = true
 74 | 		for _, v := range dfe.Variant {
 75 | 			pfx[kobodict.WordPrefix(v)] = true
 76 | 		}
 77 | 
 78 | 		for p := range pfx {
 79 | 			prefixed[p] = append(prefixed[p], dfe)
 80 | 		}
 81 | 	}
 82 | 	return prefixed
 83 | }
 84 | 
 85 | // WriteKoboHTML validates the DictFile and writes it to w in the dicthtml
 86 | // format.
 87 | func (df DictFile) WriteKoboHTML(w io.Writer) error {
 88 | 	if err := df.Validate(); err != nil {
 89 | 		return err
 90 | 	}
 91 | 
 92 | 	// must be sorted for proper matching
 93 | 	dfs := df[:]
 94 | 	sort.Slice(dfs, func(i int, j int) bool {
 95 | 		return dfs[i].Headword < dfs[j].Headword
 96 | 	})
 97 | 
 98 | 	if _, err := w.Write([]byte("<html>")); err != nil {
 99 | 		return err
100 | 	}
101 | 	for _, dfe := range dfs {
102 | 		if err := dfe.writeKoboHTML(w); err != nil {
103 | 			return err
104 | 		}
105 | 	}
106 | 	if _, err := w.Write([]byte("</html>")); err != nil {
107 | 		return err
108 | 	}
109 | 
110 | 	return nil
111 | }
112 | 
// koboHTMLTmpl renders a single DictFileEntry as a dicthtml <w> element. It
// reads the NoHeader, Headword, HeaderInfo, Variant, Definition, RawHTML, and
// PostRawHTML fields of the entry.
//
// Notes:
//
//   - text/template is used on purpose: we don't want the html/template
//     escaping, as this isn't actually proper html. Keep in mind headwords can
//     have unescaped html tags in them, and they will be rendered properly by
//     Kobo.
//   - The whitespace in the end tags should stay EXACTLY as is (yes, there is
//     a space before the end of the a but not the variant) to provide the best
//     possible matches against the regexps Kobo uses.
//   - The template itself should not add any newlines to the output; the
//     {{- and -}} trim markers strip the whitespace used to lay it out here.
var koboHTMLTmpl = template.Must(template.New("").Funcs(template.FuncMap{
	// md renders Markdown to HTML using blackfriday, and trims the leading and
	// trailing whitespace from the result.
	"md": func(md string) string {
		return strings.TrimSpace(string(blackfriday.Run([]byte(md))))
	},
	// normhw normalizes a headword for use in the name attribute of the a tag
	// (kobodict.NormalizeWordReference with the second argument false;
	// presumably this selects headword rather than variant normalization, see
	// kobodict to confirm).
	"normhw": func(headword string) string {
		return kobodict.NormalizeWordReference(headword, false)
	},
	// normv normalizes a variant for use in the name attribute of a variant
	// tag (kobodict.NormalizeWordReference with the second argument true).
	"normv": func(variant string) string {
		return kobodict.NormalizeWordReference(variant, true)
	},
}).Parse(`
{{- /* trim */ -}}

<w>
	{{- if .NoHeader -}}
		<a name="{{normhw .Headword}}" />
	{{- else -}}
		<p><a name="{{normhw .Headword}}" /><b>{{.Headword}}</b>{{with .HeaderInfo}} {{.}}{{end}}</p>
	{{- end -}}
	<var>
		{{- range .Variant -}}
			<variant name="{{normv .}}"/>
		{{- end -}}
	</var>
	{{- with .Definition -}}
		{{- if $.RawHTML -}}
			{{.}}
		{{- else -}}
			{{md .}}
		{{- end -}}
	{{- end -}}
	{{- with .PostRawHTML -}}
		{{.}}
	{{- end -}}
</w>

{{- /* trim */ -}}
`))
157 | 
158 | func (d DictFileEntry) writeKoboHTML(w io.Writer) error {
159 | 	return koboHTMLTmpl.Execute(w, d)
160 | }
161 | 


--------------------------------------------------------------------------------
/dictgen/image.go:
--------------------------------------------------------------------------------
  1 | package dictgen
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"crypto/sha1"
  6 | 	"encoding/base64"
  7 | 	"fmt"
  8 | 	"image"
  9 | 	"io"
 10 | 	"math"
 11 | 	"os"
 12 | 	"path/filepath"
 13 | 	"regexp"
 14 | 	"strings"
 15 | 
 16 | 	"github.com/disintegration/imaging"
 17 | 	"github.com/pgaskin/dictutil/kobodict"
 18 | )
 19 | 
// ImageFunc opens the image referenced by src, which is the value of an img
// tag's src attribute (it may be an absolute or relative path), and returns an
// io.Reader for the image contents.
//
// If the returned reader also implements io.Closer, its Close method will be
// called automatically once the image has been processed, so implementations
// can return an open file directly.
type ImageFunc func(src string) (io.Reader, error)
 25 | 
 26 | // ImageFuncFilesystem loads an image from the filesystem. If src is relative,
 27 | // it is resolved relative to the current dir.
 28 | func ImageFuncFilesystem(src string) (io.Reader, error) {
 29 | 	rsrc, err := filepath.Abs(src)
 30 | 	if err != nil {
 31 | 		return nil, fmt.Errorf("resolve path %#v: %w", src, err)
 32 | 	}
 33 | 	f, err := os.Open(rsrc)
 34 | 	if err != nil {
 35 | 		return nil, fmt.Errorf("open image file %#v (resolved from %#v): %w", rsrc, src, err)
 36 | 	}
 37 | 	return f, nil // f will be closed by transformHTMLImages
 38 | }
 39 | 
// ImageHandler transforms images referenced in a DictFile. It is invoked for
// each img tag in the generated dicthtml whose src is not a data URL.
type ImageHandler interface {
	// Transform transforms an image read from ir, and returns a new value for
	// the img tag's src attribute (nsrc). src is the original value of the
	// attribute, and dw is the dictzip being written, which the handler may
	// add files to.
	//
	// As a special case, if an empty nsrc is returned and the error is nil,
	// the image tag is removed entirely.
	//
	// In addition, custom CSS can be returned to be set on the img tag. It is
	// inserted as-is into a double-quoted style attribute, so it must not
	// contain any double quotes.
	Transform(src string, ir io.Reader, dw *kobodict.Writer) (nsrc string, css string, err error)

	// Description returns a human-readable description of what the handler does.
	Description() string
}
 52 | 
 53 | // ImageHandlerRemove removes images from the dicthtml.
 54 | type ImageHandlerRemove struct{}
 55 | 
 56 | // Transform implements ImageHandler.
 57 | func (*ImageHandlerRemove) Transform(string, io.Reader, *kobodict.Writer) (string, string, error) {
 58 | 	return "", "", nil
 59 | }
 60 | 
 61 | // Description implements ImageHandler.
 62 | func (*ImageHandlerRemove) Description() string {
 63 | 	return "remove images"
 64 | }
 65 | 
 66 | // ImageHandlerEmbed adds the images to the dictzip without any additional
 67 | // modifications. Usually, this would be the best choice, but unfortunately,
 68 | // it is too buggy as of firmware 4.19.14123.
 69 | type ImageHandlerEmbed struct{}
 70 | 
 71 | // Transform implements ImageHandler.
 72 | func (*ImageHandlerEmbed) Transform(src string, ir io.Reader, dw *kobodict.Writer) (string, string, error) {
 73 | 	if !strings.HasSuffix(src, ".jpg") && !strings.HasSuffix(src, ".gif") {
 74 | 		return "", "", fmt.Errorf("ImageHandlerEmbed: unsupported image file %s: extension must be .jpg or .gif when embedding", src)
 75 | 	}
 76 | 
 77 | 	// to generate a deterministic usually-unique filename
 78 | 	fn := fmt.Sprintf("%x%s", sha1.Sum([]byte(src)), filepath.Ext(src))
 79 | 	if !dw.Exists(fn) { // CreateFile will error if it already exists, and we're pretty confident the file is identical anyways
 80 | 		if iw, err := dw.CreateFile(fn); err != nil {
 81 | 			return "", "", fmt.Errorf("ImageHandlerEmbed: create dictfile entry %#v: %w", fn, err)
 82 | 		} else if _, err := io.Copy(iw, ir); err != nil {
 83 | 			return "", "", fmt.Errorf("ImageHandlerEmbed: copy image to dictfile: %w", err)
 84 | 		}
 85 | 	}
 86 | 	return "dict:///" + fn, "", nil
 87 | }
 88 | 
 89 | // Description implements ImageHandler.
 90 | func (*ImageHandlerEmbed) Description() string {
 91 | 	return "add to dictzip as-is (warning: this causes entries to appear blank due to a bug in nickel as of firmware 4.20.14601)"
 92 | }
 93 | 
// ImageHandlerBase64 optimizes the image and encodes it as a base64 JPEG data
// URL, which is embedded directly in the img tag. This is the most compatible
// option, but it comes at the expense of space and speed. In addition, if there
// are too many images, it can lead to nickel running out of memory when parsing
// the dictionary (and sickel should reboot it).
//
// In addition, it adds CSS to fix sizing issues (by default, images appear
// really small when rendered in the dictionary due to default styling).
//
// This is currently the recommended option for adding images.
//
// You must import image/* yourself for format support.
//
// The zero value is ready to use, and uses the defaults documented on each
// field.
type ImageHandlerBase64 struct {
	// Images will be resized to fit within these dimensions, while preserving
	// aspect ratio. If not specified, the default is 1000x1000. The default is
	// applied to each axis separately whenever its value is less than 1.
	MaxSize image.Point
	// NoGrayscale will prevent images from being grayscaled.
	NoGrayscale bool
	// JPEGQuality sets the JPEG quality for the encoded images. If not set
	// (i.e. zero), it defaults to 60.
	JPEGQuality int
}
115 | 
116 | func (ih *ImageHandlerBase64) params() (maxWidth, maxHeight int, noGrayscale bool, jpegQuality int) {
117 | 	mw, mh := float64(ih.MaxSize.X), float64(ih.MaxSize.Y)
118 | 	if mw < 1 {
119 | 		mw = 1000
120 | 	}
121 | 	if mh < 1 {
122 | 		mh = 1000
123 | 	}
124 | 	ng := ih.NoGrayscale
125 | 	jq := ih.JPEGQuality
126 | 	if jq == 0 {
127 | 		jq = 60
128 | 	}
129 | 	return int(mw), int(mh), ng, jq
130 | }
131 | 
132 | // Transform implements ImageHandler.
133 | func (ih *ImageHandlerBase64) Transform(src string, ir io.Reader, dw *kobodict.Writer) (string, string, error) {
134 | 	mw, mh, ng, jq := ih.params()
135 | 
136 | 	// decode the image
137 | 	img, err := imaging.Decode(ir)
138 | 	if err != nil {
139 | 		return "", "", fmt.Errorf("ImageHandlerBase64: decode image: %w", err)
140 | 	}
141 | 
142 | 	// resize it
143 | 	ow, oh := float64(img.Bounds().Dx()), float64(img.Bounds().Dy())
144 | 	sf := math.Min(float64(mw)/ow, float64(mh)/oh)
145 | 	img = imaging.Resize(img, int(ow*sf), int(oh*sf), imaging.Lanczos)
146 | 
147 | 	// make it grayscale
148 | 	if ng {
149 | 		img = imaging.Grayscale(img)
150 | 	}
151 | 
152 | 	// encode the image
153 | 	buf := bytes.NewBuffer(nil)
154 | 	bw := base64.NewEncoder(base64.StdEncoding, buf)
155 | 	if err := imaging.Encode(bw, img, imaging.JPEG, imaging.JPEGQuality(jq)); err != nil {
156 | 		return "", "", fmt.Errorf("ImageHandlerBase64: encode new image to dictfile: %w", err)
157 | 	}
158 | 	_ = bw.Close()
159 | 
160 | 	// generate the css
161 | 	css := fmt.Sprintf("width:%dpx;height:%dpx;max-width:100%%;margin:1em auto;page-break-before:auto;object-fit:scale-down;object-position:center", img.Bounds().Dx(), img.Bounds().Dy())
162 | 
163 | 	// build the URL
164 | 	return "data:image/jpeg;base64," + buf.String(), css, nil
165 | }
166 | 
167 | // Description implements ImageHandler.
168 | func (ih *ImageHandlerBase64) Description() string {
169 | 	mw, mh, ng, jq := ih.params()
170 | 	return fmt.Sprintf("optimize and encode as base64 data URL (max_width=%d, max_height=%d, grayscale=%t, jpeg_quality=%d) (warning: this causes segfaults in the in-book dictionary due to a bug in nickel with firmware versions below 4.20.14601)", mw, mh, ng, jq)
171 | }
172 | 
// imgTagRe matches img tags which have a src attribute. It has four capture
// groups, which together make up the entire match:
//
//  1. the start of the tag ("<img");
//  2. everything up to and including the opening quote of the src value
//     (including any attributes before it);
//  3. the value of the src attribute;
//  4. the closing quote and the rest of the tag (including any attributes
//     after it).
//
// This allows the src to be replaced (3) and attributes to be inserted (between
// 1 and 2) without touching the rest of the tag.
var imgTagRe = regexp.MustCompile(`(<img)(\s+(?:[^>]*\s+)?src\s*=\s*['"]+)([^'"]+)(['"][^>]*>)`)
174 | 
175 | // transformHTMLImages transforms img tags in the specified HTML, using
176 | // openImage to read the specified paths. If openImage implements io.Closer,
177 | // it will be closed automatically. Img tags which reference have a data URL are
178 | // skipped.
179 | //
180 | // The dictwriter may be used during this process, so callers should not rely on
181 | // any entries opened before calling this.
182 | func transformHTMLImages(ih ImageHandler, dw *kobodict.Writer, html []byte, img ImageFunc) ([]byte, error) {
183 | 	nhtml := html[:]
184 | 	for _, m := range imgTagRe.FindAllSubmatch(html, -1) {
185 | 		t, a, b, src, c := m[0], m[1], m[2], m[3], m[4]
186 | 		if bytes.HasPrefix(src, []byte("data:")) {
187 | 			continue
188 | 		}
189 | 		ir, err := img(string(src))
190 | 		if err != nil {
191 | 			return nil, fmt.Errorf("transform image %#v: open file: %w", string(src), err)
192 | 		}
193 | 		nsrc, css, err := ih.Transform(string(src), ir, dw)
194 | 		if err != nil {
195 | 			if c, ok := ir.(io.Closer); ok {
196 | 				c.Close()
197 | 			}
198 | 			return nil, fmt.Errorf("transform image %#v: transform image: %w", string(src), err)
199 | 		}
200 | 		if c, ok := ir.(io.Closer); ok {
201 | 			c.Close()
202 | 		}
203 | 		var nstyle string
204 | 		if len(css) != 0 {
205 | 			nstyle = " style=\"" + css + "\""
206 | 		}
207 | 		if len(nsrc) == 0 {
208 | 			nhtml = bytes.Replace(nhtml, t, nil, 1)
209 | 		} else {
210 | 			nhtml = bytes.Replace(nhtml, t, []byte(string(a)+nstyle+string(b)+nsrc+string(c)), 1)
211 | 		}
212 | 	}
213 | 	return nhtml, nil
214 | }
215 | 


--------------------------------------------------------------------------------
/dictgen/image_test.go:
--------------------------------------------------------------------------------
 1 | package dictgen
 2 | 
 3 | import (
 4 | 	"reflect"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func TestImgTagRe(t *testing.T) {
 9 | 	inHTML := `
10 | 		<img src="test">
11 | 		<img src="test" />
12 | 		<img src="test" alt="asd" />
13 | 		<img height="10" width="10" src="test" alt="asd" />
14 | 		<img height="10" width="10"
15 | src = "test"
16 | alt="asd" />
17 | 	`
18 | 	exImg := [][]string{
19 | 		{`<img`, ` src="`, `test`, `">`},
20 | 		{`<img`, ` src="`, `test`, `" />`},
21 | 		{`<img`, ` src="`, `test`, `" alt="asd" />`},
22 | 		{`<img`, ` height="10" width="10" src="`, `test`, `" alt="asd" />`},
23 | 		{`<img`, ` height="10" width="10"
24 | src = "`, `test`, `"
25 | alt="asd" />`},
26 | 	}
27 | 
28 | 	acMatch := imgTagRe.FindAllStringSubmatch(inHTML, -1)
29 | 	acImg := make([][]string, len(acMatch))
30 | 	for i, m := range acMatch {
31 | 		acImg[i] = m[1:]
32 | 	}
33 | 
34 | 	if !reflect.DeepEqual(exImg, acImg) {
35 | 		t.Errorf("Expected %#v, got %#v.", exImg, acImg)
36 | 	}
37 | }
38 | 
39 | // TODO(v1): test the image handlers, especially the one which does the replacements
40 | 


--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
 1 | title: dictutil
 2 | remote_theme: pmarsceill/just-the-docs
 3 | url: https://pgaskin.net
 4 | baseurl: /dictutil
 5 | description: Tools, notes, and other stuff related to Kobo dictionaries.
 6 | search_enabled: false
 7 | aux_links:
 8 |   Download:
 9 |   - http://github.com/pgaskin/dictutil/releases/latest
10 |   MobileRead:
11 |   - https://www.mobileread.com/forums/showthread.php?t=327854
12 |   GitHub:
13 |   - http://github.com/pgaskin/dictutil
14 | heading_anchors: true
15 | footer_content: Copyright &copy; 2020 Patrick Gaskin.
16 | 


--------------------------------------------------------------------------------
/docs/_includes/head_custom.html:
--------------------------------------------------------------------------------
1 | <script async src="https://pg.ctr.pgaskin.net/count.js" data-goatcounter="https://pg.ctr.pgaskin.net/count"></script>
2 | 


--------------------------------------------------------------------------------
/docs/dictgen/index.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: default
  3 | title: dictgen
  4 | has_children: false
  5 | ---
  6 | 
  7 | # dictgen
  8 | 
  9 | This section contains documentation for dictgen, a high-level tool to create Kobo dictionaries.
 10 | {: .fs-6 .fw-300 }
 11 | 
 12 | ## Usage
 13 | 
 14 | ```
 15 | Usage: dictgen [options] dictfile...
 16 | 
 17 | Options:
 18 |   -o, --output string         The output filename (will be overwritten if it exists) (- is stdout) (default "dicthtml.zip")
 19 |   -c, --crypt string          Encrypt the dictzip using the specified encryption method (format: method:keyhex)
 20 |   -I, --image-method string   How to handle images (if an image path is relative, it is loaded from the current dir) (base64 - optimize and encode as base64, embed - add to dictzip, remove) (default "base64")
 21 |       --remove-footer         Add code to prevent the non-applicable dictionary source footer for certain locales from being added after the entry (e.g. if replacing the French dictionary)
 22 |   -h, --help                  Show this help text
 23 | 
 24 | If multiple dictfiles (*.df) are provided, they will be merged (duplicate entries are fine; they will be shown in sequential order). To read from stdin, use - as the filename.
 25 | 
 26 | Note that currently, the only usable image method is removing them or using base64-encoding (for firmware 4.20.14601+; older versions segfault in the in-book dictionary if images are enabled), as embedded dict:/// image URLs cause the webviews to appear blank (this is a nickel bug). See https://github.com/pgaskin/dictutil/issues/1 for more details.
 27 | 
 28 | See https://pgaskin.net/dictutil/dictgen for more information about the dictfile format.
 29 | ```
 30 | 
 31 | ## Example usage
 32 | 
 33 | **Building a dictzip for a dictfile:**
 34 | 
 35 | ```
 36 | dictgen my-dictionary.df
 37 | ```
 38 | 
 39 | If you are using Windows, you can also drag-and-drop a dictfile onto dictgen.exe. 
 40 | 
 41 | **Merging multiple dictfiles into a single dictzip:**
 42 | 
 43 | ```
 44 | dictgen my-dictionary.df another.df
 45 | ```
 46 | 
 47 | If you are using Windows, you can also drag-and-drop multiple dictfiles onto dictgen.exe. 
 48 | 
 49 | **Building a dictzip with images removed:**
 50 | 
 51 | ```
 52 | dictgen -I remove my-dictionary.df
 53 | ```
 54 | 
 55 | **Specifying a custom output filename:**
 56 | 
 57 | ```
 58 | dictgen -o dicthtml-df.zip my-dictionary.df
 59 | ```
 60 | 
 61 | ## Dictfile format
 62 | Dictgen uses a simple, but feature-complete format for representing Kobo dictionaries.
 63 | 
 64 | A dictfile (with the file extension `.df`) is a plain-text file consisting of multiple entries.
 65 | 
 66 | Each entry represents a single definition. There can be more than one entry per word. An entry is denoted by a line starting with `@ ` followed by the headword. The headword can contain spaces, capital letters, and so on.
 67 | 
 68 | After the headword, zero or more header lines can be added. To add additional variants which will be matched, use `& ` followed by the word variant. The variant can be anything which could be used in a headword. This can be specified more than once, but only one variant can be specified for each `& `. Another header type is word information, denoted by a `: `. If specified, the text following it is appended after the bolded headword on the same line (see the English built-in dictionary for an example; it has things like `-verb` and the pronunciation information here). If you want to have complete control over how the entry is displayed, use `::` (without anything following it) instead of `: `. This will remove the default bolded headword at the top of the generated entry.
 69 | 
 70 | After the header lines, you can include the body of the entry. By default, this uses [Markdown](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) for formatting. If you want to include raw HTML, prepend the HTML with `<html>` (don't include a closing tag). This can span multiple lines, and will continue until the next entry or end of file.
 71 | 
 72 | In addition, you can include GIF and JPEG images in the body using the usual Markdown or HTML syntax. If the image path is relative (i.e. not a full path), it is resolved relative to the directory you run dictgen from.
 73 | 
 74 | You can also include custom CSS (per-entry) by including it between the `<style>` and `</style>` tags. This is supported in both HTML and Markdown mode.
 75 | 
 76 | ## Dictfile reference
 77 | 
 78 | - `@ HEADWORD`: Start a new entry. The headword doesn't have to be unique, and can contain spaces.
 79 |   - Header
 80 |     - `: WORD_INFO` or `::` *(optional)*: Add extra word info after the headword, or remove it entirely.
 81 |     - `& VARIANT` *(optional)*: Add an additional word to match. Follows the same rules as the headword. Can be repeated multiple times.
 82 |   - Body
 83 |     - `MARKDOWN` or `<html> RAW_HTML`: Include a definition written in Markdown or raw HTML code.
 84 | 
 85 | ## Examples
 86 | 
 87 | ### Simplest
 88 | 
 89 | ```
 90 | @ word
 91 | Definition here.
 92 | @ word 1
 93 | Definition 1 here.
 94 | @ test
 95 | Blah blah blah.
 96 | ```
 97 | 
 98 | ### Simple
 99 | 
100 | ```
101 | @ no
102 | - No means no...
103 | 
104 | @ NO
105 | - A different definition for nitric oxide.
106 | - Blah blah blah.
107 | 
108 | @ go
109 | & went
110 | & going
111 | 1. This definition is matched by three different words.
112 | 2. It's also numbered rather than bulleted.
113 |    - With some sub-items.
114 |    - And another.
115 | 
116 | An image:
117 | 
118 | ![](image.jpg)
119 | 
120 | @ test
121 | : this appears beside the headword
122 | Blah blah blah.
123 | ```
124 | 
125 | ### Full
126 | 
127 | ```
128 | @ word
129 | This is the definition of a word.
130 | 
131 | @ word 2
132 | This is the definition of the second word.
133 | 
134 | @ water
135 | & H2O
136 | 1. You can also use lists in Markdown.
137 | 2. And **bold text** or *italic text*.
138 |    - Sub-items are also supported.
139 | 
140 | @ test
141 | : -noun
142 | Blah blah blah.
143 | 
144 | @ test
145 | : -verb
146 | Blah blah blah.
147 | 
148 | @ custom
149 | ::
150 | **This is a custom word header!**
151 | 
152 | And the definition here:
153 | - Blah blah blah.
154 | - Blah blah blah.
155 | 
156 | @ images
157 | Embedding an image (relative paths):
158 | 
159 | ![](image.jpg)
160 | 
161 | Embedding an image (Linux/macOS style paths):
162 | 
163 | ![](/path/to/image.jpg)
164 | 
165 | Embedding an image (Windows style paths):
166 | 
167 | ![](C:/path/to/image.jpg)
168 | 
169 | 
170 | @ raw-html
171 | <html><p>This definition contains raw html.</p>
172 | 
173 | <p>You can split it into multiple lines for readability.</p>
174 | 
175 | <ul>
176 |   <li>You can also use all HTML tags.</li>
177 |   <li><span style="background: #666">This text has a dark background</span></li>
178 |   <li><span class="test">This text is styled with CSS classes.</span></li>
179 | </ul>
180 | 
181 | <style>
182 | .test {
183 |   text-decoration: underline;
184 | }
185 | </style>
186 | ```
187 | 


--------------------------------------------------------------------------------
/docs/dicthtml/index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: dicthtml
 4 | has_children: true
 5 | ---
 6 | 
 7 | # dicthtml
 8 | 
 9 | This section contains documentation and notes about Kobo's dictionary format.
10 | {: .fs-6 .fw-300 }


--------------------------------------------------------------------------------
/docs/dicthtml/install.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Installing custom dictionaries
 4 | parent: dicthtml
 5 | ---
 6 | 
 7 | # Installing custom dictionaries
 8 | Sideloading custom dictionaries is easy, but slightly finicky.
 9 | 
10 | ## Using dictutil
11 | You can easily install dictionaries using dictutil. First, if you are not replacing a built-in dictionary, enable and install the **Enable searches on extra dictionaries** [patch](https://pgaskin.net/kobopatch-patches). Then, follow the [instructions for using the install command](../dictutil/install.html).
12 | 
13 | You can uninstall custom dictionaries (including reverting overwritten built-in ones) using the [uninstall command](../dictutil/uninstall.html).
14 | 
15 | ## Manual installation
16 | 1. Enable and install the **Enable searches on extra dictionaries** [patch](https://pgaskin.net/kobopatch-patches).
17 | 2. Copy the dictionary to `KOBOeReader/.kobo/dict/dicthtml-LOCALE.zip`, where **LOCALE** is a string consisting of 2 lowercase alphanumeric characters. It does not have to be a valid locale.
 18 | 3. If using a firmware version 4.20.14601 or newer, mark the file as read-only (in Windows Explorer, or `chmod 444 dicthtml-LOCALE.zip`) to prevent nickel from overwriting it during the sync process.
19 | 4. If using a firmware version older than 4.20.14601, open `KOBOeReader/.kobo/KoboReader.sqlite` in a SQLite3 editor, and add a row to the Dictionary table with the following values:
20 |     - **Suffix:** `-LOCALE`, where **LOCALE** is the locale code you chose earlier. This is used when constructing filenames.
21 |     - **Name:** `Extra:_LOCALE LABEL`, where **LOCALE** is the locale code you chose earlier, and **LABEL** is a custom label (it can have spaces in it).
22 |     - **Installed:** `true`. This one is self-explanatory.
23 |     - **Size:** `SIZE`, where *SIZE* is the size of the dictzip in bytes. This is displayed in the dictionary settings, but is unused otherwise, so it's fine if it isn't accurate as long as it is a valid number. For built-in dictionaries with `IsSynced` set, it is used to check for updates.
24 |     - **IsSynced:** `false`. This is used to see if the sync process should attempt to sync the specified dictionary. If true, the `Size` column is checked against the expected size of the latest version (from the dictionary download server), and if it does not match, the new dictionary is downloaded over it.
25 | 5. Open `KOBOeReader/.kobo/Kobo/Kobo eReader.conf`, and add a line like `ExtraLocales=LOCALE` in the `ApplicationPreferences` section. If it already exists, add your locale code to it and keep the items separated by a comma and a space (e.g. `ExtraLocales=a1, a2`).
26 | 6. Eject your eReader and test the dictionary.
27 |     - If the dictionary is unselectable, ensure you followed the steps correctly, especially regarding the locale codes.
28 |     - If the dictionary says that the word wasn't found, or just acts unusually in general, ensure the dictionary file is valid.
29 | 
30 | ## About locale names and patches
31 | The reason why the patch is required is due to a bug in the firmware. When you choose an entry from the dictionary dropdown, it tries to find a locale name matching it (which it uses to construct the filename for the dicthtml). Kobo has a hard-coded list of supported built-in locales, and supports adding extra ones using the **ApplicationPreferences->ExtraLocales** config file option (a comma separated list of locale codes). These locales have an automatically generated name of "Extra: LOCALE".
32 | 
 33 | But, this is where the bug occurs. To support translation dictionaries, the dictionary selector will split the name by spaces, and only check against the first element. This is perfectly fine for one-word locale names (i.e. all the built-in ones). For custom locales, it will try to match **Extra:**, which doesn't exist, so it will default to the English dictionary. Thus, to fix this, the "Extra: " prefix used for the custom locales needs to be changed to one without a space. The patch replaces the space with an underscore. This bug does have one benefit though: since only stuff before the first space is considered, you can have a custom label after it.
34 | 
35 | ## Alternative method
36 | It is also possible to install custom dictionaries by replacing an existing built-in installed dictionary in `KOBOeReader/.kobo/dict`. To prevent it from being overwritten during a sync, set the `IsSynced` column to `false` for it in the DB on firmware versions older than 4.20.14601, otherwise, mark it read-only.
37 | 
38 | ## About changes in firmware 4.20.14601
39 | 
40 | In short:
41 | 
42 | - **Same:** Nickel will still attempt to sync all dictionaries, including sideloaded ones, unless IsSynced is false.
43 | - **New:** IsSynced can't be changed anymore due to the dictionary table being removed.
44 | - **New:** Nickel will avoid overwriting dictionary files if they are marked read-only, and will instead write `"dicthtml-LOCALE" marked as read-only.. skipping` to the log in the `sync` category. Note that this functionality has been around since at least 4.10.11655, but the database needed to be modified anyways, so there wasn't much point to using it (and nobody noticed it either).
45 | - **Same:** Nickel still generates locale names by default with `Extra: LOCALE`.
46 | - **New:** Nickel doesn't read the dictionary table anymore, so the name in it is ignored. In addition, entries in the table won't change anything even if it is still present.
47 | - **New:** The built-in dictionaries are hard-coded, rather than writing them to the db during migrations and reading from it at runtime.
48 | - **Same:** Nickel still has the bug where the locale splitting is messed up, so the `Extra: LOCALE` names are inherently broken.
49 | - **Same:** The matching can be fixed by replacing `Extra: ` with `Extra:_` (or anything not containing Unicode whitespace).
50 | - **New:** The database doesn't need to be changed anymore in addition to the patch, as the names are generated dynamically using the same string.
51 | - **Therefore:** If the dictionary table is present, it can safely be removed.
52 | - **Therefore:** The steps required to install custom dictionaries are now (note that these have already been incorporated into the instructions above, they are just here for convenience):
53 |   - Copy the dictzip and mark it read-only.
54 |   - Add it to ExtraLocales if it is not a built-in locale.
55 |   - Use the patch to replace `Extra: ` in libnickel with any other string (of the same length, or shorter with a null byte at the end) that does not contain a space (` `).
56 | 
57 | See [#49](https://github.com/pgaskin/kobopatch-patches/issues/49) for more information.
58 | 
59 | ## Issues with the read-only method for preventing dictionaries from being overwritten
60 | There have been reports of the read-only property (see [#6](https://github.com/pgaskin/dictutil/issues/6) and the threads on MobileRead for more details) not having an effect since at least 4.20.14622. This seems to be due to other checks in the code (for IsSynced and the file size) preventing the read-only one from actually being checked under some conditions. Additionally, some people have had problems marking the dictionary as read-only to begin with (this doesn't seem to be an issue on Linux).
61 | 
62 | For now, you can use this [patch](https://pgaskin.net/kobopatch-patches) (for kobopatch v0.15.0, which is included in patches v60+) to prevent all dictionaries from being synced. It should work on most recent firmware versions starting from 4.22.15190.
63 | 
64 | ```yaml
65 | Never sync dictionaries:
66 | - Enabled: no
67 | - BaseAddress:  {Sym: "SyncDictionariesCommand::prepareDownloadList()"}
68 | - ReplaceBytes: {Offset: 922, FindH: 0CD5,     ReplaceH:       0CE0}   #permissions
69 | - ReplaceBytes: {Offset: 900, FindH: FFF6CAAE, ReplaceInstNOP: true}   #size
70 | - ReplaceBytes: {Offset: 866, FindH: 3FF4DBAE, ReplaceInstNOP: true}   #isSynced
71 | ```
72 | 
73 | For versions 4.20.14601 to 4.21.15015, use this patch instead:
74 | 
75 | ```yaml
76 | Never sync dictionaries:
77 | - Enabled: no
78 | - BaseAddress:  {Sym: "SyncDictionariesCommand::prepareDownloadList()"}
79 | - ReplaceBytes: {Offset: 1048, FindH: 0CD5,     ReplaceH:       0CE0}   #permissions
80 | - ReplaceBytes: {Offset: 1026, FindH: FFF68DAE, ReplaceInstNOP: true}   #size
81 | - ReplaceBytes: {Offset:  992, FindH: 3FF49EAE, ReplaceInstNOP: true}   #isSynced
82 | ```
83 | 


--------------------------------------------------------------------------------
/docs/dicthtml/matching.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: Matching words
4 | parent: dicthtml
5 | ---
6 | 
7 | # Matching words
8 | TODO
9 | 


--------------------------------------------------------------------------------
/docs/dicthtml/prefixes.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: default
  3 | title: Prefixes
  4 | parent: dicthtml
  5 | ---
  6 | 
  7 | # Prefixes
  8 | Kobo dictionaries are sharded by a prefix derived from the headword.
  9 | 
 10 | The information in this document is based on reverse engineering DictionaryParser::htmlForWord.
 11 | 
 12 | **Note:** Kobo will only look in the file matching the word's prefix, so if a variant has a different prefix, it must be duplicated into each matching file (note that duplicate words aren't an issue).
 13 | 
 14 | **Note:** This document only covers the algorithm used for non-Japanese (Kanji) dictionaries.
 15 | 
 16 | ## Prefix algorithm
 17 | Prefixes are calculated using the following steps. Note that "character" refers to a single Unicode code point, not a byte.
 18 | 
 19 | 1. Trim the word at the first null byte, if any (i.e. treat it as a C string).
 20 | 2. Discard everything but the first two characters.
 21 | 3. Convert the characters to lowercase using the Unicode case mapping rules.
 22 | 4. Trim all whitespace characters on the left and right sides.
 23 | 5. If the string is empty, return "11".
 24 | 6. If the first of the remaining characters is in the Unicode Cyrillic character class, return them as-is.
 25 | 7. Right-pad the remaining characters to 2 characters long using "`a`"s.
 26 | 8. If either of the first two characters is not in the Unicode Letter character class, return "11".
 27 | 9. Return the characters as-is.
 28 | 
 29 | ## Examples
 30 | 
 31 | <!-- dictutil x -fjson-array word | jq -r '.[] | "| \"`" + .[0] + "`\" | \"`" + .[1] + "`\" | |"' -->
 32 | 
 33 | | Word | Prefix | Notes |
 34 | | --- | --- | --- |
 35 | | "`test`" | "`te`" | |
 36 | | "`a`" | "`aa`" | |
 37 | | "`Èe`" | "`èe`" | The word is made lowercase using unicode rules (i.e. accented characters are included). |
 38 | | "`multiple words`" | "`mu`" | |
 39 | | "`àççèñts`" | "`àç`" | |
 40 | | "`à`" | "`àa`" | |
 41 | | "`ç`" | "`ça`" | |
 42 | | "" | "`11`" | |
 43 | | "`  `" | "`11`" | Space trimming is done after taking the first 2 characters. |
 44 | | "` x`" | "`xa`" | |
 45 | | "`   123`" | "`11`" | |
 46 | | "`x   23`" | "`xa`" | |
 47 | | "`д `" | "`д`" | "д" is a Cyrillic character, and it's the first character of the word (after trimming spaces), so it isn't padded with "a"s. |
 48 | | "`дaд`" | "`дa`" | |
 49 | | "`未未`" | "`未未`" | |
 50 | | "`未`" | "`未a`" | Even though "未" is a multi-byte character (three bytes in UTF-8), it is a single Unicode code point (and the characters are counted, not bytes). |
 51 | | "`  未`" | "`11`" | Space trimming is done after taking the first 2 characters. |
 52 | | "` 未`" | "`未a`" | The multi-byte "未" character isn't split up when taking the first 2 characters. |
 53 | 
 54 | ## Testing
 55 | You can test Kobo's prefix algorithm directly using [dictword-test](https://github.com/pgaskin/kobo-mods/tree/master/dictword-test/).
 56 | 
 57 | If you just want an easy way to generate prefixes for words, use the [dictutil prefix](../dictutil/prefix.html) command.
 58 | 
 59 | ## Sample implementation
 60 | Here is the Go implementation used in dictutil:
 61 | 
 62 | ```go
 63 | func WordPrefix(word string) string { // returns the prefix of word, following the steps listed under "Prefix algorithm"
 64 | 	pfx := []rune(word) // operate on runes (Unicode code points), not bytes
 65 | 	// Keep at most the first 2 runes, stopping early at a null, and lowercase the ones that are kept.
 66 | 	for i, c := range pfx {
 67 | 		if i >= 2 || c == '\x00' { // limit to 2 chars, also cut at null
 68 | 			pfx = pfx[:i] // drop the current char and everything after it
 69 | 			break
 70 | 		}
 71 | 		pfx[i] = unicode.ToLower(c) // this includes accented chars
 72 | 	}
 73 | 	// Whitespace is trimmed only after the truncation above, so leading spaces count towards the 2 char limit.
 74 | 	for len(pfx) != 0 {
 75 | 		if unicode.IsSpace(pfx[0]) {
 76 | 			pfx = pfx[1:] // trim left space
 77 | 		} else {
 78 | 			break
 79 | 		}
 80 | 	}
 81 | 
 82 | 	for len(pfx) != 0 {
 83 | 		if unicode.IsSpace(pfx[len(pfx)-1]) {
 84 | 			pfx = pfx[:len(pfx)-1] // trim right space
 85 | 		} else {
 86 | 			break
 87 | 		}
 88 | 	}
 89 | 
 90 | 	if len(pfx) == 0 {
 91 | 		return "11" // if empty, return "11"
 92 | 	}
 93 | 	// If the first char is Cyrillic, both the padding and the letter check are skipped and the prefix is returned as-is.
 94 | 	if !unicode.Is(unicode.Cyrillic, pfx[0]) {
 95 | 		for len(pfx) < 2 {
 96 | 			pfx = append(pfx, 'a') // pad right with 'a's to 2 chars
 97 | 		}
 98 | 		if !unicode.IsLetter(pfx[0]) || !unicode.IsLetter(pfx[1]) {
 99 | 			return "11" // if either of the first 2 chars is not a letter, return "11"
100 | 		}
101 | 	}
102 | 
103 | 	return string(pfx)
104 | }
105 | ```
106 | 


--------------------------------------------------------------------------------
/docs/dicthtml/v1v2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pgaskin/dictutil/6708cff9a06dbd088ec2267a2314028a9a00b5a7/docs/dicthtml/v1v2-1.png


--------------------------------------------------------------------------------
/docs/dicthtml/v1v2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pgaskin/dictutil/6708cff9a06dbd088ec2267a2314028a9a00b5a7/docs/dicthtml/v1v2-2.png


--------------------------------------------------------------------------------
/docs/dicthtml/v1v2.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Dicthtml v1/v2
 4 | parent: dicthtml
 5 | ---
 6 | 
 7 | # Dicthtml v1/v2
 8 | In firmware 4.7.10364 (December 2017), Kobo introduced a new version of the dictionaries.
 9 | 
10 | The v1 dictionaries are served from download.kobobooks.com/ereader/dictionaries/dicthtml\*.zip, while the v2 dictionaries are served from download.kobobooks.com/ereader/dictionaries/v2/dicthtml\*.zip.
11 | 
12 | While the v1 dictionaries are still available (probably for the Kobo Mini, which is still on 3.19.5761), they will not fully work on newer firmware versions due to the prefix changes.
13 | 
14 | I haven't looked at the exact details about v1 dictionaries, but the main change seems to be the rules for computing prefixes for words with accents.
15 | 
16 | ## Prefix changes
17 | 
18 | ![](v1v2-1.png)
19 | 
20 | The primary change in v2 was the removal of the last step of prefix calculation - converting all non-ascii characters to `1`s. Note that this step is done after checking that the first two characters are all Unicode letters (which include accented letters), hence why the prefix wouldn't be `11` (which is used if any of the first 2 characters are not Unicode letters).
21 | 
22 | ## Built-in dictionary fixes
23 | 
24 | In addition, Kobo fixed some bugs with the dictionaries themselves. In v1, a few dictionaries were missing `<w>` tags around some words, presumably because the conversion code was buggy and the input format was undocumented/unstructured.
25 | 
26 | ![](v1v2-2.png)
27 | 
28 | As illustrated by the diff above, some words weren't separated properly and a few line breaks were missing in v1.


--------------------------------------------------------------------------------
/docs/dictutil/index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: dictutil
 4 | has_children: true
 5 | ---
 6 | 
 7 | # dictutil
 8 | 
 9 | This section contains documentation for dictutil, a tool to manipulate Kobo dictionaries.
10 | {: .fs-6 .fw-300 }
11 | 
12 | ```
13 | Usage: dictutil command [options] [arguments]
14 | 
15 | Dictutil provides low-level utilities to manipulate Kobo dictionaries (v2).
16 | 
17 | Commands:
18 |   install (I)          Install a dictzip file
19 |   pack (p)             Pack a dictzip file
20 |   prefix (x)           Calculate the prefix for a word
21 |   uninstall (U)        Uninstall a dictzip file
22 |   unpack (u)           Unpack a dictzip file
23 |   help                 Show help for all commands
24 | 
25 | Options:
26 |   -h, --help   Show this help text
27 | ```


--------------------------------------------------------------------------------
/docs/dictutil/install.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Install
 4 | parent: dictutil
 5 | ---
 6 | 
 7 | # Install
 8 | 
 9 | ## Usage
10 | 
11 | ```
12 | Usage: dictutil install [options] dictzip
13 | 
14 | Options:
15 |   -k, --kobo string         KOBOeReader path (default: automatically detected)
16 |   -l, --locale string       Locale name to use (format: ALPHANUMERIC{2}[-ALPHANUMERIC{2}]) (default: detected from filename if in format dicthtml-**.zip)
17 |   -n, --name string         Custom additional label for dictionary (ignored when replacing built-in dictionaries) (doesn't have any effect on 4.20.14601+)
18 |   -b, --builtin string      How to handle built-in locales [replace = replace and prevent from syncing] [ignore = replace and leave syncing as-is] (doesn't have any effect on 4.24.15672+) (default "replace")
19 |   -B, --no-custom           Whether to force installation to .kobo/dict instead of .kobo/custom-dict (4.24.15672+ only)
20 |       --use-extra-locales   Whether to use ExtraLocales on 4.24.15672+ if not a built-in dictionary (this is not required anymore since 4.24.15672) (4.24.15672+ only)
21 |   -h, --help                Show this help text
22 | 
23 | Note:
24 |   If you are not replacing a built-in dictionary and are using a firmware
25 |   version before 4.24.15672, the 'Enable searches on extra dictionaries patch'
26 |   must be installed or you will not be able to select your custom dictionary.
27 | ```
28 | 
29 | ## Examples
30 | 
31 | **Install a dictionary with the locale in the filename (dicthtml-\*\*.zip):**
32 | 
33 | ```sh
34 | dictutil install dicthtml-aa.zip
35 | ```
36 | 
37 | **Install a dictionary with a different locale:**
38 | 
39 | ```sh
40 | dictutil install --locale aa mydictionary.zip
41 | ```
42 | 
43 | **Install a dictionary on a specific Kobo:**
44 | 
45 | ```sh
46 | dictutil install --kobo /path/to/KOBOeReader dicthtml-aa.zip
47 | ```
48 | 
49 | **Install a dictionary with a custom label (4.19.14123 and older):**
50 | 
51 | ```sh
52 | dictutil install --name "My Dictionary" dicthtml-aa.zip
53 | ```
54 | 
55 | ## Details
56 | See [installing dictionaries](../dicthtml/install.html) for more details on how this works.
57 | 


--------------------------------------------------------------------------------
/docs/dictutil/pack.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Pack
 4 | parent: dictutil
 5 | ---
 6 | 
 7 | # Pack
 8 | 
 9 | ## Usage
10 | 
11 | ```
12 | Usage: dictutil pack [options] dictdir
13 | 
14 | Options:
15 |   -o, --output string   The output dictzip filename (will be overwritten if it exists) (default "dicthtml.zip")
16 |   -c, --crypt string    Encrypt the dictzip using the specified encryption method (format: method:keyhex)
17 |   -h, --help            Show this help text
18 | ```
19 | 
20 | ## Examples
21 | 
22 | **Pack a dictdir:**
23 | 
24 | ```sh
25 | dictutil pack /path/to/dictdir
26 | # the output is written to dicthtml.zip
27 | ```
28 | 
29 | **Pack a dictdir to a specific filename:**
30 | 
31 | ```sh
32 | dictutil pack --output "dicthtml-aa.zip" /path/to/dictdir
33 | ```
34 | 
35 | ## Input format
36 | The input dictdir is the same as the output of [dictutil unpack](./unpack.html).
37 | 


--------------------------------------------------------------------------------
/docs/dictutil/prefix.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Prefix
 4 | parent: dictutil
 5 | ---
 6 | 
 7 | # Prefix
 8 | 
 9 | ## Usage
10 | 
11 | ```
12 | Usage: dictutil prefix [options] word...
13 | 
14 | Options:
15 |   -f, --format string   The output format (go-slice, go-map, csv, tsv, json-array, json-object) (default "json-array")
16 |   -h, --help            Show this help text
17 | ```
18 | 
19 | ## Examples
20 | 
21 | **Get the prefix for a word:**
22 | 
23 | ```sh
24 | dictutil prefix "word"
25 | ```
26 | 
27 | **Get the prefix for multiple words:**
28 | 
29 | ```sh
30 | dictutil prefix "word1" "word2" "word3"
31 | ```
32 | 
33 | **Get the prefix for multiple words as CSV:**
34 | 
35 | ```sh
36 | dictutil prefix --format csv "word1" "word2" "word3"
37 | ```
38 | 


--------------------------------------------------------------------------------
/docs/dictutil/uninstall.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Uninstall
 4 | parent: dictutil
 5 | ---
 6 | 
 7 | # Uninstall
 8 | 
 9 | ## Usage
10 | 
11 | ```
12 | Usage: dictutil uninstall [options] locale
13 | 
14 | Options:
15 |   -k, --kobo string      KOBOeReader path (default: automatically detected)
16 |   -b, --builtin string   How to handle built-in locales [normal = uninstall the same way as the UI] [delete = completely delete the entry (doesn't have any effect on 4.20.14601+)] [restore = download the original dictionary from Kobo again] (doesn't have any effect on 4.24.15672+) (default "normal")
17 |   -B, --no-custom        Uninstall built-in dictionaries instead of custom ones on 4.24.15672+
18 |   -h, --help             Show this help text
19 | ```
20 | 
21 | ## Examples
22 | 
23 | **Uninstall a dictionary:**
24 | 
25 | ```sh
26 | dictutil uninstall aa
27 | ```
28 | 
29 | **Restore an overwritten built-in dictionary:**
30 | 
31 | ```sh
32 | dictutil uninstall --builtin restore fr
33 | ```
34 | 
35 | **Completely delete a built-in dictionary:**
36 | 
37 | ```sh
38 | dictutil uninstall --builtin delete fr
39 | ```
40 | 
41 | Note: You can restore the dictionary by manually downloading it and using [dictutil install](./install.html).
42 | 
43 | ## Details
44 | Uninstall does the following steps:
45 | 
46 | 1. If the DB entry for the dictionary exists:
47 |    - Built-in (normal): Set `Installed` to `false`.
48 |    - Built-in (delete): Remove the row for the suffix.
49 |    - Built-in (restore): Set `Installed` to `true`.
50 |    - Extra: Remove the row for the suffix.
51 | 2. If the dictionary is not built-in and there is an `ExtraLocales` entry for the locale in the `.kobo/Kobo/Kobo eReader.conf`, remove it.
52 | 3. With the dictzip:
53 |    - Built-in (normal): Delete it if it exists.
54 |    - Built-in (delete): Delete it if it exists.
55 |    - Built-in (restore): Delete it if it exists, then download it again from Kobo.
56 |    - Extra: Delete it if it exists.
57 | 


--------------------------------------------------------------------------------
/docs/dictutil/unpack.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Unpack
 4 | parent: dictutil
 5 | ---
 6 | 
 7 | # Unpack
 8 | 
 9 | ## Usage
10 | 
11 | ```
12 | Usage: dictutil unpack [options] dictzip
13 | 
14 | Options:
15 |   -o, --output string   The output directory (must not exist) (default: the basename of the input without the extension)
16 |   -c, --crypt string    Decrypt the dictzip (if needed) using the specified encryption method (format: method:keyhex)
17 |   -h, --help            Show this help text
18 | ```
19 | 
20 | ## Examples
21 | 
22 | **Unpack a dictionary:**
23 | 
24 | ```sh
25 | dictutil unpack dicthtml.zip
26 | # The output is written to ./dicthtml
27 | ```
28 | 
29 | ```sh
30 | dictutil unpack dicthtml-fr.zip
31 | # The output is written to ./dicthtml-fr
32 | ```
33 | 
34 | **Unpack a dictionary to a custom directory:**
35 | 
36 | ```sh
37 | dictutil unpack --output mydictionary dicthtml.zip
38 | ```
39 | 
40 | ## Details
41 | An unpacked dictdir contains:
42 | 
43 | - `words`: The parsed marisa word list (newline-separated).
44 | - `*.html`: The ungzipped dicthtml files.
45 | - `*`: Any additional files as-is.
46 | 


--------------------------------------------------------------------------------
/docs/examples/bgl-convert.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: bgl-convert
 4 | parent: examples
 5 | ---
 6 | 
 7 | # bgl-convert
 8 | Converts Babylon BGL dictionaries into dictfiles for use with dictgen.
 9 | 
10 | Paste the BGL text in the box below to convert it:
11 | 
12 | <iframe src="https://raw.githack.com/pgaskin/dictutil/master/examples/bgl-convert/index.html" style="border: 1px solid #000; width: 100%; height: 600px;"></iframe>
13 | 
14 | Example BGL:
15 | 
16 | ```
17 | ### metadata trimmed for brevity
18 | ### ...
19 | 
20 | headword
21 | Definition with <b>html</b> tags.
22 | 
23 | headword1|variant1|variant2
24 | The second definition. Blah
25 | blah blah blah.
26 | 
27 | 
28 | ```
29 | 


--------------------------------------------------------------------------------
/docs/examples/dictzip-decompile.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: dictzip-decompile
 4 | parent: examples
 5 | ---
 6 | 
 7 | # dictzip-decompile
 8 | This is an **experimental** tool to convert a dictzip into a dictfile. The output may not be perfect for complex dictionaries. The output should be perfect for dictionaries generated by Penelope.
 9 | 
10 | ## Usage
11 | 
12 | ```
13 | Usage: dictzip-decompile [options] dictzip
14 | 
15 | Options:
16 |   -o, --output string   The output filename (will be overwritten if it exists) (- is stdout) (default "./decompiled.df")
17 |   -r, --resources       Also extract referenced resources to the current directory (warning: any existing files will be overwritten, so it is recommended to run in an empty directory if enabled)
18 |   -h, --help            Show this help text
19 | 
20 | Arguments:
21 |   dictzip is the path to the dictzip to decompile.
22 | 
23 | To convert the resulting dictfile into a dictzip, use dictgen.
24 | 
25 | Note: The regenerated dictzip from the dictfile may not match exactly, but it will look the same, and certain bugs with prefixes and variants will be implicitly fixed by the conversion process (i.e. variant in wrong file, incorrect prefix, missing words in index file). All output is in raw HTML, not Markdown.
26 | 
27 | This is an experimental tool, and the output may not be perfect on complex dictionaries.
28 | ```
29 | 
30 | ## Example uses
31 | - Fixing prefixes or missing variants in dictzips generated by other tools (recompiling the dictfile will automatically fix the prefixes and variants).
32 | - Upgrading a v1 dictzip to v2 (same as above).
33 | - Decompiling a dictzip to merge it with another.
34 | - Converting a previously-created dictzip to a dictfile to make it easier to improve.
35 | - Converting StarDict dictionaries by converting to a dictzip using Penelope, then to a dictfile using this tool.
36 | 
37 | ## Notes
38 | The following dictzip generators have enhanced decompilation support:
39 | 
40 | - **Penelope:** The output should be perfect.
41 | - **Kobo (en, a few others):** The output should be mostly perfect, but there are a few missing edge cases. Variants (`&`) and header info (`:`) are extracted in addition to the entry content.
42 | - **Kobo (fr):** The output should be mostly perfect, but there are a few missing edge cases. Variants (`&`) and header info (`:`) are extracted in addition to the entry content.
43 | - **dictgen:** The output should be very close to the original dictfile (it has been tested with the output of gotdict-convert and webster1913-convert). With gotdict-convert, the only difference when the decompiled dictzip's dictfile was recompiled was the casing of a few entries in the words index. Even so, this should not be used unless the original dictfile has been lost. In addition, the original Markdown code and images are not recovered. Variants (`&`) and header info (`:` / `::`) are extracted in addition to the entry content.
44 | 
45 | Other dictzips only have the headword (`@`) and variants (`&`) extracted, and the content is included as-is as raw HTML without support for other dictfile features.
46 | 


--------------------------------------------------------------------------------
/docs/examples/gotdict-convert.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: gotdict-convert
 4 | parent: examples
 5 | ---
 6 | 
 7 | # gotdict-convert
 8 | This tool converts [gotdict](https://github.com/wjdp/gotdict) to a dictfile for conversion into a Kobo dictzip.
 9 | 
10 | Images are supported on firmware 4.20.14601+.
11 | 
12 | ## Download
13 | Pre-built dictionaries can be downloaded from the following links:
14 | - GOTDict *(with images, firmware 4.20.14601+)*: [dictzip (dicthtml-gt.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.df?branch=master&all=false&pr=false)
15 | - GOTDict *(without images)*: [dictzip (dicthtml-gt.noimg.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.noimg.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.noimg.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.noimg.df?branch=master&all=false&pr=false)
16 | 
17 | You can use [dictutil](../dictutil/install.html) to install the dictionaries, or see [here](../dicthtml/install.html) for manual installation instructions.
18 | 
19 | ## Usage
20 | 
21 | ```
22 | Usage: gotdict-convert [options]
23 | 
24 | Version: dev
25 | 
26 | Options:
27 |   -g, --gotdict string   The path to the local copy of github.com/wjdp/gotdict. (default "./gotdict")
28 |   -o, --output string    The output filename (will be overwritten if it exists) (- is stdout) (default "./gotdict.df")
29 |   -I, --images           Include images in dictfile
30 |   -h, --help             Show this help text
31 | 
32 | To convert the resulting dictfile into a dictzip, use dictgen.
33 | ```
34 | 
35 | You can also use the parser as a [Go library](https://pkg.go.dev/github.com/pgaskin/dictutil/examples/gotdict-convert/gotdict).
36 | 


--------------------------------------------------------------------------------
/docs/examples/index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: examples
 4 | has_children: true
 5 | ---
 6 | 
 7 | # Examples
 8 | 
 9 | This section contains some tools which make use of dictutil.
10 | {: .fs-6 .fw-300 }


--------------------------------------------------------------------------------
/docs/examples/webster1913-convert.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: webster1913-convert
 4 | parent: examples
 5 | ---
 6 | 
 7 | # webster1913-convert
 8 | This tool converts [Project Gutenberg's Webster's Unabridged Dictionary](http://www.gutenberg.org/ebooks/29765.txt.utf-8) into a dictfile for conversion into a Kobo dictzip.
 9 | 
10 | ## Download
11 | Pre-built dictionaries can be downloaded from the following links:
12 | - Webster's 1913 Dictionary: [dictzip (dicthtml-wb.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/dicthtml-wb.zip?branch=master&all=false&pr=false), [source dictfile (webster1913.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/webster1913.df?branch=master&all=false&pr=false)
13 | 
14 | You can use [dictutil](../dictutil/install.html) to install the dictionaries, or see [here](../dicthtml/install.html) for manual installation instructions.
15 | 
16 | ## Usage
17 | 
18 | ```
19 | Usage: webster1913-convert [options] gutenberg_webster1913_path
20 | 
21 | Options:
22 |   -o, --output string   The output filename (will be overwritten if it exists) (- is stdout) (default "./webster1913.df")
23 |       --dump            Instead of converting, dump the parsed dictionary to stdout as JSON (for debugging)
24 |   -h, --help            Show this help text
25 | 
26 | Arguments:
27 |   gutenberg_webster1913_path is the path to Project Gutenberg's Webster's 1913 dictionary. Use - to read from stdin.
28 | 
29 | To convert the resulting dictfile into a dictzip, use dictgen.
30 | ```
31 | 
32 | The source dictionary can be downloaded [here](http://www.gutenberg.org/ebooks/29765.txt.utf-8) or [here](https://github.com/pgaskin/dictserver/raw/master/data/dictionary.txt).
33 | 
34 | You can also use the parser as a [Go library](https://pkg.go.dev/github.com/pgaskin/dictutil/examples/webster1913-convert/webster1913).
35 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Documentation
 4 | nav_order: 1
 5 | ---
 6 | 
 7 | # Dictutil
 8 | {: .fs-9 }
 9 | 
10 | A collection of documentation and tools for working with Kobo dictionaries.
11 | {: .fs-6 .fw-300 }
12 | 
13 | [Download](https://github.com/pgaskin/dictutil/releases){: .btn .btn-primary .fs-5 .mb-4 .mb-md-0 .mr-2 } [dicthtml](./dicthtml/){: .btn .fs-5 .mb-4 .mb-md-0 } [dictgen](./dictgen/){: .btn .fs-5 .mb-4 .mb-md-0 } [dictutil](./dictutil/){: .btn .fs-5 .mb-4 .mb-md-0 }
14 | 
15 | ---
16 | 
17 | **Prebuilt dictionaries:**
18 | 
19 | [GOTDict](./examples/gotdict-convert.html#download){: .btn .fs-3 .mb-1 .mb-md-0 } [Webster's 1913 Unabridged Dictionary](./examples/webster1913-convert.html#download){: .btn .fs-3 .mb-1 .mb-md-0 }
20 | 
21 | ---
22 | 
23 | These tools are designed to work with v2 dictionaries (4.7.10364+).
24 | 
25 | ## Getting started
26 | If you're interested in creating dictionaries, look at the [dictgen documentation](./dictgen/). If you're interested in installing or manipulating existing dictionaries, see the [dictutil documentation](./dictutil/). Otherwise, see the [dicthtml documentation](./dicthtml/) for more information about the Kobo dictionary format.
27 | 
28 | ## dicthtml
29 | These pages are some notes I've made about the Kobo dictionary format based on reverse engineering the firmware and the official dictionaries.
30 | 
31 | - **[Format](./dicthtml/format.html):** About the Kobo dictionary format.
32 | - **[Prefixes](./dicthtml/prefixes.html):** Details about prefix calculation.
33 | - **[v1/v2 dictionaries](./dicthtml/v1v2.html):** Changes between v1/v2 dictionaries.
34 | - **[Installing custom dictionaries](./dicthtml/install.html):** Notes about sideloading dictionaries.
35 | 
36 | ## dictutil
37 | dictutil is a low-level tool to unpack, pack, and perform other operations on Kobo dictzips.
38 | 
39 | - **[Dictutil](./dictutil/)**
40 | - **[Install](./dictutil/install.html):** Install a dictzip.
41 | - **[Uninstall](./dictutil/uninstall.html):** Uninstall a dictzip.
42 | - **[Pack](./dictutil/pack.html):** Pack a dictzip from a dictdir.
43 | - **[Unpack](./dictutil/unpack.html):** Unpack a dictzip into a dictdir.
44 | - **[Prefix](./dictutil/prefix.html):** Calculate the dicthtml prefix for a word.
45 | 
46 | ## dictgen
47 | dictgen is an easy-to-use tool/library to generate Kobo dictionaries from scratch or use in conversion scripts. It deals with all the unusual bits (e.g. variant capitalization, prefix generation, etc) for you and gives warnings when it can't.
48 | 
49 | - **[Dictgen](./dictgen#usage)**
50 | - **[Dictfile format](./dictgen#dictfile-format)**
51 | 
52 | ## examples
53 | These are some tools which make use of dictutil to convert actual dictionaries.
54 | 
55 | - **[gotdict-convert](./examples/gotdict-convert.html):** Converts [github.com/wjdp/gotdict](https://github.com/wjdp/gotdict) to a dictfile.
56 | - **[webster1913-convert](./examples/webster1913-convert.html):** Converts [Project Gutenberg's Webster's Unabridged Dictionary](http://www.gutenberg.org/ebooks/29765.txt.utf-8) to a dictfile.
57 | - **[dictzip-decompile](./examples/dictzip-decompile.html):** An **experimental** tool to convert a dictzip into a dictfile.
58 | - **[bgl-convert](./examples/bgl-convert.html):** A simple tool to convert Babylon BGL dictionaries to a dictfile.
59 | 
60 | ## other
61 | 
62 | - **[dictword-test](https://github.com/pgaskin/kobo-mods/tree/master/dictword-test):** Calculates word prefixes using libnickel.
63 | - **[marisa](https://github.com/pgaskin/dictutil/tree/master/marisa):** Marisa bindings for Go.
64 | 


--------------------------------------------------------------------------------
/examples/bgl-convert/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | 
  3 | <html lang="en">
  4 | <meta charset="utf-8">
  5 | <title>BGL Converter</title>
  6 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
  7 | <meta name="description" content="Converts a Babylon BGL dictionary in text format into a dictfile for dictgen.">
  8 | 
  9 | <style>
 10 | * {
 11 |     box-sizing: border-box;
 12 | }
 13 | #app {
 14 |     display: flex;
 15 |     flex-direction: row;
 16 |     align-items: stretch;
 17 |     position: fixed;
 18 |     top: 0;
 19 |     left: 0;
 20 |     right: 0;
 21 |     bottom: 0;
 22 | }
 23 | #app textarea {
 24 |     flex: 1;
 25 |     resize: none;
 26 |     padding: 1em;
 27 |     background: #eee;
 28 |     border: 0 solid #000;
 29 |     font: normal 12px/1.3 monospace;
 30 | }
 31 | @media screen and (min-width: 600px) {
 32 |     #app #input {
 33 |         flex: 0 0 auto;
 34 |         resize: horizontal;
 35 |         min-width: 25%;
 36 |         max-width: 75%;
 37 |         width: 50%;
 38 |         min-height: 100%;
 39 |         max-height: 100%;
 40 |         border-width: 0 1px 0 0;
 41 |     }
 42 | }
 43 | @media screen and (max-width: 600px) {
 44 |     #app {
 45 |         flex-direction: column;
 46 |     }
 47 |     #app #input {
 48 |         flex: 0 0 auto;
 49 |         resize: vertical;
 50 |         min-height: 25%;
 51 |         max-height: 75%;
 52 |         height: 50%;
 53 |         min-width: 100%;
 54 |         max-width: 100%;
 55 |         border-width: 0 0 1px 0;
 56 |     }
 57 | }
 58 | </style>
 59 | 
 60 | <div id="app">
 61 | <textarea id="input" placeholder="Input Babylon BGL text"></textarea>
 62 | <textarea id="output" placeholder="Output dictfile" autocomplete="off" readonly></textarea>
 63 | </div>
 64 | 
 65 | <script>
 66 | const input = document.getElementById("input")
 67 | const output = document.getElementById("output")
 68 | 
 69 | window.addEventListener("load", update)
 70 | input.addEventListener("input", update)
 71 | 
 72 | function update() {
 73 |     output.value = makeDictfile(parseBGL(input.value))
 74 | }
 75 | 
 76 | function parseBGL(bgl) {
 77 |     let entries = []
 78 |     let entry = null
 79 |     for (const line of bgl.split(/\r?\n/)) {
 80 |         if (line.startsWith("###"))
 81 |             continue
 82 |         const trimmed = line.trim()
 83 |         const empty = trimmed == "";
 84 |         if (!entry) {
 85 |             if (!empty) {
 86 |                 let spl = trimmed.split(/\s*\|\s*/)
 87 |                 entry = {
 88 |                     headword: spl[0],
 89 |                     variant:  spl.splice(1),
 90 |                     html:     ``,
 91 |                 }
 92 |             }
 93 |         } else if (empty) {
 94 |             entries.push(entry)
 95 |             entry = null
 96 |         } else {
 97 |             entry.html += trimmed + "\n"
 98 |         }
 99 |     }
100 |     if (entry)
101 |         entries.push(entry)
102 |     return entries
103 | }
104 | 
/**
 * Serializes entries (as returned by parseBGL) into dictfile text.
 *
 * Each entry is written as an "@ headword" line, one "& variant" line per
 * variant, a "<html>" line, and finally the definition. Entries are joined
 * with a newline.
 *
 * @param {{headword: string, variant: string[], html: string}[]} parsedBGL
 * @returns {string} The dictfile text.
 */
function makeDictfile(parsedBGL) {
    const blocks = []
    for (const {headword, variant, html} of parsedBGL) {
        const lines = [`@ ${headword}`]
        for (const v of variant)
            lines.push(`& ${v}`)
        lines.push(`<html>`, html)
        blocks.push(lines.join("\n"))
    }
    return blocks.join("\n")
}
113 | </script>
114 | 


--------------------------------------------------------------------------------
/examples/dictzip-decompile/main.go:
--------------------------------------------------------------------------------
  1 | // Command dictzip-decompile converts a dictzip into a dictfile. The regenerated
  2 | // dictzip from the dictfile may not match exactly, but it will look the same,
  3 | // and certain bugs with prefixes and variants will be implicitly fixed by the
  4 | // conversion process (i.e. variant in wrong file, incorrect prefix, missing
  5 | // words in index file). All output is in raw HTML, not Markdown.
  6 | //
  7 | // This is an experimental tool, and the output may not be perfect on complex
  8 | // dictionaries.
  9 | package main
 10 | 
import (
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"

	"github.com/pgaskin/dictutil/kobodict"
	"github.com/spf13/pflag"

	_ "github.com/pgaskin/dictutil/kobodict/marisa"
)
 21 | 
 22 | var version = "dev"
 23 | 
 24 | func main() {
 25 | 	pflag.CommandLine.SortFlags = false
 26 | 	output := pflag.StringP("output", "o", "."+string(os.PathSeparator)+"decompiled.df", "The output filename (will be overwritten if it exists) (- is stdout)")
 27 | 	resources := pflag.BoolP("resources", "r", false, "Also extract referenced resources to the current directory (warning: any existing files will be overwritten, so it is recommended to run in an empty directory if enabled)")
 28 | 	help := pflag.BoolP("help", "h", false, "Show this help text")
 29 | 	pflag.Parse()
 30 | 
 31 | 	if *help || pflag.NArg() != 1 {
 32 | 		fmt.Fprintf(os.Stderr, "Usage: %s [options] dictzip\n\nVersion: dictzip-decompile %s\n\nOptions:\n%s\nArguments:\n  dictzip is the path to the dictzip to decompile.\n\nTo convert the resulting dictfile into a dictzip, use dictgen.\n\nNote: The regenerated dictzip from the dictfile may not match exactly, but it will look the same, and certain bugs with prefixes and variants will be implicitly fixed by the conversion process (i.e. variant in wrong file, incorrect prefix, missing words in index file). All output is in raw HTML, not Markdown.\n\nThis is an experimental tool, and the output may not be perfect on complex dictionaries.\n", os.Args[0], version, pflag.CommandLine.FlagUsages())
 33 | 		if pflag.NArg() != 0 {
 34 | 			os.Exit(2)
 35 | 		} else {
 36 | 			os.Exit(0)
 37 | 		}
 38 | 		return
 39 | 	}
 40 | 
 41 | 	fn := pflag.Args()[0]
 42 | 
 43 | 	fmt.Fprintf(os.Stderr, "Opening input dictzip.\n")
 44 | 	f, err := os.Open(fn)
 45 | 	if err != nil {
 46 | 		fmt.Fprintf(os.Stderr, "Error: open input file %#v: %v.\n", fn, err)
 47 | 		os.Exit(1)
 48 | 		return
 49 | 	}
 50 | 	defer f.Close()
 51 | 
 52 | 	s, err := f.Stat()
 53 | 	if err != nil {
 54 | 		fmt.Fprintf(os.Stderr, "Error: stat input file %#v: %v.\n", fn, err)
 55 | 		os.Exit(1)
 56 | 		return
 57 | 	}
 58 | 
 59 | 	fmt.Fprintf(os.Stderr, "Parsing dictzip.\n")
 60 | 	dr, err := kobodict.NewReader(f, s.Size())
 61 | 	if err != nil {
 62 | 		fmt.Fprintf(os.Stderr, "Error: parse input file %#v: %v.\n", fn, err)
 63 | 		os.Exit(1)
 64 | 		return
 65 | 	}
 66 | 
 67 | 	fmt.Fprintf(os.Stderr, "Decompiling dictzip.\n")
 68 | 	df, err := decompile(dr)
 69 | 	if err != nil {
 70 | 		fmt.Fprintf(os.Stderr, "Error: decompile dictzip %#v: %v.\n", fn, err)
 71 | 		os.Exit(1)
 72 | 		return
 73 | 	}
 74 | 
 75 | 	if *resources {
 76 | 		fmt.Fprintf(os.Stderr, "Extracting resources.\n")
 77 | 		for _, f := range dr.File {
 78 | 			fmt.Fprintf(os.Stderr, "  ./%s\n", f.Name)
 79 | 			if err := func() error {
 80 | 				rc, err := f.Open()
 81 | 				if err != nil {
 82 | 					return fmt.Errorf("open: %w", err)
 83 | 				}
 84 | 				defer rc.Close()
 85 | 
 86 | 				f, err := os.OpenFile(f.Name, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
 87 | 				if err != nil {
 88 | 					return fmt.Errorf("create output: %w", err)
 89 | 				}
 90 | 				defer f.Close()
 91 | 
 92 | 				if _, err := io.Copy(f, rc); err != nil {
 93 | 					return fmt.Errorf("copy: %w", err)
 94 | 				}
 95 | 
 96 | 				if err := f.Close(); err != nil {
 97 | 					return fmt.Errorf("write output: %w", err)
 98 | 				}
 99 | 
100 | 				return nil
101 | 			}(); err != nil {
102 | 				fmt.Fprintf(os.Stderr, "Error: extract resource %#v: %v.\n", f.Name, err)
103 | 				os.Exit(1)
104 | 				return
105 | 			}
106 | 		}
107 | 	} else {
108 | 		if len(dr.File) != 0 {
109 | 			fmt.Fprintf(os.Stderr, "Warning: dictfile contains %d resources, but skipping because resource extraction is not enabled (see --help for more details).\n", len(dr.File))
110 | 		}
111 | 	}
112 | 
113 | 	fmt.Fprintf(os.Stderr, "Writing dictfile.\n")
114 | 	switch *output {
115 | 	case "-":
116 | 		if err := df.WriteDictFile(os.Stdout); err != nil {
117 | 			fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
118 | 			os.Exit(1)
119 | 			return
120 | 		}
121 | 	default:
122 | 		f, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
123 | 		if err != nil {
124 | 			fmt.Fprintf(os.Stderr, "Error: create dictfile: %v\n", err)
125 | 			os.Exit(1)
126 | 			return
127 | 		}
128 | 
129 | 		if err := df.WriteDictFile(f); err != nil {
130 | 			f.Close()
131 | 			fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
132 | 			os.Exit(1)
133 | 			return
134 | 		}
135 | 
136 | 		if err := f.Close(); err != nil {
137 | 			fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
138 | 			os.Exit(1)
139 | 			return
140 | 		}
141 | 	}
142 | 
143 | 	fmt.Fprintf(os.Stderr, "Successfully converted %d entries from dictzip %#v to dictfile %s.\n", len(df), fn, *output)
144 | 	os.Exit(0)
145 | }
146 | 


--------------------------------------------------------------------------------
/examples/dictzip-decompile/parse.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"crypto/sha1"
  6 | 	"fmt"
  7 | 	"io/ioutil"
  8 | 	"regexp"
  9 | 	"unicode"
 10 | 
 11 | 	"github.com/pgaskin/dictutil/dictgen"
 12 | 	"github.com/pgaskin/dictutil/kobodict"
 13 | )
 14 | 
 15 | // This isn't exposed as a separate package, as it's subject to change and
 16 | // highly specific to dictzip-decompile.
 17 | 
 18 | // The regexps used to extract data should have a similar level of strictness as
 19 | // the ones used by nickel (for simplicity, compatibility, and predictability).
 20 | 
 21 | // decompile decompiles a dictzip into a dictfile. External resources are not
 22 | // extracted, and must be done separately.
 23 | //
 24 | // Duplicate entries (e.g. the ones added by dictgen for fixing broken variants)
 25 | // are collapsed into one. They will be expanded again as necessary when the
 26 | // dictfile is compiled by dictgen.
 27 | func decompile(r *kobodict.Reader) (dictgen.DictFile, error) {
 28 | 	var df dictgen.DictFile
 29 | 	seenEntries := map[[20]byte]struct{}{}
 30 | 	for _, dh := range r.Dicthtml {
 31 | 		if err := func() error {
 32 | 			rc, err := dh.Open()
 33 | 			if err != nil {
 34 | 				return fmt.Errorf("open: %w", err)
 35 | 			}
 36 | 			defer rc.Close()
 37 | 
 38 | 			buf, err := ioutil.ReadAll(rc)
 39 | 			if err != nil {
 40 | 				return fmt.Errorf("read: %w", err)
 41 | 			}
 42 | 
 43 | 			es, err := extractEntries(buf)
 44 | 			if err != nil {
 45 | 				return fmt.Errorf("extract entries: %w", err)
 46 | 			}
 47 | 
 48 | 			for _, e := range es {
 49 | 				ss := sha1.Sum(e)
 50 | 				if _, ok := seenEntries[ss]; ok {
 51 | 					continue
 52 | 				}
 53 | 				seenEntries[ss] = struct{}{}
 54 | 
 55 | 				de, err := decompileEntry(e)
 56 | 				if err != nil {
 57 | 					return fmt.Errorf("decompile entry %#v: %w", string(e), err)
 58 | 				}
 59 | 
 60 | 				df = append(df, de)
 61 | 			}
 62 | 
 63 | 			return nil
 64 | 		}(); err != nil {
 65 | 			return nil, fmt.Errorf("process dicthtml %#v: %w", dh.Name, err)
 66 | 		}
 67 | 	}
 68 | 	return df, nil
 69 | }
 70 | 
 71 | // The regexps/vars used by decompileEntry.
 72 | var (
 73 | 	// generator matchers (match the entire entry, split into parts) (match in order) (don't include variants here)
 74 | 	generator1PenelopeRe         = regexp.MustCompile(`^(?s)<a name="([^"]+)"\/><div><b>([^<]+)<\/b><br\/>(.+)<\/div>$`)                                        // also: first and second groups must be equal
 75 | 	generator2KoboFrRe           = regexp.MustCompile(`^(?s)<p><a name="([^"]+)" ?(?:\/>|><\/a>)<b>\s*([^<]+)\s*<\/b>\s*(.*?)<br ?\/><br ?\/>\s*(.+)\s*<\/p>$`) // also: 2nd and 3rd (header) group must not contain "<br", "<li", "<var", "<p"; also: need to wrap returned content in a p tag
 76 | 	generator3KoboEnOrDictutilRe = regexp.MustCompile(`^(?s)<p><a name="([^"]+)" ?(?:\/>|><\/a>)<b>\s*(.+?)\s*<\/b>\s*(.*?)\s*<\/p>\s*(.+)\s*$`)                // also: 2nd and 3rd (header) group must not contain "<br", "<li", "<var", "<p"
 77 | 	// fallback matchers (if none of the above exist)
 78 | 	headFallbackIndexWordRe = regexp.MustCompile(`<a name="([^"]+)" ?(?:\/>|><\/a>)`) // this is slightly more lenient than some of Kobo's (it makes the space before the closing optional)
 79 | 	// other matchers
 80 | 	variantsRe     = regexp.MustCompile(`<var>(.*?)<\/var>`)
 81 | 	variantsItemRe = regexp.MustCompile(`<variant name="([^"]+)" ?(?:\/>|><\/variant>)`)
 82 | )
 83 | 
 84 | // decompileEntry parses an entry (it must be trimmed).
 85 | func decompileEntry(buf []byte) (*dictgen.DictFileEntry, error) {
 86 | 	var entry dictgen.DictFileEntry
 87 | 
 88 | 	// Generator-specific enhanced extraction (for making use of dictfile lines
 89 | 	// starting with &, :, etc).
 90 | 	var generatorMatched bool
 91 | 	// -- Penelope: https://github.com/pettarin/penelope/blob/fce6dcfd899d3755ae3a5a3867d7d436105ada56/penelope/format_kobo.py#L167
 92 | 	//    e.g. <w><a name="dfgdfg"/><div><b>dfgdfg</b><br/>Penelope</div>sdfsdf</div></w>
 93 | 	if !generatorMatched {
 94 | 		if m := generator1PenelopeRe.FindSubmatch(buf); len(m) != 0 {
 95 | 			headwordIndex, headwordDisplay, contentHTML := m[1], m[2], m[3]
 96 | 			if !bytes.Equal(headwordIndex, headwordDisplay) {
 97 | 				// it's a false positive if those aren't identical
 98 | 			} else {
 99 | 				entry.Headword = string(headwordIndex)
100 | 				entry.RawHTML = true
101 | 				entry.Definition = string(contentHTML)
102 | 				generatorMatched = true
103 | 			}
104 | 		}
105 | 	}
106 | 	// -- Kobo: based on dicthtml-fr
107 | 	//    e.g. <w><p><a name="a-"/><b>a-, an-</b><br/><br/><ol> <li>Élément exprimant la négation ( pas ), ou la privation ( sans ). </li>&nbsp;&nbsp;&nbsp;⇒anormal, apolitique. </ol></p></w>
108 | 	if !generatorMatched {
109 | 		if m := generator2KoboFrRe.FindSubmatch(buf); len(m) != 0 {
110 | 			headwordIndex, headwordDisplay, headerInfo, contentHTML := m[1], m[2], m[3], m[4]
111 | 			if bytes.Contains(headwordDisplay, []byte("<br")) || bytes.Contains(headerInfo, []byte("<br")) {
112 | 				// it's a false positive if those contain line breaks
113 | 			} else if bytes.Contains(headwordDisplay, []byte("<li")) || bytes.Contains(headerInfo, []byte("<li")) {
114 | 				// it's a false positive if those contain list items
115 | 			} else if bytes.Contains(headwordDisplay, []byte("<var")) || bytes.Contains(headerInfo, []byte("<var")) {
116 | 				// it's a false positive if those contain variants
117 | 			} else if bytes.Contains(headwordDisplay, []byte("<p")) || bytes.Contains(headerInfo, []byte("<p")) {
118 | 				// it's a false positive if those contain new paragraphs
119 | 			} else {
120 | 				if bytes.EqualFold(headwordIndex, headwordDisplay) {
121 | 					entry.Headword = string(headwordDisplay)
122 | 				} else {
123 | 					entry.Headword = string(headwordIndex)
124 | 				}
125 | 				entry.RawHTML = true
126 | 				entry.HeaderInfo = string(headerInfo)
127 | 				entry.Definition = "<p>" + string(contentHTML) + "</p>"
128 | 				generatorMatched = true
129 | 			}
130 | 		}
131 | 	}
132 | 	// -- Kobo: based on dicthtml-en, a few others
133 | 	//    e.g. <w><p><a name="ab"></a><b>ab</b> [<pr>'ab</pr>] -n</p><var><variant name="variant-added-for-testing"/></var><p><ol><li>an abdominal muscle usu. used in pl.</li><li>about</li></ol></p></w>
134 | 	// -- or dictgen
135 | 	//    e.g. <w><p><a name="a" /><b>a</b> A (# emph. #).</p><var><variant name="variant-added-for-testing"/></var><ol><li>Etym: [Shortened form of an. AS. an one. See One.] An adjective, commonly called the indefinite article, and signifying one or any, but less emphatically.</li><li>&#34;At a birth&#34;; &#34;In a word&#34;; &#34;At a blow&#34;. Shak. Note: It is placed before nouns of the singular number denoting an individual object, or a quality individualized, before collective nouns, and also before plural nouns when the adjective few or the phrase great many or good many is interposed; as, a dog, a house, a man; a color; a sweetness; a hundred, a fleet, a regiment; a few persons, a great many days. It is used for an, for the sake of euphony, before words beginning with a consonant sound [for exception of certain words beginning with h, see An]; as, a table, a woman, a year, a unit, a eulogy, a ewe, a oneness, such a one, etc. Formally an was used both before vowels and consonants.</li><li>Etym: [Originally the preposition a (an, on).] In each; to or for each; as, &#34;twenty leagues a day&#34;, &#34;a hundred pounds a year&#34;, &#34;a dollar a yard&#34;, etc.</li></ol></w>
136 | 	if !generatorMatched {
137 | 		if m := generator3KoboEnOrDictutilRe.FindSubmatch(buf); len(m) != 0 {
138 | 			headwordIndex, headwordDisplay, headerInfo, contentHTML := m[1], m[2], m[3], m[4]
139 | 			if bytes.Contains(headwordDisplay, []byte("<br")) || bytes.Contains(headerInfo, []byte("<br")) {
140 | 				// it's a false positive if those contain line breaks
141 | 			} else if bytes.Contains(headwordDisplay, []byte("<li")) || bytes.Contains(headerInfo, []byte("<li")) {
142 | 				// it's a false positive if those contain list items
143 | 			} else if bytes.Contains(headwordDisplay, []byte("<var")) || bytes.Contains(headerInfo, []byte("<var")) {
144 | 				// it's a false positive if those contain variants
145 | 			} else if bytes.Contains(headwordDisplay, []byte("<p")) || bytes.Contains(headerInfo, []byte("<p")) {
146 | 				// it's a false positive if those contain new paragraphs
147 | 			} else {
148 | 				if bytes.EqualFold(headwordIndex, headwordDisplay) {
149 | 					entry.Headword = string(headwordDisplay)
150 | 				} else {
151 | 					entry.Headword = string(headwordIndex)
152 | 				}
153 | 				entry.RawHTML = true
154 | 				entry.HeaderInfo = string(headerInfo)
155 | 				entry.Definition = string(contentHTML)
156 | 				generatorMatched = true
157 | 			}
158 | 		}
159 | 	}
160 | 	// -- Fallback: extract (then remove) the first headword, rest goes in raw html definition.
161 | 	//    e.g. <w><a name="test"><p>dfkgjdlfjglkdfjg</p><var><variant name="asd"/></var></w>
162 | 	if !generatorMatched {
163 | 		entry.NoHeader = true
164 | 		entry.RawHTML = true
165 | 		entry.Definition = string(headFallbackIndexWordRe.ReplaceAllFunc(buf, func(src []byte) []byte {
166 | 			if entry.Headword != "" {
167 | 				return src // don't continue after the first headword has been found
168 | 			}
169 | 			entry.Headword = string(headFallbackIndexWordRe.FindSubmatch(src)[1])
170 | 			return nil // remove the entire a tag
171 | 		}))
172 | 		if entry.Headword == "" {
173 | 			return nil, fmt.Errorf("no headword found in %#v", string(buf))
174 | 		}
175 | 		generatorMatched = true
176 | 	}
177 | 
178 | 	// Add any additional headwords (then remove) (which really shouldn't be there in the first place) as variants.
179 | 	// i.e. stray <a name="..."> tags (but not if the link has text, because then it's not a headword anymore)
180 | 	entry.Definition = string(headFallbackIndexWordRe.ReplaceAllFunc([]byte(entry.Definition), func(src []byte) []byte {
181 | 		entry.Variant = append(entry.Variant, string(headFallbackIndexWordRe.FindSubmatch(src)[1]))
182 | 		return nil // remove the entire a tag
183 | 	}))
184 | 
185 | 	// Append (then remove) any variants found in the raw html.
186 | 	// i.e. <var> tags inside <variant> ones
187 | 	entry.Definition = string(variantsRe.ReplaceAllFunc([]byte(entry.Definition), func(src []byte) []byte {
188 | 		for _, m := range variantsItemRe.FindAllSubmatch(src, -1) {
189 | 			entry.Variant = append(entry.Variant, string(m[1]))
190 | 		}
191 | 		return nil // remove the entire variant tag
192 | 	}))
193 | 
194 | 	return &entry, nil
195 | }
196 | 
197 | // The regexps/vars used by extractEntries.
198 | var (
199 | 	htmlStart = []byte("<html>")
200 | 	htmlEnd   = []byte("</html>")
201 | 	entryRe   = regexp.MustCompile(`(?s)<w>\s*(.+?)\s*<\/w>`)
202 | )
203 | 
204 | // extractEntries gets the trimmed body of each entry in the dicthtml file.
205 | func extractEntries(buf []byte) ([][]byte, error) {
206 | 	if idx := bytes.Index(buf, htmlStart); idx < 0 {
207 | 		return nil, fmt.Errorf("missing %s tag", string(htmlStart))
208 | 	} else {
209 | 		buf = buf[idx+len(htmlStart):]
210 | 	}
211 | 
212 | 	if idx := bytes.LastIndex(buf, htmlEnd); idx < 0 {
213 | 		return nil, fmt.Errorf("missing %s tag", string(htmlStart))
214 | 	} else {
215 | 		buf = buf[:idx]
216 | 	}
217 | 
218 | 	var entries [][]byte
219 | 
220 | 	var cur, prev, body []int
221 | 	prev = []int{0, 0}
222 | 	for _, m := range entryRe.FindAllSubmatchIndex(buf, -1) {
223 | 		cur, body = m[0:2][:], m[2:4]
224 | 		for _, b := range buf[prev[1]:cur[0]] {
225 | 			// note: even though we might split up multi-byte utf-8 chars
226 | 			// here, it's fine, as the whitespace should be ascii if any,
227 | 			// and if there is anything else, it's an issue.
228 | 			if !unicode.IsSpace(rune(b)) {
229 | 				return nil, fmt.Errorf("non-whitespace between word entries (%#v in %#v before %#v)", string(rune(b)), string(buf[prev[1]:cur[0]]), string(buf[cur[0]:cur[1]]))
230 | 			}
231 | 		}
232 | 		prev = cur
233 | 		entries = append(entries, buf[body[0]:body[1]])
234 | 	}
235 | 	for _, b := range buf[prev[1]:] {
236 | 		if !unicode.IsSpace(rune(b)) {
237 | 			return nil, fmt.Errorf("non-whitespace after last word entry (%#v in %#v)", string(rune(b)), string(buf[prev[1]:]))
238 | 		}
239 | 	}
240 | 
241 | 	return entries, nil
242 | }
243 | 


--------------------------------------------------------------------------------
/examples/gotdict-convert/gotdict/parser.go:
--------------------------------------------------------------------------------
  1 | // Package gotdict parses GOTDict (https://github.com/wjdp/gotdict).
  2 | package gotdict
  3 | 
  4 | import (
  5 | 	"bytes"
  6 | 	"fmt"
  7 | 	"io/ioutil"
  8 | 	"os"
  9 | 	"path/filepath"
 10 | 	"regexp"
 11 | 	"sort"
 12 | 	"strings"
 13 | 	"unicode"
 14 | 
 15 | 	"gopkg.in/yaml.v2"
 16 | )
 17 | 
 18 | // Dict represents the Dict.
 19 | type Dict []*Def
 20 | 
 21 | // Def represents a definition.
 22 | type Def struct {
 23 | 	// Title is the main title of the definition (it may contain spaces) (i.e. Tyrion Lannister).
 24 | 	Title string
 25 | 	// Terms are other forms of the title which should be recognized.
 26 | 	Terms []string
 27 | 	// Type is the record type. Currently, not many entries have one.
 28 | 	Type Type
 29 | 	// Images contains referenced image files.
 30 | 	Images map[string][]byte
 31 | 	// Definition contains the Markdown definition.
 32 | 	Definition string
 33 | }
 34 | 
 35 | // Type is a Dict record type.
 36 | type Type string
 37 | 
 38 | const (
 39 | 	// TypeUnknown is used for definitions without a type set (i.e. before types were used).
 40 | 	TypeUnknown Type = ""
 41 | 	// TypeCharacter is a character (e.g. Jon, Tyrion).
 42 | 	TypeCharacter Type = "character"
 43 | 	// TypeHouse is a house (e.g. Lannister, Stark).
 44 | 	TypeHouse Type = "house"
 45 | 	// TypeEvent is an event in time.
 46 | 	TypeEvent Type = "event"
 47 | 	// TypeCity is a city.
 48 | 	TypeCity Type = "city"
 49 | 	// TypeLocation is a location (e.g. King's Landing).
 50 | 	TypeLocation Type = "location"
 51 | 	// TypeRiver is a river.
 52 | 	TypeRiver Type = "river"
 53 | 	// TypeShip is a ship.
 54 | 	TypeShip Type = "ship"
 55 | 	// TypeWord is an uncommon or ASOIAF-specific word.
 56 | 	TypeWord Type = "word"
 57 | )
 58 | 
 59 | // Parse parses the Dict. If imgdir is an empty string, images are removed. If
 60 | // imgref is true, image paths are set to the full filepath rather than reading
 61 | // the images to memory.
 62 | func Parse(defdir, imgdir string, imgref bool) (Dict, error) {
 63 | 	var dict Dict
 64 | 
 65 | 	fis, err := ioutil.ReadDir(defdir)
 66 | 	if err != nil {
 67 | 		return nil, err
 68 | 	}
 69 | 
 70 | 	seen := map[string]*Def{}
 71 | 	for _, fi := range fis {
 72 | 		if filepath.Ext(fi.Name()) != ".mdd" {
 73 | 			continue
 74 | 		}
 75 | 
 76 | 		buf, err := ioutil.ReadFile(filepath.Join(defdir, fi.Name()))
 77 | 		if err != nil {
 78 | 			return nil, err
 79 | 		}
 80 | 
 81 | 		var obj struct {
 82 | 			Title string   `yaml:"title"`
 83 | 			Terms []string `yaml:"terms"`
 84 | 			Type  Type     `yaml:"type"`
 85 | 		}
 86 | 
 87 | 		md, err := unmarshalStrictFrontMatter(buf, &obj)
 88 | 		if err != nil {
 89 | 			return nil, fmt.Errorf("parse %s frontmatter: %w", fi.Name(), err)
 90 | 		} else if obj.Title == "" {
 91 | 			return nil, fmt.Errorf("parse %s frontmatter: title not set", fi.Name())
 92 | 		}
 93 | 
 94 | 		def := &Def{}
 95 | 
 96 | 		obj.Title = strings.TrimSpace(obj.Title)
 97 | 		if odef, ok := seen[obj.Title]; ok {
 98 | 			return nil, fmt.Errorf("parse %s: already seen %#v in other def %#v", fi.Name(), def.Title, odef)
 99 | 		}
100 | 		seen[obj.Title] = def
101 | 		def.Title = obj.Title
102 | 
103 | 		for _, term := range obj.Terms {
104 | 			term = strings.TrimSpace(term)
105 | 			if odef, ok := seen[term]; ok && term != "Jon Umber" { // it's usually a mistake to have duplicate terms (but remember that dictgen will handle them fine)
106 | 				return nil, fmt.Errorf("parse %s: already seen term %#v in other def %#v", fi.Name(), term, odef)
107 | 			}
108 | 			seen[term] = def
109 | 			def.Terms = append(def.Terms, term)
110 | 		}
111 | 
112 | 		def.Type = Type(strings.TrimSpace(string(obj.Type)))
113 | 		def.Images = map[string][]byte{}
114 | 		def.Definition = string(md)
115 | 
116 | 		if imgdir == "" {
117 | 			def.Definition = regexp.MustCompile(`(\s*Map on [Nn]ext [Pp]age\.?)|(\s*\(Map on [Nn]ext [Pp]age\.?\))|(!\[[^]]*\]\([^)]+\))`).ReplaceAllLiteralString(def.Definition, "")
118 | 		} else {
119 | 			var repl []string
120 | 			for _, img := range regexp.MustCompile(`!\[[^]]*\]\((images/)?([^)]+)\)`).FindAllStringSubmatch(def.Definition, -1) {
121 | 				if img[1] == "" {
122 | 					return nil, fmt.Errorf("parse %s: unknown image path %#v", fi.Name(), img[1])
123 | 				}
124 | 				fn, err := filepath.Abs(filepath.Join(imgdir, img[2]))
125 | 				if err != nil {
126 | 					return nil, fmt.Errorf("parse %s: resolve image %#v: %w", fi.Name(), img[1], err)
127 | 				}
128 | 				if imgref {
129 | 					if _, err := os.Stat(fn); err != nil {
130 | 						return nil, fmt.Errorf("parse %s: stat image %#v: %w", fi.Name(), img[1], err)
131 | 					}
132 | 					repl = append(repl, "("+img[1]+img[2]+")", "("+fn+")")
133 | 				} else {
134 | 					imgbuf, err := ioutil.ReadFile(fn)
135 | 					if err != nil {
136 | 						return nil, fmt.Errorf("parse %s: read image %#v: %w", fi.Name(), img[1], err)
137 | 					}
138 | 					def.Images[img[2]] = imgbuf
139 | 					repl = append(repl, "("+img[1]+img[2]+")", "("+img[2]+")")
140 | 				}
141 | 			}
142 | 			def.Definition = strings.NewReplacer(repl...).Replace(def.Definition)
143 | 		}
144 | 
145 | 		def.Definition = strings.TrimSpace(def.Definition)
146 | 
147 | 		dict = append(dict, def)
148 | 	}
149 | 
150 | 	sort.Slice(dict, func(i, j int) bool {
151 | 		return dict[i].Title < dict[j].Title
152 | 	})
153 | 
154 | 	return dict, nil
155 | }
156 | 
157 | func unmarshalStrictFrontMatter(buf []byte, v interface{}) (content []byte, err error) {
158 | 	spl := bytes.SplitN(buf, []byte{'-', '-', '-'}, 3)
159 | 	for _, b := range spl[0] {
160 | 		if !unicode.IsSpace(rune(b)) {
161 | 			return buf, nil
162 | 		}
163 | 	}
164 | 	return spl[2], yaml.UnmarshalStrict(spl[1], v)
165 | }
166 | 


--------------------------------------------------------------------------------
/examples/gotdict-convert/main.go:
--------------------------------------------------------------------------------
 1 | // Command gotdict-convert converts GOTDict (https://github.com/wjdp/gotdict) to
 2 | // a dictgen dictfile.
 3 | package main
 4 | 
 5 | import (
 6 | 	"fmt"
 7 | 	"os"
 8 | 	"path/filepath"
 9 | 
10 | 	"github.com/spf13/pflag"
11 | 
12 | 	"github.com/pgaskin/dictutil/dictgen"
13 | 	"github.com/pgaskin/dictutil/examples/gotdict-convert/gotdict"
14 | )
15 | 
16 | var version = "dev"
17 | 
18 | func main() {
19 | 	pflag.CommandLine.SortFlags = false
20 | 	gotdictp := pflag.StringP("gotdict", "g", "."+string(os.PathSeparator)+"gotdict", "The path to the local copy of github.com/wjdp/gotdict.")
21 | 	output := pflag.StringP("output", "o", "."+string(os.PathSeparator)+"gotdict.df", "The output filename (will be overwritten if it exists) (- is stdout)")
22 | 	images := pflag.BoolP("images", "I", false, "Include images in the generated dictfile")
23 | 	help := pflag.BoolP("help", "h", false, "Show this help text")
24 | 	pflag.Parse()
25 | 
26 | 	if *help || pflag.NArg() != 0 {
27 | 		fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\nVersion: gotdict-convert %s\n\nOptions:\n%s\nTo convert the resulting dictfile into a dictzip, use dictgen.\n", os.Args[0], version, pflag.CommandLine.FlagUsages())
28 | 		os.Exit(0)
29 | 		return
30 | 	}
31 | 
32 | 	var img string
33 | 	if *images {
34 | 		fmt.Fprintf(os.Stderr, "Parsing gotdict (with images).\n")
35 | 		img = filepath.Join(*gotdictp, "images")
36 | 	} else {
37 | 		fmt.Fprintf(os.Stderr, "Parsing gotdict (no images).\n")
38 | 	}
39 | 
40 | 	gd, err := gotdict.Parse(filepath.Join(*gotdictp, "_definitions"), img, true)
41 | 	if err != nil {
42 | 		fmt.Fprintf(os.Stderr, "Error: parse gotdict: %v\n", err)
43 | 		os.Exit(1)
44 | 		return
45 | 	}
46 | 
47 | 	fmt.Fprintf(os.Stderr, "Transforming definitions.\n")
48 | 	var df dictgen.DictFile
49 | 	for _, d := range gd {
50 | 		var hwi string
51 | 		if d.Type != "" {
52 | 			hwi = "-" + string(d.Type)
53 | 		}
54 | 
55 | 		df = append(df, &dictgen.DictFileEntry{
56 | 			Headword:   d.Title,
57 | 			HeaderInfo: hwi,
58 | 			Variant:    d.Terms,
59 | 			Definition: d.Definition,
60 | 		})
61 | 	}
62 | 
63 | 	fmt.Fprintf(os.Stderr, "Writing dictfile.\n")
64 | 	switch *output {
65 | 	case "-":
66 | 		if err := df.WriteDictFile(os.Stdout); err != nil {
67 | 			fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
68 | 			os.Exit(1)
69 | 			return
70 | 		}
71 | 	default:
72 | 		f, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
73 | 		if err != nil {
74 | 			fmt.Fprintf(os.Stderr, "Error: create dictfile: %v\n", err)
75 | 			os.Exit(1)
76 | 			return
77 | 		}
78 | 
79 | 		if err := df.WriteDictFile(f); err != nil {
80 | 			f.Close()
81 | 			fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
82 | 			os.Exit(1)
83 | 			return
84 | 		}
85 | 
86 | 		if err := f.Close(); err != nil {
87 | 			fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
88 | 			os.Exit(1)
89 | 			return
90 | 		}
91 | 	}
92 | 
93 | 	fmt.Fprintf(os.Stderr, "Successfully converted %d entries from gotdict %s to dictfile %s.\n", len(df), *gotdictp, *output)
94 | 	os.Exit(0)
95 | }
96 | 


--------------------------------------------------------------------------------
/examples/webster1913-convert/main.go:
--------------------------------------------------------------------------------
  1 | // Command webster1913-convert converts Project Gutenberg's Webster's 1913
  2 | // Unabridged Dictionary to a dictgen dictfile.
  3 | package main
  4 | 
  5 | import (
  6 | 	"bytes"
  7 | 	"encoding/json"
  8 | 	"fmt"
  9 | 	"html/template"
 10 | 	"io"
 11 | 	"os"
 12 | 
 13 | 	"github.com/spf13/pflag"
 14 | 
 15 | 	"github.com/pgaskin/dictutil/dictgen"
 16 | 	"github.com/pgaskin/dictutil/examples/webster1913-convert/webster1913"
 17 | )
 18 | 
 19 | var version = "dev"
 20 | 
// deftmpl renders a parsed entry into the HTML stored as the dictfile
// definition. In order, it emits the etymology (italic paragraph), the
// meanings (ordered list, each with its optional example after two line
// breaks), the phrase definitions, the synonyms, and finally any unparsed
// extra text. Sections whose field is empty are omitted. It is an
// html/template, so the interpolated values are escaped by the template
// engine.
var deftmpl = template.Must(template.New("").Funcs(template.FuncMap{
	// spldc splits s in two at the first '.', ',' or '(' (the delimiter stays
	// at the start of the second part). If s contains none of them, the first
	// part is empty and the second is all of s. The template renders the first
	// part in bold and the second in a span.
	"spldc": func(s string) []string {
		for i, c := range s {
			if c == '.' || c == ',' || c == '(' {
				return []string{s[:i], s[i:]}
			}
		}
		return []string{"", s}
	},
}).Parse(`
	{{- with .Etymology}}<p><i>{{.}}</i></p>{{end -}}
	{{- with .Meanings}}<ol>{{range .}}<li>{{.Text}}{{with .Example}}<br/><br/>{{.}}{{end}}</li>{{end}}</ol>{{end -}}
	{{- with .PhraseDefns}}<p>{{range $n, $v := .}}{{if $n}} {{end}}{{range $x, $y := (spldc $v)}}{{if $x}}<span>{{$y}}</span>{{else}}<b>{{$y}}</b>{{end}}{{end}}{{end}}</p>{{end -}}
	{{- with .Synonyms}}<p>{{range $n, $v := .}}{{if $n}} {{end}}{{$v}}{{end}}</p>{{end -}}
	{{- with .Extra}}<p>{{.}}</p>{{end -}}
`))
 37 | 
 38 | func main() {
 39 | 	pflag.CommandLine.SortFlags = false
 40 | 	output := pflag.StringP("output", "o", "."+string(os.PathSeparator)+"webster1913.df", "The output filename (will be overwritten if it exists) (- is stdout)")
 41 | 	dump := pflag.Bool("dump", false, "Instead of converting, dump the parsed dictionary to stdout as JSON (for debugging)")
 42 | 	help := pflag.BoolP("help", "h", false, "Show this help text")
 43 | 	pflag.Parse()
 44 | 
 45 | 	if *help || pflag.NArg() != 1 {
 46 | 		fmt.Fprintf(os.Stderr, "Usage: %s [options] gutenberg_webster1913_path\n\nVersion: webster1913-convert %s\n\nOptions:\n%s\nArguments:\n  gutenberg_webster1913_path is the path to Project Gutenberg's Webster's 1913 dictionary. Use - to read from stdin.\n\nTo convert the resulting dictfile into a dictzip, use dictgen.\n", os.Args[0], version, pflag.CommandLine.FlagUsages())
 47 | 		os.Exit(0)
 48 | 		return
 49 | 	}
 50 | 
 51 | 	fmt.Fprintf(os.Stderr, "Opening input file.\n")
 52 | 	var r io.Reader
 53 | 	switch v := pflag.Args()[0]; v {
 54 | 	case "-":
 55 | 		r = os.Stdin
 56 | 	default:
 57 | 		f, err := os.Open(v)
 58 | 		if err != nil {
 59 | 			fmt.Fprintf(os.Stderr, "Error: open input %#v: %v\n", v, err)
 60 | 			os.Exit(1)
 61 | 			return
 62 | 		}
 63 | 		defer f.Close()
 64 | 		r = f
 65 | 	}
 66 | 
 67 | 	fmt.Fprintf(os.Stderr, "Parsing dictionary.\n")
 68 | 	wd, err := webster1913.Parse(r, func(i int, word string) {
 69 | 		if i%1000 == 0 {
 70 | 			fmt.Fprintf(os.Stderr, "[% 5d] %s\n", i, word)
 71 | 		}
 72 | 	})
 73 | 	if err != nil {
 74 | 		fmt.Fprintf(os.Stderr, "Error: parse webster1913: %v\n", err)
 75 | 		os.Exit(1)
 76 | 		return
 77 | 	}
 78 | 
 79 | 	if *dump {
 80 | 		fmt.Fprintf(os.Stderr, "Dumping JSON to stdout.\n")
 81 | 		enc := json.NewEncoder(os.Stdout)
 82 | 		enc.SetIndent("", "    ")
 83 | 		enc.Encode(wd)
 84 | 		os.Exit(0)
 85 | 		return
 86 | 	}
 87 | 
 88 | 	fmt.Fprintf(os.Stderr, "Transforming definitions.\n")
 89 | 	var df dictgen.DictFile
 90 | 	dbuf := bytes.NewBuffer(nil)
 91 | 	for _, d := range wd {
 92 | 		dbuf.Reset()
 93 | 		if err := deftmpl.Execute(dbuf, d); err != nil {
 94 | 			fmt.Fprintf(os.Stderr, "Error: render definition %#v: %v\n", d, err)
 95 | 			os.Exit(1)
 96 | 			return
 97 | 		}
 98 | 		df = append(df, &dictgen.DictFileEntry{
 99 | 			Headword:   d.Headword,
100 | 			Variant:    d.Variant,
101 | 			RawHTML:    true,
102 | 			HeaderInfo: d.Info,
103 | 			Definition: dbuf.String(),
104 | 		})
105 | 	}
106 | 
107 | 	fmt.Fprintf(os.Stderr, "Writing dictfile.\n")
108 | 	switch *output {
109 | 	case "-":
110 | 		if err := df.WriteDictFile(os.Stdout); err != nil {
111 | 			fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
112 | 			os.Exit(1)
113 | 			return
114 | 		}
115 | 	default:
116 | 		f, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
117 | 		if err != nil {
118 | 			fmt.Fprintf(os.Stderr, "Error: create dictfile: %v\n", err)
119 | 			os.Exit(1)
120 | 			return
121 | 		}
122 | 
123 | 		if err := df.WriteDictFile(f); err != nil {
124 | 			f.Close()
125 | 			fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
126 | 			os.Exit(1)
127 | 			return
128 | 		}
129 | 
130 | 		if err := f.Close(); err != nil {
131 | 			fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err)
132 | 			os.Exit(1)
133 | 			return
134 | 		}
135 | 	}
136 | 
137 | 	fmt.Fprintf(os.Stderr, "Successfully converted %d entries from Webster's 1913 dictionary %#v to dictfile %s.\n", len(df), pflag.Args()[0], *output)
138 | 	os.Exit(0)
139 | }
140 | 


--------------------------------------------------------------------------------
/examples/webster1913-convert/webster1913/parser.go:
--------------------------------------------------------------------------------
  1 | // Package webster1913 parses Project Gutenberg's Webster's 1913 Unabridged
  2 | // Dictionary (http://www.gutenberg.org/ebooks/29765.txt.utf-8).
  3 | package webster1913
  4 | 
  5 | import (
  6 | 	"bufio"
  7 | 	"bytes"
  8 | 	"io"
  9 | 	"regexp"
 10 | 	"runtime/debug"
 11 | 	"strings"
 12 | )
 13 | 
// Dict represents the parsed dictionary. Parse appends entries in the order
// they appear in the input.
type Dict []*Entry
 16 | 
// Entry is a single dictionary entry.
type Entry struct {
	// Headword is the lowercased, trimmed text before the first ';' on the
	// entry's heading line.
	Headword string
	// Variant holds the other ';'-separated words from the heading line, the
	// lowercased inflected forms found in the info text, and the lowercased
	// names of the entry's phrase definitions.
	Variant []string
	// Info is the text between the heading line and the first blank line,
	// without the etymology if one was split off.
	Info string
	// Etymology is the part of the info text after " Etym: ", if any.
	Etymology string
	// Meanings are the "Defn:" and numbered definitions.
	Meanings []*EntryMeaning
	// Synonyms are the "--" items listed under a "Syn." line.
	Synonyms []string
	// PhraseDefns are the "-- Phrase, ..." definitions with the leading "--"
	// removed.
	PhraseDefns []string
	Extra       string // unparseable text
}
 28 | 
// EntryMeaning is a meaning for a dictionary entry.
type EntryMeaning struct {
	// Text is the body of the definition, with hard-wrapped lines joined by
	// single spaces.
	Text string
	// Example is the text following the definition body (Parse switches to
	// it after a short line ending in a period); it is empty if there is
	// none.
	Example string
}
 34 | 
// Line classifiers used by Parse. All of them except wordInfoFormRe are
// matched against the raw (untrimmed) line.
var (
	// entryWordRe matches a heading line: only capitals, '_', ' ', ';', '-'.
	entryWordRe         = regexp.MustCompile(`^[A-Z_ ;-]+$`)
	// numberedDefnStartRe matches the "1. " prefix of a numbered definition.
	numberedDefnStartRe = regexp.MustCompile(`^[0-9]+\.\s*`)
	// singleDefnStartRe matches the "Defn: " prefix of a definition.
	singleDefnStartRe   = regexp.MustCompile(`^Defn:\s+`)
	// noteStartRe matches the start of a "Note: " line.
	noteStartRe         = regexp.MustCompile(`^\s*Note:\s+`)
	// synStartRe matches the line introducing a synonym list.
	// NOTE(review): the '.' is unescaped, so any character is accepted after
	// "Syn" — confirm whether that is intended.
	synStartRe          = regexp.MustCompile(`^Syn.\s*$`)
	// synItemStartRe matches the indented "-- " prefix of a synonym item. It
	// also matches every line phraseDefnStartRe matches.
	synItemStartRe      = regexp.MustCompile(`^\s+--\s+`)
	// phraseDefnStartRe matches the start of a phrase definition; group 1 is
	// the phrase, group 2 an optional parenthesized qualifier.
	phraseDefnStartRe   = regexp.MustCompile(`^\s+--\s+([A-Za-z ]+?[A-Za-z])\s*(\([^)]+\))?[,.]\s*`)
	// wordInfoFormRe finds inflected forms ("p. p.", "vb. n.", "p. pr.") in
	// an entry's info text; group 1 is the form.
	wordInfoFormRe      = regexp.MustCompile(`(?:p\. p\.|vb\. n\.|p\. pr\.) +([A-Z][a-z]+)[:;.,]`)
)
 45 | 
// state is the position of the line-based parser in Parse within the current
// entry. The zero value is StateNone.
type state int

const (
	// StateNone is before the first entry.
	StateNone state = iota
	// StateEntryInfo is at the beginning of the entry.
	StateEntryInfo
	// StateEntryExtra is unclassified text in the entry.
	StateEntryExtra
	// StateEntryMeaningText is inside an entry's meaning's text.
	StateEntryMeaningText
	// StateEntryMeaningExample is inside an entry's meaning's example.
	StateEntryMeaningExample
	// StateEntrySynonym is inside an entry's synonym list.
	StateEntrySynonym
	// StateEntryPhraseDefn is inside an entry's phrase definition list.
	StateEntryPhraseDefn
)
 64 | 
 65 | // Parse parses Project Gutenberg's Webster's Unabridged Dictionary.
 66 | func Parse(r io.Reader, progress func(i int, w string)) (Dict, error) {
 67 | 	var wd Dict
 68 | 	var perr error
 69 | 	sc := bufio.NewScanner(r)
 70 | 
 71 | 	var state state
 72 | 	var entry *Entry
 73 | 	var meaning *EntryMeaning
 74 | 	var i int
 75 | 	for sc.Scan() {
 76 | 		ln := sc.Bytes()
 77 | 		lnt := bytes.TrimSpace(ln)
 78 | 		blankLine := len(lnt) == 0
 79 | 
 80 | 		if bytes.HasPrefix(lnt, []byte("*** END")) {
 81 | 			break
 82 | 		}
 83 | 
 84 | 		if entryWordRe.Match(ln) {
 85 | 			if state == StateNone {
 86 | 				// skip the file header(up to the word "A")
 87 | 				if !bytes.Equal(lnt, []byte{'A'}) {
 88 | 					continue
 89 | 				}
 90 | 			}
 91 | 			if bytes.Count(lnt, []byte{'-'}) != len(lnt) {
 92 | 				// ^ if all dashes, it is a false positive
 93 | 				if entry != nil {
 94 | 					progress(len(wd), entry.Headword)
 95 | 				}
 96 | 				spl := strings.Split(string(bytes.ToLower(ln)), ";")
 97 | 				entry = &Entry{Headword: strings.TrimSpace(spl[0])}
 98 | 				if len(spl) > 1 {
 99 | 					for _, v := range spl[1:] {
100 | 						if w := strings.TrimSpace(v); w != "" {
101 | 							entry.Variant = append(entry.Variant, w)
102 | 						}
103 | 					}
104 | 				}
105 | 				meaning = nil
106 | 				wd = append(wd, entry)
107 | 				state = StateEntryInfo
108 | 				continue
109 | 			}
110 | 		}
111 | 
112 | 		switch state {
113 | 		case StateNone:
114 | 			// ignore any text before the first entry
115 | 		case StateEntryInfo:
116 | 			switch {
117 | 			case blankLine:
118 | 				for _, m := range wordInfoFormRe.FindAllStringSubmatch(entry.Info, -1) {
119 | 					entry.Variant = append(entry.Variant, strings.ToLower(m[1]))
120 | 				}
121 | 				// attempt to split into etymology
122 | 				if spl := strings.SplitN(entry.Info, " Etym: ", 2); len(spl) == 2 {
123 | 					entry.Info = strings.TrimSpace(spl[0])
124 | 					entry.Etymology = strings.TrimSpace(spl[1])
125 | 				}
126 | 				state = StateEntryExtra
127 | 			default:
128 | 				entry.Info += " " + string(lnt)
129 | 			}
130 | 		case StateEntryExtra:
131 | 			switch {
132 | 			case singleDefnStartRe.Match(ln):
133 | 				meaning = &EntryMeaning{Text: string(singleDefnStartRe.ReplaceAllLiteral(ln, nil))}
134 | 				entry.Meanings = append(entry.Meanings, meaning)
135 | 				state = StateEntryMeaningText
136 | 			case numberedDefnStartRe.Match(ln):
137 | 				meaning = &EntryMeaning{Text: string(numberedDefnStartRe.ReplaceAllLiteral(ln, nil))}
138 | 				entry.Meanings = append(entry.Meanings, meaning)
139 | 				state = StateEntryMeaningText
140 | 			case phraseDefnStartRe.Match(ln):
141 | 				meaning = nil
142 | 				entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1))))
143 | 				entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1])))
144 | 				state = StateEntryPhraseDefn
145 | 			case blankLine:
146 | 				// ignore
147 | 			default:
148 | 				entry.Extra += " " + string(lnt)
149 | 			}
150 | 		case StateEntryMeaningText:
151 | 			switch {
152 | 			case synStartRe.Match(ln):
153 | 				meaning = nil
154 | 				state = StateEntrySynonym
155 | 			case singleDefnStartRe.Match(ln):
156 | 				// if it is in any kind of definition (single/numbered), it is part of it.
157 | 				meaning.Text += " " + string(singleDefnStartRe.ReplaceAllLiteral(lnt, nil))
158 | 			case numberedDefnStartRe.Match(ln):
159 | 				meaning = &EntryMeaning{Text: string(numberedDefnStartRe.ReplaceAllLiteral(ln, nil))}
160 | 				entry.Meanings = append(entry.Meanings, meaning)
161 | 				state = StateEntryMeaningText
162 | 			case phraseDefnStartRe.Match(ln):
163 | 				meaning = nil
164 | 				entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1))))
165 | 				entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1])))
166 | 				state = StateEntryPhraseDefn
167 | 			case len(meaning.Text) > 5 && len(lnt) < 55 && bytes.HasSuffix(lnt, []byte{'.'}) && !noteStartRe.Match(ln):
168 | 				// if there is already some body text, it is not a hard-wrapped
169 | 				// line, and it ends with a period, and is not a note, then it's
170 | 				// the last line of the text before the example.
171 | 				meaning.Text += " " + string(lnt)
172 | 				state = StateEntryMeaningExample
173 | 			case blankLine:
174 | 				// ignore
175 | 			default:
176 | 				meaning.Text += " " + string(lnt)
177 | 			}
178 | 		case StateEntryMeaningExample:
179 | 			switch {
180 | 			case synStartRe.Match(ln):
181 | 				meaning = nil
182 | 				state = StateEntrySynonym
183 | 			case singleDefnStartRe.Match(ln):
184 | 				meaning = &EntryMeaning{Text: string(singleDefnStartRe.ReplaceAllLiteral(ln, nil))}
185 | 				entry.Meanings = append(entry.Meanings, meaning)
186 | 				state = StateEntryMeaningText
187 | 			case numberedDefnStartRe.Match(ln):
188 | 				meaning = &EntryMeaning{Text: string(numberedDefnStartRe.ReplaceAllLiteral(ln, nil))}
189 | 				entry.Meanings = append(entry.Meanings, meaning)
190 | 				state = StateEntryMeaningText
191 | 			case phraseDefnStartRe.Match(ln):
192 | 				meaning = nil
193 | 				entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1))))
194 | 				entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1])))
195 | 				state = StateEntryPhraseDefn
196 | 			case blankLine:
197 | 				// ignore
198 | 			default:
199 | 				if meaning.Example != "" {
200 | 					meaning.Example += " "
201 | 				}
202 | 				meaning.Example += string(lnt)
203 | 			}
204 | 		case StateEntrySynonym:
205 | 			switch {
206 | 			case blankLine:
207 | 				state = StateEntryExtra
208 | 			case synItemStartRe.Match(ln):
209 | 				entry.Synonyms = append(entry.Synonyms, string(synItemStartRe.ReplaceAllLiteral(ln, nil)))
210 | 			case len(entry.Synonyms) == 0:
211 | 				// there was a "Syn." without any valid synonyms under it
212 | 				state = StateEntryExtra
213 | 			case phraseDefnStartRe.Match(ln):
214 | 				meaning = nil
215 | 				entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1))))
216 | 				entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1])))
217 | 				state = StateEntryPhraseDefn
218 | 			default:
219 | 				entry.Synonyms[len(entry.Synonyms)-1] += " " + string(lnt)
220 | 			}
221 | 		case StateEntryPhraseDefn:
222 | 			switch {
223 | 			case phraseDefnStartRe.Match(ln):
224 | 				meaning = nil
225 | 				entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1))))
226 | 				entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1])))
227 | 				state = StateEntryPhraseDefn
228 | 			case blankLine:
229 | 				// allow a blank line to end it for reducing the chance of bugs.
230 | 				state = StateEntryExtra
231 | 			default:
232 | 				// phrase definitions are always last, so no need for checking
233 | 				// for any other state changes (e.g. the start of a numbered
234 | 				// definition) (and the previous case should deal with any
235 | 				// edge-cases).
236 | 				entry.PhraseDefns[len(entry.PhraseDefns)-1] += " " + string(lnt)
237 | 			}
238 | 		}
239 | 
240 | 		if i%10000 == 0 {
241 | 			debug.FreeOSMemory() // hack to try and limit memory usage
242 | 		}
243 | 		i++
244 | 	}
245 | 
246 | 	if serr := sc.Err(); serr != nil {
247 | 		return nil, serr
248 | 	}
249 | 	if perr != nil {
250 | 		return nil, perr
251 | 	}
252 | 	return wd, nil
253 | }
254 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/pgaskin/dictutil
 2 | 
 3 | go 1.14
 4 | 
 5 | require (
 6 | 	github.com/disintegration/imaging v1.6.2
 7 | 	github.com/mattn/go-sqlite3 v2.0.3+incompatible
 8 | 	github.com/pgaskin/koboutils/v2 v2.1.0
 9 | 	github.com/pmezard/go-difflib v1.0.0 // indirect
10 | 	github.com/russross/blackfriday/v2 v2.0.1
11 | 	github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect
12 | 	github.com/spf13/pflag v1.0.5
13 | 	gopkg.in/yaml.v2 v2.2.8
14 | )
15 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/disintegration/imaging v1.6.2 h1:w1LecBlG2Lnp8B3jk5zSuNqd7b4DXhcjwek1ei82L+c=
 2 | github.com/disintegration/imaging v1.6.2/go.mod h1:44/5580QXChDfwIclfc/PCwrr44amcmDAg8hxG0Ewe4=
 3 | github.com/mattn/go-sqlite3 v2.0.3+incompatible h1:gXHsfypPkaMZrKbD5209QV9jbUTJKjyR5WD3HYQSd+U=
 4 | github.com/mattn/go-sqlite3 v2.0.3+incompatible/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
 5 | github.com/pgaskin/koboutils/v2 v2.1.0 h1:J5KzLWvj0zDvoP5aJ7RxWuzFA32CcnD+hqH6tw/3uRE=
 6 | github.com/pgaskin/koboutils/v2 v2.1.0/go.mod h1:wTzkDIlsxmUyfwfspGcm0Ap+HOxSUYV0S8kMYrf+0gM=
 7 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 8 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 9 | github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
10 | github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
11 | github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
12 | github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
13 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
14 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
15 | golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 h1:hVwzHzIUGRjiF7EcUjqNxk3NCfkPxbDKRdnNE1Rpg0U=
16 | golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
17 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
18 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
19 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
20 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
21 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
22 | 


--------------------------------------------------------------------------------
/kobodict/crypt.go:
--------------------------------------------------------------------------------
  1 | package kobodict
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"crypto/aes"
  6 | 	"crypto/cipher"
  7 | 	"fmt"
  8 | )
  9 | 
// Crypter represents a symmetric dictionary encryption method. It combines
// the Encrypter and Decrypter interfaces, so a single value can be used for
// both directions.
type Crypter interface {
	Encrypter
	Decrypter
}
 15 | 
// CryptMethodAES represents AES-128-ECB encryption with PKCS#7 padding. It is
// the method name accepted by NewCrypter.
// NOTE(review): the key is passed to aes.NewCipher unchecked, so a 24 or 32
// byte key would presumably select AES-192/256 instead — confirm.
const CryptMethodAES string = "aes"
 18 | 
 19 | // NewCrypter creates the specified type of Crypter with the specified key.
 20 | func NewCrypter(method string, key []byte) (Crypter, error) {
 21 | 	switch method {
 22 | 	case CryptMethodAES:
 23 | 		c, err := newCryptAES(key)
 24 | 		return c, err
 25 | 	default:
 26 | 		return nil, fmt.Errorf("unknown encryption method %#v", method)
 27 | 	}
 28 | }
 29 | 
// cryptAES is the Crypter returned by NewCrypter for CryptMethodAES.
type cryptAES struct {
	b cipher.Block // AES block cipher created from the key by newCryptAES
}
 33 | 
 34 | func newCryptAES(key []byte) (*cryptAES, error) {
 35 | 	if b, err := aes.NewCipher(key); err != nil {
 36 | 		return nil, err
 37 | 	} else {
 38 | 		return &cryptAES{b}, nil
 39 | 	}
 40 | }
 41 | 
 42 | // Encrypt implements Encrypter.
 43 | func (c *cryptAES) Encrypt(buf []byte) ([]byte, error) {
 44 | 	if dst, err := cryptPKCS7Pad(buf, aes.BlockSize); err != nil {
 45 | 		return nil, err
 46 | 	} else if dst, err = cryptAES128ECBEncrypt(c.b, dst); err != nil {
 47 | 		return nil, err
 48 | 	} else {
 49 | 		return dst, nil
 50 | 	}
 51 | }
 52 | 
 53 | // Decrypt implements Decrypter.
 54 | func (c *cryptAES) Decrypt(buf []byte) ([]byte, error) {
 55 | 	if dst, err := cryptAES128ECBDecrypt(c.b, buf); err != nil {
 56 | 		return nil, err
 57 | 	} else if dst, err := cryptPKCS7Unpad(dst, aes.BlockSize); err != nil {
 58 | 		return nil, err
 59 | 	} else {
 60 | 		return dst, nil
 61 | 	}
 62 | }
 63 | 
 64 | func cryptPKCS7Unpad(src []byte, blockSize int) ([]byte, error) {
 65 | 	if blockSize > 0xFF || blockSize < 0x00 {
 66 | 		return nil, fmt.Errorf("block size %d out of bounds", blockSize)
 67 | 	} else if len(src)%blockSize != 0 || len(src) == 0 {
 68 | 		return nil, fmt.Errorf("data length %d is empty or not a multiple of block size %d", len(src), blockSize)
 69 | 	}
 70 | 	plen := int(src[len(src)-1])
 71 | 	if len(src) <= plen {
 72 | 		return nil, fmt.Errorf("invalid padding: padding length %d out of bounds", plen)
 73 | 	}
 74 | 	for _, v := range src[len(src)-plen:] {
 75 | 		if int(v) != plen {
 76 | 			return nil, fmt.Errorf("invalid padding: expected %d, got %d", plen, v)
 77 | 		}
 78 | 	}
 79 | 	return src[:len(src)-plen], nil
 80 | }
 81 | 
 82 | func cryptPKCS7Pad(src []byte, blockSize int) ([]byte, error) {
 83 | 	if blockSize > 0xFF || blockSize < 0x00 {
 84 | 		return nil, fmt.Errorf("block size %d out of bounds", blockSize)
 85 | 	}
 86 | 	plen := blockSize - len(src)%blockSize
 87 | 	return append(src, bytes.Repeat([]byte{byte(plen)}, plen)...), nil
 88 | }
 89 | 
 90 | func cryptAES128ECBDecrypt(cb cipher.Block, src []byte) ([]byte, error) {
 91 | 	if len(src)%aes.BlockSize != 0 {
 92 | 		return nil, fmt.Errorf("src not a multiple of block size %d", aes.BlockSize)
 93 | 	}
 94 | 	dst := make([]byte, len(src))
 95 | 	for i := aes.BlockSize; i <= len(src); i += aes.BlockSize {
 96 | 		cb.Decrypt(dst[i-aes.BlockSize:i], src[i-aes.BlockSize:i])
 97 | 	}
 98 | 	return dst, nil
 99 | }
100 | 
101 | func cryptAES128ECBEncrypt(cb cipher.Block, src []byte) ([]byte, error) {
102 | 	if len(src)%aes.BlockSize != 0 {
103 | 		return nil, fmt.Errorf("src not a multiple of block size %d", aes.BlockSize)
104 | 	}
105 | 	dst := make([]byte, len(src))
106 | 	for i := aes.BlockSize; i <= len(src); i += aes.BlockSize {
107 | 		cb.Encrypt(dst[i-aes.BlockSize:i], src[i-aes.BlockSize:i])
108 | 	}
109 | 	return dst, nil
110 | }
111 | 


--------------------------------------------------------------------------------
/kobodict/crypt_test.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 | 
3 | // TODO(v1)
4 | 


--------------------------------------------------------------------------------
/kobodict/fs.go:
--------------------------------------------------------------------------------
  1 | package kobodict
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"io/ioutil"
  8 | 	"os"
  9 | 	"path/filepath"
 10 | 	"strings"
 11 | 	"unicode/utf8"
 12 | )
 13 | 
 14 | // Unpack is a helper function to unpack the contents of a Reader to a folder
 15 | // on-disk. The provided dir must be non-existent. Unpack will not close the
 16 | // reader.
 17 | func Unpack(r *Reader, dir string) error {
 18 | 	if _, err := os.Stat(dir); !os.IsNotExist(err) {
 19 | 		return fmt.Errorf("dir %#v already exists", dir)
 20 | 	}
 21 | 	if err := os.Mkdir(dir, 0755); err != nil {
 22 | 		return fmt.Errorf("create dir %#v: %w", dir, err)
 23 | 	}
 24 | 	for _, f := range r.File {
 25 | 		if err := unpackFile(dir, f.Open, f.Name); err != nil {
 26 | 			return fmt.Errorf("unpack file %#v: %w", f.Name, err)
 27 | 		}
 28 | 	}
 29 | 	for _, f := range r.Dicthtml {
 30 | 		if err := unpackFile(dir, f.Open, f.Name); err != nil {
 31 | 			return fmt.Errorf("unpack dicthtml %#v (prefix: %s): %w", f.Name, f.Prefix, err)
 32 | 		}
 33 | 	}
 34 | 	if err := ioutil.WriteFile(filepath.Join(dir, "words"), []byte(strings.Join(r.Word, "\n")), 0644); err != nil {
 35 | 		return fmt.Errorf("write words file: %w", err)
 36 | 	}
 37 | 	return nil
 38 | }
 39 | 
 40 | func unpackFile(dir string, open func() (io.ReadCloser, error), name string) error {
 41 | 	fr, err := open()
 42 | 	if err != nil {
 43 | 		return fmt.Errorf("read contents: %w", err)
 44 | 	}
 45 | 	defer fr.Close()
 46 | 
 47 | 	fw, err := os.OpenFile(filepath.Join(dir, name), os.O_CREATE|os.O_WRONLY|os.O_EXCL, 0644)
 48 | 	if err != nil {
 49 | 		return fmt.Errorf("create output file: %w", err)
 50 | 	}
 51 | 	defer fw.Close()
 52 | 
 53 | 	if _, err := io.Copy(fw, fr); err != nil {
 54 | 		return fmt.Errorf("write output file: %w", err)
 55 | 	}
 56 | 
 57 | 	if err := fw.Close(); err != nil {
 58 | 		return fmt.Errorf("write output file: %w", err)
 59 | 	}
 60 | 
 61 | 	return nil
 62 | }
 63 | 
 64 | // Pack is a helper function to pack the contents a folder unpacked using Unpack
 65 | // into a Writer. It is assumed that the writer has not been used. The provided
 66 | // file will be overwritten if it exists and is a regular file, or created if it
 67 | // doesn't exist. Pack will not close the writer.
 68 | func Pack(w *Writer, dir string) error {
 69 | 	if fi, err := os.Stat(filepath.Join(dir, "words")); os.IsNotExist(err) || (err == nil && fi.IsDir()) {
 70 | 		return fmt.Errorf("dir %#v is not an unpacked dictzip (no words file)", dir)
 71 | 	}
 72 | 
 73 | 	fis, err := ioutil.ReadDir(dir) // note: this is sorted
 74 | 	if err != nil {
 75 | 		return fmt.Errorf("read dir %#v: %w", dir, err)
 76 | 	}
 77 | 
 78 | 	for _, fi := range fis {
 79 | 		switch {
 80 | 		case fi.IsDir():
 81 | 			return fmt.Errorf("invalid dir %#v: dirs are not supported", fi.Name())
 82 | 		case fi.Name() == "words":
 83 | 			continue
 84 | 		case strings.HasSuffix(fi.Name(), ".html"):
 85 | 			if err := func() error {
 86 | 				fr, err := os.OpenFile(filepath.Join(dir, fi.Name()), os.O_RDONLY, 0)
 87 | 				if err != nil {
 88 | 					return fmt.Errorf("open file: %w", err)
 89 | 				}
 90 | 				defer fr.Close()
 91 | 
 92 | 				tmp := make([]byte, 2)
 93 | 				if _, err := fr.Read(tmp); err != nil {
 94 | 					return fmt.Errorf("read file: %w", err)
 95 | 				} else if tmp[0] == 0x1F && tmp[1] == 0x8B {
 96 | 					return fmt.Errorf("invalid unpacked dicthtml file: already compressed")
 97 | 				} else if _, err := fr.Seek(0, os.SEEK_SET); err != nil {
 98 | 					return fmt.Errorf("read file: %w", err)
 99 | 				}
100 | 
101 | 				fw, err := w.CreateDicthtml(strings.TrimSuffix(fi.Name(), ".html"))
102 | 				if err != nil {
103 | 					return fmt.Errorf("create dictzip entry: %w", err)
104 | 				}
105 | 
106 | 				if _, err := io.Copy(fw, fr); err != nil {
107 | 					return fmt.Errorf("write file: %w", err)
108 | 				}
109 | 
110 | 				return nil
111 | 			}(); err != nil {
112 | 				return fmt.Errorf("add dicthtml %#v: %w", fi.Name(), err)
113 | 			}
114 | 		default:
115 | 			if err := func() error {
116 | 				fr, err := os.OpenFile(filepath.Join(dir, fi.Name()), os.O_RDONLY, 0)
117 | 				if err != nil {
118 | 					return fmt.Errorf("open file: %w", err)
119 | 				}
120 | 				defer fr.Close()
121 | 
122 | 				fw, err := w.CreateFile(strings.TrimSuffix(fi.Name(), ".html"))
123 | 				if err != nil {
124 | 					return fmt.Errorf("create dictzip entry: %w", err)
125 | 				}
126 | 
127 | 				if _, err := io.Copy(fw, fr); err != nil {
128 | 					return fmt.Errorf("write file: %w", err)
129 | 				}
130 | 
131 | 				return nil
132 | 			}(); err != nil {
133 | 				return fmt.Errorf("add file %#v: %w", fi.Name(), err)
134 | 			}
135 | 		}
136 | 	}
137 | 
138 | 	if err := func() error {
139 | 		fr, err := os.OpenFile(filepath.Join(dir, "words"), os.O_RDONLY, 0)
140 | 		if err != nil {
141 | 			return fmt.Errorf("open words file: %w", err)
142 | 		}
143 | 		defer fr.Close()
144 | 
145 | 		sc := bufio.NewScanner(fr)
146 | 		for sc.Scan() {
147 | 			if !utf8.Valid(sc.Bytes()) {
148 | 				return fmt.Errorf("invalid word: %#v", sc.Text())
149 | 			}
150 | 			if word := strings.TrimSpace(sc.Text()); len(word) != 0 {
151 | 				if err := w.AddWord(word); err != nil {
152 | 					return fmt.Errorf("add word %#v: %s", word, err)
153 | 				}
154 | 			}
155 | 		}
156 | 		if sc.Err() != nil {
157 | 			return fmt.Errorf("read words file: %w", err)
158 | 		}
159 | 
160 | 		return nil
161 | 	}(); err != nil {
162 | 		return fmt.Errorf("add words index: %w", err)
163 | 	}
164 | 
165 | 	return nil
166 | }
167 | 


--------------------------------------------------------------------------------
/kobodict/fs_test.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 | 
3 | // TODO(v1)
4 | 


--------------------------------------------------------------------------------
/kobodict/marisa.go:
--------------------------------------------------------------------------------
 1 | package kobodict
 2 | 
 3 | import "io"
 4 | 
// Marisa is used by Reader and Writer for reading/writing Marisa tries. It is
// automatically set on supported platforms if
// github.com/pgaskin/dictutil/kobodict/marisa is imported, but can be
// overridden manually. It is nil until one of those happens, in which case
// NewReader fails with an error instead of reading the words index.
var Marisa interface {
	MarisaReader
	MarisaWriter
}
13 | 
// MarisaReader represents a simplified abstraction for reading Marisa tries.
type MarisaReader interface {
	// ReadAll reads a trie from the reader and returns the words in it.
	ReadAll(io.Reader) ([]string, error)
}
18 | 
// MarisaWriter represents a simplified abstraction for writing Marisa tries.
type MarisaWriter interface {
	// WriteAll writes a trie containing the provided words to the writer.
	WriteAll(io.Writer, []string) error
}
23 | 


--------------------------------------------------------------------------------
/kobodict/marisa/marisa.go:
--------------------------------------------------------------------------------
 1 | // Package marisa is imported with _ to enable marisa for the kobodict, if
 2 | // supported. It is in a separate package so functions in kobodict which don't
 3 | // require marisa can be used without compiling it. As an alternative to
 4 | // importing this package, you can provide your own implementation of marisa in
 5 | // kobodict.Marisa. If imported, this package will fail to compile unless marisa
 6 | // is available for your GOOS/GOARCH.
 7 | package marisa
 8 | 
 9 | import "github.com/pgaskin/dictutil/kobodict"
10 | 
// platform is the Marisa implementation assigned to kobodict.Marisa. Its
// methods are defined in build-constrained files (e.g. marisa_cgo.go for cgo
// builds).
//
// This is done so it can still be instantiated even if not implemented for the
// current platform (it will be caught when assigning it to kobodict.Marisa),
// named platform for better error messages.
type platform struct{}
16 | 
17 | func init() {
18 | 	kobodict.Marisa = new(platform) // platform-specific implementation
19 | }
20 | 


--------------------------------------------------------------------------------
/kobodict/marisa/marisa_cgo.go:
--------------------------------------------------------------------------------
 1 | //+build cgo
 2 | 
 3 | package marisa
 4 | 
 5 | import (
 6 | 	"io"
 7 | 
 8 | 	"github.com/pgaskin/dictutil/marisa"
 9 | )
10 | 
11 | func (*platform) ReadAll(r io.Reader) (wd []string, err error) {
12 | 	return marisa.ReadAll(r)
13 | }
14 | 
15 | func (*platform) WriteAll(w io.Writer, wd []string) (err error) {
16 | 	return marisa.WriteAll(w, wd)
17 | }
18 | 


--------------------------------------------------------------------------------
/kobodict/marisa/marisa_test.go:
--------------------------------------------------------------------------------
 1 | package marisa
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"crypto/sha1"
 6 | 	"encoding/hex"
 7 | 	"io"
 8 | 	"reflect"
 9 | 	"runtime"
10 | 	"testing"
11 | 
12 | 	"github.com/pgaskin/dictutil/kobodict"
13 | )
14 | 
15 | func TestMarisa(t *testing.T) {
16 | 	impl, ok := (interface{})(new(platform)).(interface {
17 | 		kobodict.MarisaReader
18 | 		kobodict.MarisaWriter
19 | 	})
20 | 	if !ok {
21 | 		t.Skipf("warning: Marisa not supported on platform GOOS=%s GOARCH=%s and must be provided externally", runtime.GOOS, runtime.GOARCH)
22 | 	}
23 | 
24 | 	w := []string{
25 | 		"asd",
26 | 		"dfg",
27 | 		"sdf",
28 | 	}
29 | 
30 | 	buf := bytes.NewBuffer(nil)
31 | 	if err := impl.WriteAll(buf, w); err != nil {
32 | 		t.Fatalf("unexpected error when writing trie: %v", err)
33 | 	} else if buf.Len() == 0 {
34 | 		t.Errorf("written trie is empty")
35 | 	}
36 | 
37 | 	ss := sha1.New()
38 | 
39 | 	nw, err := impl.ReadAll(io.TeeReader(buf, ss))
40 | 	if err != nil {
41 | 		t.Fatalf("unexpected error when reading written trie: %v", err)
42 | 	} else if len(nw) == 0 {
43 | 		t.Errorf("read trie is empty")
44 | 	} else if !reflect.DeepEqual(nw, w) {
45 | 		t.Errorf("read tree: expected %+s, got %+s", w, nw)
46 | 	}
47 | 
48 | 	if runtime.GOARCH == "amd64" {
49 | 		if x, y := hex.EncodeToString(ss.Sum(nil)), "ea7252fc4e86585dea884e4bcb5ce7be90676474"; x != y {
50 | 			t.Errorf("trie output is incorrect or non-determinstic, expected sha1 %s, got %s", y, x)
51 | 		}
52 | 	} else {
53 | 		t.Logf("skipping sha1 check on non-amd64 architecture, as the correct file differs slightly on each one (usually by ~4 bytes)")
54 | 	}
55 | }
56 | 


--------------------------------------------------------------------------------
/kobodict/reader.go:
--------------------------------------------------------------------------------
  1 | package kobodict
  2 | 
  3 | import (
  4 | 	"archive/zip"
  5 | 	"bytes"
  6 | 	"compress/gzip"
  7 | 	"fmt"
  8 | 	"io"
  9 | 	"io/ioutil"
 10 | 	"strings"
 11 | )
 12 | 
// Reader provides access to the contents of a dictzip file. It is created with
// NewReader, which reads the word index and sorts the remaining zip entries
// into dicthtml files and raw files.
type Reader struct {
	// Word is the list of words read from the "words" index entry.
	Word     []string
	// Dicthtml lists the regular files in the zip with a ".html" suffix.
	Dicthtml []*ReaderDicthtml
	// File lists the other regular files in the zip (the "words" index is not
	// included).
	File     []*ReaderFile
	// z is the underlying zip archive.
	z        *zip.Reader
	// d is the decrypter assigned by SetDecrypter; NewReader leaves it nil.
	d        Decrypter
}
 21 | 
// ReaderDicthtml represents a dicthtml file from a Reader. Use Open to read the
// decoded contents.
type ReaderDicthtml struct {
	// Name is the name of the zip entry (e.g. "aa.html").
	Name   string
	// Prefix is Name with the ".html" suffix removed.
	Prefix string
	// f is the zip entry backing this file.
	f      *zip.File
	// r is the Reader this file belongs to (its decrypter is used by Open).
	r      *Reader
}
 29 | 
// ReaderFile represents a raw file from a Reader (e.g. images). Use Open to
// read its contents as stored in the zip.
type ReaderFile struct {
	// Name is the name of the zip entry.
	Name string
	// f is the zip entry backing this file.
	f    *zip.File
	// r is the Reader this file belongs to.
	r    *Reader
}
 36 | 
// Decrypter decrypts dicthtml files. It is attached to a Reader with
// SetDecrypter.
type Decrypter interface {
	// Decrypt decrypts the dicthtml bytes. It will only be called if the
	// dicthtml is not otherwise readable (i.e. it does not start with the gzip
	// magic bytes). An error should be returned if the decryption itself
	// encounters an error; the decrypter should not try to judge if the
	// resulting bytes are valid.
	Decrypt([]byte) ([]byte, error)
}
 45 | 
 46 | // NewReader returns a new dictzip reader which reads from r, with the given
 47 | // file size.
 48 | func NewReader(r io.ReaderAt, size int64) (*Reader, error) {
 49 | 	zr, err := zip.NewReader(r, size)
 50 | 	if err != nil {
 51 | 		return nil, fmt.Errorf("open zip: %w", err)
 52 | 	}
 53 | 
 54 | 	kr := &Reader{
 55 | 		z: zr,
 56 | 	}
 57 | 
 58 | 	var found bool
 59 | 	for _, zf := range zr.File {
 60 | 		if zf.Name == "words" {
 61 | 			if fr, err := zf.Open(); err != nil {
 62 | 				return nil, fmt.Errorf("open words index: %w", err)
 63 | 			} else if Marisa == nil {
 64 | 				return nil, fmt.Errorf("no marisa bindings found")
 65 | 			} else if kr.Word, err = Marisa.ReadAll(fr); err != nil {
 66 | 				return nil, fmt.Errorf("read words index: %w", err)
 67 | 			}
 68 | 			found = true
 69 | 			break
 70 | 		}
 71 | 	}
 72 | 	if !found {
 73 | 		return nil, fmt.Errorf("not a dictzip: no words index found")
 74 | 	}
 75 | 
 76 | 	for _, f := range zr.File {
 77 | 		switch {
 78 | 		case !f.Mode().IsRegular():
 79 | 			continue
 80 | 		case f.Name == "words":
 81 | 			continue
 82 | 		case strings.Contains(f.Name, "/"):
 83 | 			return nil, fmt.Errorf("read zip: illegal file %#v: contains slash (not in root dir)", f.Name)
 84 | 		case strings.HasSuffix(f.Name, ".html"):
 85 | 			kr.Dicthtml = append(kr.Dicthtml, &ReaderDicthtml{
 86 | 				Name:   f.Name,
 87 | 				Prefix: strings.TrimSuffix(f.Name, ".html"),
 88 | 				f:      f,
 89 | 				r:      kr,
 90 | 			})
 91 | 		default:
 92 | 			kr.File = append(kr.File, &ReaderFile{
 93 | 				Name: f.Name,
 94 | 				f:    f,
 95 | 				r:    kr,
 96 | 			})
 97 | 		}
 98 | 	}
 99 | 
100 | 	return kr, nil
101 | }
102 | 
// SetDecrypter sets the Decrypter used to decrypt encrypted dicthtml files. It
// is consulted by ReaderDicthtml.Open when a dicthtml file does not start with
// the gzip magic bytes; if it is nil, opening such a file returns an error.
func (r *Reader) SetDecrypter(d Decrypter) {
	r.d = d
}
107 | 
108 | // Open returns an io.ReadCloser which reads the decoded dicthtml file. Multiple
109 | // files can be read at once.
110 | func (f *ReaderDicthtml) Open() (io.ReadCloser, error) {
111 | 	enc, err := func() (bool, error) {
112 | 		fr, err := f.f.Open()
113 | 		if err != nil {
114 | 			return false, fmt.Errorf("open zip entry: %v", err)
115 | 		}
116 | 		defer fr.Close()
117 | 
118 | 		tmp := make([]byte, 2)
119 | 		if n, err := fr.Read(tmp); err != nil {
120 | 			return false, fmt.Errorf("read zip entry: %v", err)
121 | 		} else if n != len(tmp) {
122 | 			return false, fmt.Errorf("corrupt dicthtml: too short (%d)", n)
123 | 		}
124 | 
125 | 		if tmp[0] == 0x1F && tmp[1] == 0x8B {
126 | 			return false, nil
127 | 		}
128 | 
129 | 		if f.r.d == nil {
130 | 			return true, fmt.Errorf("corrupt or encrypted dicthtml: invalid header")
131 | 		}
132 | 
133 | 		// maybe optimize this later?
134 | 		if buf, err := ioutil.ReadAll(io.MultiReader(bytes.NewReader(tmp), fr)); err != nil {
135 | 			return true, fmt.Errorf("read zip entry: %v", err)
136 | 		} else if dec, err := f.r.d.Decrypt(buf); err != nil {
137 | 			return true, fmt.Errorf("decrypt dicthtml: %v", err)
138 | 		} else if dec[0] != 0x1F || dec[1] != 0x8B {
139 | 			return true, fmt.Errorf("corrupt dicthtml or invalid encryption key: invalid header")
140 | 		}
141 | 		return true, nil
142 | 	}()
143 | 	if err != nil {
144 | 		return nil, err
145 | 	}
146 | 
147 | 	fr, err := f.f.Open()
148 | 	if err != nil {
149 | 		return nil, fmt.Errorf("open zip entry: %v", err)
150 | 	}
151 | 
152 | 	var dr io.Reader
153 | 	if enc {
154 | 		if buf, err := ioutil.ReadAll(fr); err != nil {
155 | 			return nil, fmt.Errorf("read zip entry: %v", err)
156 | 		} else if dec, err := f.r.d.Decrypt(buf); err != nil {
157 | 			return nil, fmt.Errorf("decrypt dicthtml: %v", err)
158 | 		} else if dec[0] != 0x1F || dec[1] != 0x8B {
159 | 			return nil, fmt.Errorf("corrupt dicthtml or invalid encryption key: invalid header")
160 | 		} else {
161 | 			dr = bytes.NewReader(dec)
162 | 		}
163 | 	} else {
164 | 		dr = fr
165 | 	}
166 | 
167 | 	zr, err := gzip.NewReader(dr)
168 | 	if err != nil {
169 | 		return nil, fmt.Errorf("decompress dicthtml: %v", err)
170 | 	}
171 | 
172 | 	return &funcReadCloser{
173 | 		Reader: zr,
174 | 		Closer: func() error {
175 | 			if err := zr.Close(); err != nil {
176 | 				fr.Close()
177 | 				return err
178 | 			}
179 | 			return fr.Close()
180 | 		},
181 | 	}, nil
182 | }
183 | 
// Open returns an io.ReadCloser which reads the contents of the file exactly as
// stored in the zip entry (no decryption or decompression beyond the zip's own
// is applied). Multiple files can be read at once.
func (f *ReaderFile) Open() (io.ReadCloser, error) {
	return f.f.Open()
}
189 | 
// funcReadCloser turns an io.Reader into an io.ReadCloser by pairing it with an
// optional close callback.
type funcReadCloser struct {
	io.Reader
	Closer func() error
}

// Close invokes the close callback and returns its result. Without a callback,
// it does nothing and reports success.
func (f *funcReadCloser) Close() error {
	if f.Closer == nil {
		return nil
	}
	return f.Closer()
}
201 | 


--------------------------------------------------------------------------------
/kobodict/reader_test.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 | 
3 | // TODO(v1)
4 | 


--------------------------------------------------------------------------------
/kobodict/util.go:
--------------------------------------------------------------------------------
  1 | // Package kobodict implements reading, writing, and other utilities for Kobo
  2 | // dictionaries (v2).
  3 | //
// If Writer or Reader is used, a marisa implementation must be provided,
// either by github.com/pgaskin/dictutil/kobodict/marisa or by a custom one.
//
  7 | package kobodict
  8 | 
  9 | import (
 10 | 	"strings"
 11 | 	"unicode"
 12 | )
 13 | 
 14 | // NormalizeWordReference normalizes a word for use in an dicthtml headword
 15 | // (<a name="...") or variant (<variant name="..."). It matches the way Kobo
 16 | // finds words in a file.
 17 | //
 18 | // The logic is reversed from DictionaryParser::htmlForWord in libnickel.
 19 | //
 20 | // Note: Headwords are prefix-matched against the query, the uppercased query,
 21 | // the lowercased query, and the lowercased query with the first letter
 22 | // uppercased. Variants are only prefix-matched against the lowercased query.
 23 | //
 24 | // Note: The matching is only done in the file matching the prefix for the query.
 25 | func NormalizeWordReference(w string, variant bool) string {
 26 | 	if variant {
 27 | 		// variants must always be lowercase (the match is only checked against
 28 | 		// the lowercased query)
 29 | 		w = strings.ToLower(w)
 30 | 	}
 31 | 
 32 | 	// trim leading and trailing whitespace
 33 | 	return strings.TrimSpace(w)
 34 | }
 35 | 
 36 | // WordPrefix gets the prefix of a word for sharding dicthtml files.
 37 | //
 38 | // This is not to be used with Kanji, as those are handled by a separate
 39 | // function for Japanese dictionaries.
 40 | //
 41 | // WordPrefix is a simplification of the logic reversed from
 42 | // DictionaryParser::htmlForWord (see wordPrefix), but with performance and
 43 | // cleaner code. It is should have the exact same results.
 44 | func WordPrefix(word string) string {
 45 | 	pfx := []rune(word)
 46 | 
 47 | 	for i, c := range pfx {
 48 | 		if i >= 2 || c == '\x00' { // limit to 2 chars, also cut at null
 49 | 			pfx = pfx[:i] // trim up to current char
 50 | 			break
 51 | 		}
 52 | 		pfx[i] = unicode.ToLower(c) // this includes accented chars
 53 | 	}
 54 | 
 55 | 	for len(pfx) != 0 {
 56 | 		if unicode.IsSpace(pfx[0]) {
 57 | 			pfx = pfx[1:] // trim left space
 58 | 		} else {
 59 | 			break
 60 | 		}
 61 | 	}
 62 | 
 63 | 	for len(pfx) != 0 {
 64 | 		if unicode.IsSpace(pfx[len(pfx)-1]) {
 65 | 			pfx = pfx[:len(pfx)-1] // trim right space
 66 | 		} else {
 67 | 			break
 68 | 		}
 69 | 	}
 70 | 
 71 | 	if len(pfx) == 0 {
 72 | 		return "11" // if empty, return "11"
 73 | 	}
 74 | 
 75 | 	if !unicode.Is(unicode.Cyrillic, pfx[0]) {
 76 | 		for len(pfx) < 2 {
 77 | 			pfx = append(pfx, 'a') // pad right with 'a's to 2 chars
 78 | 		}
 79 | 		if !unicode.IsLetter(pfx[0]) || !unicode.IsLetter(pfx[1]) {
 80 | 			return "11" // if neither of the first 2 chars are letters, return "11"
 81 | 		}
 82 | 	}
 83 | 
 84 | 	return string(pfx)
 85 | }
 86 | 
 87 | // wordPrefix gets the prefix of a word for sharding dicthtml files.
 88 | //
 89 | // This is not to be used with Kanji, as those are handled by a separate
 90 | // function for Japanese dictionaries.
 91 | //
 92 | // The logic is reversed from DictionaryParser::htmlForWord in libnickel. It
 93 | // matches it as closely as possible.
 94 | func wordPrefix(w string) string {
 95 | 	// w
 96 | 	// QString::toLower()
 97 | 	w = strings.ToLower(w)
 98 | 
 99 | 	// QString::leftRef(2)
100 | 	if len(w) > 2 {
101 | 		w = string([]rune(w)[:2])
102 | 	}
103 | 
104 | 	// QString::trimmed()
105 | 	w = strings.TrimSpace(w)
106 | 
107 | 	// simplify the following code by converting to rune slice
108 | 	r := []rune(w)
109 | 
110 | 	// A null byte is a valid Unicode character, but in C, it's treated as
111 | 	// the end of a string. To keep compatibility with libnickel, we need to
112 | 	// end a string there if necessary.
113 | 	for i, c := range r {
114 | 		if c == '\x00' {
115 | 			r = r[:i]
116 | 			break
117 | 		}
118 | 	}
119 | 
120 | 	// DictionaryParser::isCyrillic(w[0])
121 | 	// skip if true
122 | 	if !(len(r) != 0 && unicode.Is(unicode.Cyrillic, r[0])) {
123 | 		// add an 'a' for right padding if not 2 chars
124 | 		if len(r) != 2 {
125 | 			r = append(r, 'a')
126 | 		}
127 | 	}
128 | 
129 | 	// DictionaryParser::isCyrillic(w[0])
130 | 	// skip if != false
131 | 	switch {
132 | 	case !(len(r) != 0 && unicode.Is(unicode.Cyrillic, r[0])):
133 | 		// inlined QChar::isLetter(w[0]), QChar::isLetter(w[1]), unnecessary length check
134 | 		// skip if both true
135 | 		if (len(r) >= 1 && unicode.IsLetter(r[0])) && (len(r) >= 2 && unicode.IsLetter(r[1])) {
136 | 			break
137 | 		}
138 | 		fallthrough
139 | 	case len(r) == 0:
140 | 		// w = QString::fromLatin1_helper("11"..., 2)
141 | 		return "11"
142 | 	}
143 | 
144 | 	return string(r)
145 | }
146 | 


--------------------------------------------------------------------------------
/kobodict/util_test.go:
--------------------------------------------------------------------------------
  1 | package kobodict
  2 | 
  3 | import (
  4 | 	"strconv"
  5 | 	"testing"
  6 | )
  7 | 
  8 | func TestNormalizeWordReference(t *testing.T) {
  9 | 	for _, tc := range []struct {
 10 | 		v    bool
 11 | 		i, o string
 12 | 	}{
 13 | 		{true, "Asd", "asd"},
 14 | 		{false, "Asd", "Asd"},
 15 | 		{true, " Asd", "asd"},
 16 | 		{false, " Asd", "Asd"},
 17 | 		{true, " Asd ", "asd"},
 18 | 		{false, " Asd ", "Asd"},
 19 | 		{true, " Asd \n", "asd"},
 20 | 		{false, " Asd \n", "Asd"},
 21 | 		{true, " Ȃsd \n", "ȃsd"},
 22 | 		{false, " Ȃsd \n", "Ȃsd"},
 23 | 	} {
 24 | 		t.Logf("word %#v [variant:%t] (%#v)", tc.i, tc.v, tc.o)
 25 | 		if o := NormalizeWordReference(tc.i, tc.v); o != tc.o {
 26 | 			t.Errorf("    got %#v", o)
 27 | 		}
 28 | 	}
 29 | }
 30 | 
// tcs is the table of prefix test cases shared by TestWordPrefix and
// BenchmarkWordPrefix: w is the input word, and p is the prefix both wordPrefix
// and WordPrefix are expected to return for it.
var tcs = []struct{ w, p string }{
	// dicthtml-en
	{"test", "te"},
	{"a-", "11"},
	{"-an", "11"},
	{"GB", "gb"},

	// dicthtml-fr
	{"ébahir", "éb"},
	{"à", "àa"},
	{"a1", "11"},
	{"ô", "ôa"},
	{"kébab", "ké"},
	{"aérer", "aé"},
	{"living-room", "li"},

	// dicthtml-ja
	// Note, Kanji not currently implemented, so not testing (note, the logic
	// is in a separate function, anyways).
	// {"あ", "あ"},
	// {"アークとう", "アー"},

	// generated by dictword-test: spaces
	{" x", "xa"},
	{" ", "11"},
	{"x ", "xa"},
	{"  ", "11"},
	{"   ", "11"},
	{"\t\t", "11"},
	{"\t\f\t", "11"},
	{"x ", "xa"},
	{" xx", "xa"},

	// generated by dictword-test: spaces where trim/prefix order matters
	{"  x", "11"},
	{"  xy", "11"},
	{"  xyz", "11"},
	{"x z", "xa"},
	{"x z", "xa"},

	// generated by dictword-test: cyrillic
	{" д", "д"},
	{"д ", "д"},
	{" ", "11"},
	{"  ", "11"},
	{"   ", "11"},
	{" дд", "д"},
	{"д ", "д"},
	{"д", "д"},
	{"aд", "aд"},
	{"дa", "дa"},
	{"aдa", "aд"},
	{"дaд", "дa"},

	// generated by dictword-test: uppercase accented letters
	{"Ȅe", "ȅe"},
	{"eȄ", "eȅ"},
	{"Ȅ", "ȅa"},
	{"Ȅ!", "11"},

	// generated by dictword-test: cjk
	{" 未", "未a"},
	{"  未", "11"},
	{"未", "未a"},
	{"未未", "未未"},
	{"x未", "x未"},
	{"未x", "未x"},
	{"xy未", "xy"},
	{"还没", "还没"},

	// generated by dictword-test: misc
	{"!", "11"},
	{"!!", "11"},
	{"!!!", "11"},
	{"x!", "11"},
	{"x!!", "11"},
	{"xx!", "xx"},
	{"xxx!", "xx"},
	{"  !", "11"},
	{" !!", "11"},
	{" !!!", "11"},
	{" !", "11"},
	{"  !!", "11"},
	{"   !!!", "11"},
	{" x!", "xa"},
	{" x!!", "xa"},
	{" xx!", "xa"},
	{" xxx!", "xa"},

	// synthetic
	{"x\x00y", "xa"},
	{"\x00xy", "11"},
}
124 | 
125 | func TestWordPrefix(t *testing.T) {
126 | 	for _, tc := range tcs {
127 | 		t.Logf("word %#v (%#v)", tc.w, tc.p)
128 | 		if p := wordPrefix(tc.w); p != tc.p {
129 | 			t.Errorf("    got (original version) %#v", p)
130 | 		}
131 | 		if p := WordPrefix(tc.w); p != tc.p {
132 | 			t.Errorf("    got (simplified version) %#v", p)
133 | 		}
134 | 	}
135 | }
136 | 
137 | func BenchmarkWordPrefix(b *testing.B) {
138 | 	for _, tcf := range []struct {
139 | 		n  string
140 | 		fn func(string) string
141 | 	}{
142 | 		{"Orig/", wordPrefix},
143 | 		{"Smpl/", WordPrefix},
144 | 	} {
145 | 		// all test cases
146 | 		b.Run(tcf.n+"All"+strconv.Itoa(len(tcs)), func(b *testing.B) {
147 | 			for i := 0; i < b.N; i++ {
148 | 				for _, tc := range tcs {
149 | 					tcf.fn(tc.w)
150 | 				}
151 | 			}
152 | 		})
153 | 
154 | 		// near-worst possible case
155 | 		b.Run(tcf.n+"Worst", func(b *testing.B) {
156 | 			for i := 0; i < b.N; i++ {
157 | 				tcf.fn(" 还д 没")
158 | 			}
159 | 		})
160 | 
161 | 		// normal case
162 | 		b.Run(tcf.n+"Normal", func(b *testing.B) {
163 | 			for i := 0; i < b.N; i++ {
164 | 				tcf.fn("Test")
165 | 			}
166 | 		})
167 | 
168 | 		// best case
169 | 		b.Run(tcf.n+"Best", func(b *testing.B) {
170 | 			for i := 0; i < b.N; i++ {
171 | 				tcf.fn("aa")
172 | 			}
173 | 		})
174 | 	}
175 | }
176 | 


--------------------------------------------------------------------------------
/kobodict/writer.go:
--------------------------------------------------------------------------------
  1 | package kobodict
  2 | 
  3 | import (
  4 | 	"archive/zip"
  5 | 	"bytes"
  6 | 	"compress/gzip"
  7 | 	"fmt"
  8 | 	"io"
  9 | 	"sort"
 10 | 	"strings"
 11 | )
 12 | 
// Writer creates dictzips. It does not do any validation; it only does what it
// is told. It is up to the user to ensure the input is valid.
type Writer struct {
	// z is the zip archive being written.
	z      *zip.Writer
	// e is the encrypter assigned by SetEncrypter; when it is not nil, dicthtml
	// files created afterwards are encrypted with it.
	e      Encrypter
	// words is the set of index words collected by AddWord, written out as the
	// "words" entry by Close.
	words  map[string]struct{} // doesn't take up space for values
	// used is the set of filenames already created in the zip.
	used   map[string]struct{}
	// closed, when true, makes AddWord, CreateDicthtml, and Close return an
	// error.
	closed bool
	// last is the writer for the most recently created file; it is closed
	// before the next file is created and by Close.
	last   io.WriteCloser
}
 23 | 
// Encrypter encrypts dicthtml files. It is attached to a Writer with
// SetEncrypter.
type Encrypter interface {
	// Encrypt encrypts the provided bytes and returns the result. The Writer
	// calls it once per dicthtml file, with the complete gzip-compressed
	// contents of that file.
	Encrypt([]byte) ([]byte, error)
}
 29 | 
 30 | // NewWriter creates a dictzip writer writing to w.
 31 | func NewWriter(w io.Writer) *Writer {
 32 | 	return &Writer{
 33 | 		z:     zip.NewWriter(w),
 34 | 		words: map[string]struct{}{},
 35 | 		used:  map[string]struct{}{},
 36 | 	}
 37 | }
 38 | 
 39 | // AddWord normalizes and adds a word to the index. If the word has already been
 40 | // added, it does nothing.
 41 | func (w *Writer) AddWord(word string) error {
 42 | 	if w.closed {
 43 | 		return fmt.Errorf("write to closed writer")
 44 | 	}
 45 | 	w.words[strings.TrimSpace(word)] = struct{}{} // index words aren't normalized except for trimming spaces
 46 | 	return nil
 47 | }
 48 | 
 49 | // CreateDicthtml adds a dicthtml file for the specified prefix and returns a
 50 | // writer which is valid until the next file is created.
 51 | func (w *Writer) CreateDicthtml(prefix string) (io.Writer, error) {
 52 | 	if strings.Contains(prefix, "/") {
 53 | 		return nil, fmt.Errorf("invalid prefix: must not contain slashes")
 54 | 	}
 55 | 	if w.closed {
 56 | 		return nil, fmt.Errorf("writer already closed")
 57 | 	}
 58 | 	if w.last != nil {
 59 | 		if err := w.last.Close(); err != nil {
 60 | 			return nil, fmt.Errorf("close last file writer: %w", err)
 61 | 		}
 62 | 		w.last = nil
 63 | 	}
 64 | 
 65 | 	filename := prefix + ".html"
 66 | 	if _, ok := w.used[filename]; ok {
 67 | 		return nil, fmt.Errorf("file %#v already exists in dictzip", filename)
 68 | 	}
 69 | 
 70 | 	fw, err := w.z.Create(filename)
 71 | 	if err != nil {
 72 | 		return nil, fmt.Errorf("create zip entry: %w", err)
 73 | 	}
 74 | 
 75 | 	if w.e != nil {
 76 | 		ew := newEncryptWriter(w.e, fw)
 77 | 		zw := gzip.NewWriter(ew)
 78 | 
 79 | 		w.last = &funcWriteCloser{
 80 | 			Writer: zw,
 81 | 			Closer: func() error {
 82 | 				if err := zw.Close(); err != nil {
 83 | 					return err
 84 | 				}
 85 | 				return ew.Close()
 86 | 			},
 87 | 		}
 88 | 	} else {
 89 | 		w.last = gzip.NewWriter(fw)
 90 | 	}
 91 | 
 92 | 	w.used[filename] = struct{}{}
 93 | 	return w.last, nil
 94 | }
 95 | 
 96 | // CreateFile adds a raw file with the specified name. Note that Kobo only
 97 | // supports GIF and JPEG files starting with the "GIF" and "JFIF" magic, and the
 98 | // treatment of other files is undefined. In addition, subdirectories are not
 99 | // supported. The behaviour is undefined if a dicthtml file is added this way.
100 | func (w *Writer) CreateFile(filename string) (io.Writer, error) {
101 | 	if strings.Contains(filename, "/") || strings.Contains(filename, "\\") {
102 | 		return nil, fmt.Errorf("invalid filename: must not contain slashes")
103 | 	} else if strings.Contains(filename, "words") {
104 | 		return nil, fmt.Errorf("invalid filename: must not be 'words'")
105 | 	} else if _, ok := w.used[filename]; ok {
106 | 		return nil, fmt.Errorf("file %#v already exists in dictzip", filename)
107 | 	}
108 | 	if w.last != nil {
109 | 		if err := w.last.Close(); err != nil {
110 | 			return nil, fmt.Errorf("close last file writer: %w", err)
111 | 		}
112 | 		w.last = nil
113 | 	}
114 | 
115 | 	fw, err := w.z.Create(filename)
116 | 	if err != nil {
117 | 		return nil, fmt.Errorf("create zip entry: %w", err)
118 | 	}
119 | 
120 | 	w.last = &funcWriteCloser{
121 | 		Writer: fw,
122 | 		Closer: nil,
123 | 	}
124 | 	w.used[filename] = struct{}{}
125 | 	return w.last, nil
126 | }
127 | 
128 | // Exists checks if a file already exists in the dictzip with the specified name.
129 | func (w *Writer) Exists(fn string) bool {
130 | 	_, ok := w.used[fn]
131 | 	return ok
132 | }
133 | 
134 | // Close writes the marisa index and the zip footer. The error should not be
135 | // ignored. It does not close the underlying writer.
136 | func (w *Writer) Close() error {
137 | 	if w.closed {
138 | 		return fmt.Errorf("writer already closed")
139 | 	}
140 | 	if w.last != nil {
141 | 		if err := w.last.Close(); err != nil {
142 | 			return fmt.Errorf("close last file writer: %w", err)
143 | 		}
144 | 		w.last = nil
145 | 	}
146 | 
147 | 	var words []string
148 | 	for word := range w.words {
149 | 		words = append(words, word)
150 | 	}
151 | 	sort.Strings(words)
152 | 
153 | 	if fw, err := w.z.Create("words"); err != nil {
154 | 		return fmt.Errorf("create index zip entry: %w", err)
155 | 	} else if Marisa == nil {
156 | 		return fmt.Errorf("no marisa bindings found")
157 | 	} else if err := Marisa.WriteAll(fw, words); err != nil {
158 | 		return fmt.Errorf("write index: %w", err)
159 | 	}
160 | 
161 | 	if err := w.z.Close(); err != nil {
162 | 		return fmt.Errorf("close zip: %w", err)
163 | 	}
164 | 	return nil
165 | }
166 | 
// SetEncrypter sets the Encrypter used to encrypt dicthtml files. It only
// applies to dicthtml files created after the encrypter is set; setting it to
// nil disables encryption for subsequently created files.
func (w *Writer) SetEncrypter(e Encrypter) {
	w.e = e
}
172 | 
173 | type encryptWriter struct {
174 | 	e Encrypter
175 | 	w io.Writer
176 | 	b *bytes.Buffer
177 | 	c bool
178 | }
179 | 
180 | func newEncryptWriter(e Encrypter, w io.Writer) io.WriteCloser {
181 | 	return &encryptWriter{
182 | 		e: e,
183 | 		w: w,
184 | 		b: bytes.NewBuffer(nil),
185 | 		c: false,
186 | 	}
187 | }
188 | 
189 | func (e encryptWriter) Write(buf []byte) (n int, err error) {
190 | 	if e.c {
191 | 		return 0, fmt.Errorf("write to closed writer")
192 | 	}
193 | 	return e.b.Write(buf)
194 | }
195 | 
196 | // Close encrypts and writes the buffer to the underlying writer. The error
197 | // should be checked.
198 | func (e encryptWriter) Close() error {
199 | 	if e.c {
200 | 		return fmt.Errorf("writer already closed")
201 | 	}
202 | 	if buf, err := e.e.Encrypt(e.b.Bytes()); err != nil {
203 | 		return fmt.Errorf("encrypt bytes: %w", err)
204 | 	} else if _, err := e.w.Write(buf); err != nil {
205 | 		return fmt.Errorf("write encrypted bytes: %w", err)
206 | 	}
207 | 	return nil
208 | }
209 | 
// funcWriteCloser turns an io.Writer into an io.WriteCloser by pairing it with
// an optional close callback.
type funcWriteCloser struct {
	io.Writer
	Closer func() error
}

// Close invokes the close callback and returns its result. Without a callback,
// it does nothing and reports success.
func (f *funcWriteCloser) Close() error {
	if f.Closer == nil {
		return nil
	}
	return f.Closer()
}
221 | 


--------------------------------------------------------------------------------
/kobodict/writer_test.go:
--------------------------------------------------------------------------------
1 | package kobodict
2 | 
3 | // TODO(v1)
4 | 


--------------------------------------------------------------------------------
/marisa/libmarisa_generate.go:
--------------------------------------------------------------------------------
  1 | //+build libmarisa_generate
  2 | 
  3 | package main
  4 | 
  5 | import (
  6 | 	"archive/tar"
  7 | 	"bytes"
  8 | 	"compress/gzip"
  9 | 	"fmt"
 10 | 	"io"
 11 | 	"io/ioutil"
 12 | 	"net/http"
 13 | 	"os"
 14 | 	"path"
 15 | 	"regexp"
 16 | 	"strings"
 17 | )
 18 | 
 19 | func main() {
 20 | 	url := "https://github.com/s-yata/marisa-trie/archive/970b20c141f11d9d7572a6bb8d0488f2e0520e22.tar.gz"
 21 | 	version := "970b20c"
 22 | 
 23 | 	if files, err := tarball(url); err != nil {
 24 | 		fmt.Fprintf(os.Stderr, "Error: download tarball %#v: %v\n", url, err)
 25 | 		os.Exit(1)
 26 | 		return
 27 | 	} else if err := func() error {
 28 | 		if mr, err := libmarisa(files, version); err != nil {
 29 | 			return err
 30 | 		} else if mf, err := os.OpenFile("libmarisa.cc", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644); err != nil {
 31 | 			return err
 32 | 		} else if _, err := io.Copy(mf, mr); err != nil {
 33 | 			mf.Close()
 34 | 			return err
 35 | 		} else {
 36 | 			return mf.Close()
 37 | 		}
 38 | 	}(); err != nil {
 39 | 		fmt.Fprintf(os.Stderr, "Error: generate libmarisa.cc: %v\n", err)
 40 | 		os.Exit(1)
 41 | 		return
 42 | 	} else if err := func() error {
 43 | 		if mr, err := hmarisa(files, version); err != nil {
 44 | 			return err
 45 | 		} else if mf, err := os.OpenFile("libmarisa.h", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644); err != nil {
 46 | 			return err
 47 | 		} else if _, err := io.Copy(mf, mr); err != nil {
 48 | 			mf.Close()
 49 | 			return err
 50 | 		} else {
 51 | 			return mf.Close()
 52 | 		}
 53 | 	}(); err != nil {
 54 | 		fmt.Fprintf(os.Stderr, "Error: generate libmarisa.h: %v\n", err)
 55 | 		os.Exit(1)
 56 | 		return
 57 | 	}
 58 | }
 59 | func hmarisa(files map[string][]byte, version string) (io.Reader, error) {
 60 | 	marisaH, err := resolve(files, []string{
 61 | 		"include/marisa.h",
 62 | 	}, "include", "lib")
 63 | 	if err != nil {
 64 | 		return nil, err
 65 | 	}
 66 | 
 67 | 	fmt.Printf("Generating libmarisa.h\n")
 68 | 	return io.MultiReader(
 69 | 		// A custom header.
 70 | 		strings.NewReader("// AUTOMATICALLY GENERATED, DO NOT EDIT!\n"),
 71 | 		strings.NewReader("// merged from marisa-trie "+version+".\n"),
 72 | 		// Include the license info.
 73 | 		bytes.NewReader([]byte{'\n', '/', '/', ' '}),
 74 | 		bytes.NewReader(bytes.ReplaceAll(files["COPYING.md"], []byte{'\n'}, []byte{'\n', '/', '/', ' '})),
 75 | 		bytes.NewReader([]byte{'\n', '\n'}),
 76 | 		// Include the header.
 77 | 		bytes.NewReader(marisaH),
 78 | 	), nil
 79 | }
 80 | 
 81 | func libmarisa(files map[string][]byte, version string) (io.Reader, error) {
 82 | 	marisaGrimoireIOLib, err := resolve(files, []string{
 83 | 		"lib/marisa/grimoire/io/mapper.cc",
 84 | 		"lib/marisa/grimoire/io/reader.cc",
 85 | 		"lib/marisa/grimoire/io/writer.cc",
 86 | 	}, "include", "lib")
 87 | 	if err != nil {
 88 | 		return nil, err
 89 | 	}
 90 | 
 91 | 	marisaGrimoireTrieLib, err := resolve(files, []string{
 92 | 		"lib/marisa/grimoire/trie/tail.cc",
 93 | 		"lib/marisa/grimoire/trie/louds-trie.cc",
 94 | 	}, "include", "lib")
 95 | 	if err != nil {
 96 | 		return nil, err
 97 | 	}
 98 | 
 99 | 	marisaGrimoireVectorLib, err := resolve(files, []string{
100 | 		"lib/marisa/grimoire/vector/bit-vector.cc",
101 | 	}, "include", "lib")
102 | 	if err != nil {
103 | 		return nil, err
104 | 	}
105 | 
106 | 	marisaLib, err := resolve(files, []string{
107 | 		"lib/marisa/agent.cc",
108 | 		"lib/marisa/keyset.cc",
109 | 		"lib/marisa/trie.cc",
110 | 	}, "include", "lib")
111 | 	if err != nil {
112 | 		return nil, err
113 | 	}
114 | 
115 | 	fmt.Printf("Generating libmarisa.cc\n")
116 | 	return io.MultiReader(
117 | 		// A custom header.
118 | 		strings.NewReader("// AUTOMATICALLY GENERATED, DO NOT EDIT!\n"),
119 | 		strings.NewReader("// merged from marisa-trie "+version+".\n"),
120 | 		// Include the license info.
121 | 		bytes.NewReader([]byte{'\n', '/', '/', ' '}),
122 | 		bytes.NewReader(bytes.ReplaceAll(files["COPYING.md"], []byte{'\n'}, []byte{'\n', '/', '/', ' '})),
123 | 		bytes.NewReader([]byte{'\n', '\n'}),
124 | 		// Include the warnings from the Makefile.am CXXFLAGS.
125 | 		// - Note that Clang also recognizes the GCC pragmas.
126 | 		strings.NewReader("#pragma GCC diagnostic warning \"-Wall\"\n"),
127 | 		strings.NewReader("#pragma GCC diagnostic warning \"-Weffc++\"\n"),
128 | 		strings.NewReader("#pragma GCC diagnostic warning \"-Wextra\"\n"),
129 | 		strings.NewReader("#pragma GCC diagnostic warning \"-Wconversion\"\n"),
130 | 		// Silence a warning.
131 | 		strings.NewReader("#pragma GCC diagnostic ignored \"-Wimplicit-fallthrough=\"\n"),
132 | 		// Include the libs themselves.
133 | 		bytes.NewReader(marisaGrimoireIOLib),
134 | 		bytes.NewReader(marisaGrimoireTrieLib),
135 | 		bytes.NewReader(marisaGrimoireVectorLib),
136 | 		bytes.NewReader(marisaLib),
137 | 		// Show info about the generated file.
138 | 		strings.NewReader("#line 1 \"libmarisa_generate.go\"\n"),
139 | 		strings.NewReader("#pragma GCC warning \"Using generated built-in marisa-trie "+version+".\"\n"),
140 | 	), nil
141 | }
142 | 
// tarball downloads the gzipped tarball at url, and returns the contents of the
// files in it, keyed by their path relative to the single top-level directory
// all of the entries are expected to share. Directories and the
// pax_global_header entry are skipped. An error is returned if the server does
// not respond with 200 OK, if the archive cannot be read, or if an entry is not
// inside the common top-level directory.
func tarball(url string) (map[string][]byte, error) {
	fmt.Printf("Downloading tarball from %s\n", url)

	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// http.Get doesn't treat error statuses as errors, so check it explicitly
	// rather than trying to decompress an error page.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected response status %s", resp.Status)
	}

	zr, err := gzip.NewReader(resp.Body)
	if err != nil {
		return nil, err
	}
	defer zr.Close()

	var pfx string
	files := map[string][]byte{}

	tr := tar.NewReader(zr)
	for {
		fh, err := tr.Next()
		if err == io.EOF {
			break
		} else if err != nil {
			return nil, err
		}

		if fh.Name == "pax_global_header" || fh.FileInfo().IsDir() {
			continue
		}

		// The first file determines the top-level directory every other entry
		// must be in.
		if pfx == "" {
			if strings.HasPrefix(fh.Name, "./") {
				pfx = "./" + strings.Split(fh.Name, "/")[1] + "/"
			} else {
				pfx = strings.Split(fh.Name, "/")[0] + "/"
			}
		}

		if !strings.HasPrefix(fh.Name, pfx) {
			return nil, fmt.Errorf("extract file %#v: doesn't have common prefix %#v", fh.Name, pfx)
		}

		buf, err := ioutil.ReadAll(tr)
		if err != nil {
			return nil, fmt.Errorf("extract file %#v: %w", fh.Name, err)
		}

		fn := strings.TrimPrefix(fh.Name, pfx)
		files[fn] = buf

		fmt.Printf("  [D] %s\n", fn) // downloaded
	}

	return files, nil
}
198 | 
// resolve merges the named source files from files into a single buffer, with
// their #include directives replaced by the contents of the included files
// where those can be found in files. Each include is looked up relative to
// every directory in includePath (in order), as well as the directory of the
// file containing it. The resolved files are concatenated in the order given,
// each followed by two newlines. An error is returned if one of the named files
// is not in files, or if resolving an include fails.
func resolve(files map[string][]byte, filenames []string, includePath ...string) (resolvedFile []byte, err error) {
	fmt.Printf("Resolving C* source files %s (against:%s) (I = included, S = preserved because not found, R = skipped because already included)\n", filenames, includePath)

	// resolveFn resolves the includes in buf (the contents of filename), and
	// calls itself for every included file. done is the chain of files being
	// resolved at that point (the top-level file followed by the includes
	// leading to this one), and indent is the prefix for the progress output.
	var resolveFn func(indent string, files map[string][]byte, filename string, buf []byte, done []string, includePath []string) (resolvedFile []byte, err error)
	resolveFn = func(indent string, files map[string][]byte, filename string, buf []byte, done []string, includePath []string) (resolvedFile []byte, err error) {
		// The replacement callback below can't return an error, so it panics
		// with one instead, and it is turned back into a return value here.
		defer func() {
			if rerr := recover(); rerr != nil {
				resolvedFile, err = nil, rerr.(error)
			}
		}()

		// Replace each line consisting of an include directive (quoted or in
		// angle brackets).
		resolvedFile = regexp.MustCompile(`(?m)^\s*#\s*include\s+["'<][^"'>]+["'>]$`).ReplaceAllFunc(buf, func(importBuf []byte) []byte {
			// the included filename, without the quotes or brackets
			fn := string(regexp.MustCompile(`["'<]([^"'>]+)["'>]`).FindSubmatch(importBuf)[1])

			for _, ip := range includePath {
				ifn := path.Join(ip, fn)
				// Drop the directive if the file is already in the chain
				// of files being resolved.
				// NOTE(review): the done entry is used as the pattern for
				// path.Match, so a filename with glob metacharacters would not
				// be compared literally -- confirm this is acceptable.
				for _, dfn := range done {
					if m, _ := path.Match(dfn, ifn); m {
						fmt.Printf("%s[R] %s\n", indent, fn) // already included
						return nil
					}
				}

				ibuf, ok := files[ifn]
				if ok {
					fmt.Printf("%s[I] %s => %s\n", indent, fn, ifn) // include
					// Resolve the included file itself, with its own directory
					// added to the include path.
					ibuf, err := resolveFn(indent+"    ", files, ifn, ibuf, append(done, ifn), append(includePath, path.Dir(ifn)))
					if err != nil {
						panic(fmt.Errorf("resolve %#v: %w", ifn, err))
					}
					// surround the inlined contents with blank lines
					return append(append([]byte{'\n', '\n'}, ibuf...), '\n', '\n')
				}
			}

			// Not found in any include path (e.g. a system header), so the
			// directive is left as-is.
			fmt.Printf("%s[S] %s\n", indent, fn) // preserve
			return importBuf
		})

		return
	}

	for _, fn := range filenames {
		if buf, ok := files[fn]; !ok {
			return nil, fmt.Errorf("file %#v: not found", fn)
		} else if buf, err := resolveFn("  ", files, fn, buf, []string{fn}, append(includePath, path.Dir(fn))); err != nil {
			return nil, fmt.Errorf("file %v: %w", fn, err)
		} else {
			resolvedFile = append(resolvedFile, buf...)
			resolvedFile = append(resolvedFile, '\n', '\n')
		}
	}

	return resolvedFile, nil
}
253 | 


--------------------------------------------------------------------------------
/marisa/marisa.cc:
--------------------------------------------------------------------------------
 1 | #include <cstdlib>
 2 | #include <cstring>
 3 | #include <stdexcept>
 4 | #include <string>
 5 | 
 6 | #include "libmarisa.h"
 7 | #include "shim.h"
 8 | 
// catch_go_ex expands to a catch clause for exception type t. The handler
// returns a newly calloc'd C string made of ctx followed by ex.what(). The Go
// caller takes ownership of the string and frees it (see goerr in marisa.go).
// NOTE(review): the calloc result is not checked before strcpy - confirm
// whether allocation failure needs to be handled here.
#define catch_go_ex(t, ctx)                                                     \
    catch (const t &ex) {                                                       \
        const char* b = ctx;                                                    \
        char* err = reinterpret_cast<char*>(                                    \
            calloc(strlen(b)+strlen(ex.what())+1, sizeof(char)));               \
        strcpy(err, b);                                                         \
        strcat(err, ex.what());                                                 \
        return err;                                                             \
    }

// catch_go closes a try block in a go_func: every exception is converted into
// a heap-allocated error string (to be freed by the caller), and NULL is
// returned if the try block completed without throwing. The handlers are tried
// in order, so go::error is listed before its base class std::runtime_error.
#define catch_go                                                                \
    catch_go_ex(marisa::Exception, "marisa: ")                                  \
    catch_go_ex(go::error, "go shim: ")                                         \
    catch_go_ex(std::runtime_error, "c++ runtime: ")                            \
    catch_go_ex(std::exception, "c++ error: ")                                  \
    catch (...) { return strdup("marisa: unknown c++ exception"); }             \
    return NULL;

// go_func is the declaration prefix for functions called from Go: C linkage,
// returning an error string (or NULL on success).
#define go_func extern "C" const char*
28 | 
29 | go_func marisa_read_all(int iid, char ***out_wd, size_t *out_wd_sz) {
30 |     try {
31 |         if (!out_wd || !out_wd_sz)
32 |             throw std::runtime_error("parameter is null");
33 |         go::rstream r(iid);
34 |         marisa::Trie t;
35 |         marisa::read(r, &t);
36 |         marisa::Agent a;
37 |         a.set_query("");
38 |         *out_wd_sz = 0;
39 |         *out_wd = reinterpret_cast<char**>(calloc(t.num_keys(), sizeof(char**)));
40 |         while (t.predictive_search(a)) {
41 |             if (*out_wd_sz == t.num_keys())
42 |                 throw std::runtime_error("expected " + std::to_string(t.num_keys()) + " keys, got more");
43 |             memcpy((*out_wd)[(*out_wd_sz)++] = reinterpret_cast<char*>(calloc(a.key().length()+1, sizeof(char))), a.key().ptr(), a.key().length());
44 |         }
45 |         if (*out_wd_sz != t.num_keys())
46 |             throw std::runtime_error("expected " + std::to_string(t.num_keys()) + " keys, got " + std::to_string(*out_wd_sz));
47 |     } catch_go
48 | }
49 | 
50 | go_func marisa_write_all(int iid, const char** wd, size_t wd_sz) {
51 |     try {
52 |         if (wd_sz && !wd)
53 |             throw std::runtime_error("parameter is null");
54 |         marisa::Keyset k;
55 |         for (size_t i = 0; i < wd_sz; i++)
56 |             k.push_back(wd[i]);
57 |         marisa::Trie t;
58 |         t.build(k);
59 |         go::wstream w(iid);
60 |         marisa::write(w, t);
61 |     } catch_go
62 | }
63 | 


--------------------------------------------------------------------------------
/marisa/marisa.go:
--------------------------------------------------------------------------------
 1 | // Package marisa provides a simplified self-contained CGO wrapper for
 2 | // marisa-trie (https://github.com/s-yata/marisa-trie).
 3 | package marisa
 4 | 
 5 | //go:generate go run -tags libmarisa_generate libmarisa_generate.go
 6 | 
 7 | //#cgo CPPFLAGS: -Wall
 8 | //#cgo LDFLAGS:
 9 | //#include <stddef.h>
10 | //#include <stdlib.h>
11 | //const char* marisa_read_all(int iid, char ***out_wd, size_t *out_wd_sz);
12 | //const char* marisa_write_all(int iid, const char** wd, size_t wd_sz);
13 | import "C"
14 | 
15 | import (
16 | 	"errors"
17 | 	"io"
18 | 	"unsafe"
19 | )
20 | 
21 | func ReadAll(r io.Reader) ([]string, error) {
22 | 	iid := iopPut(r)
23 | 	var out_wd **C.char
24 | 	var out_wd_sz C.size_t
25 | 	err := C.marisa_read_all(
26 | 		(C.int)(iid),
27 | 		(***C.char)(unsafe.Pointer(&out_wd)),
28 | 		(*C.size_t)(unsafe.Pointer(&out_wd_sz)),
29 | 	)
30 | 	iopDel(iid)
31 | 	return gostrs(out_wd, out_wd_sz), goerr(err)
32 | }
33 | 
34 | func WriteAll(w io.Writer, wd []string) error {
35 | 	iid := iopPut(w)
36 | 	wd_ptr, wd_sz, wd_free := cstrs(wd)
37 | 	err := C.marisa_write_all(
38 | 		(C.int)(iid),
39 | 		(**C.char)(wd_ptr),
40 | 		(C.size_t)(wd_sz),
41 | 	)
42 | 	wd_free()
43 | 	iopDel(iid)
44 | 	return goerr(err)
45 | }
46 | 
47 | func goerr(p *C.char) (err error) {
48 | 	if p != nil {
49 | 		err = errors.New(C.GoString(p))
50 | 		C.free(unsafe.Pointer(p))
51 | 	}
52 | 	return
53 | }
54 | 
55 | func gostrs(p **C.char, n C.size_t) (s []string) {
56 | 	if p != nil {
57 | 		s = make([]string, int(n))
58 | 		for i, v := range (*[1 << 28]*C.char)(unsafe.Pointer(p))[:int(n):int(n)] {
59 | 			s[i] = C.GoString(v)
60 | 			C.free(unsafe.Pointer(v))
61 | 		}
62 | 		C.free(unsafe.Pointer(p))
63 | 	}
64 | 	return
65 | }
66 | 
67 | func cstrs(s []string) (p **C.char, n C.size_t, free func()) {
68 | 	n = (C.size_t)(len(s))
69 | 	if len(s) == 0 {
70 | 		free = func() {}
71 | 		return
72 | 	}
73 | 	c := make([]*C.char, len(s))
74 | 	for i, v := range s {
75 | 		c[i] = C.CString(v)
76 | 	}
77 | 	p = (**C.char)(unsafe.Pointer(&c[0]))
78 | 	free = func() {
79 | 		for _, v := range c {
80 | 			C.free(unsafe.Pointer(v))
81 | 		}
82 | 	}
83 | 	return
84 | }
85 | 


--------------------------------------------------------------------------------
/marisa/marisa_test.go:
--------------------------------------------------------------------------------
 1 | package marisa
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"crypto/sha1"
 6 | 	"encoding/hex"
 7 | 	"errors"
 8 | 	"io"
 9 | 	"reflect"
10 | 	"runtime"
11 | 	"strings"
12 | 	"testing"
13 | )
14 | 
15 | func TestTrieIO(t *testing.T) {
16 | 	emptyBuf := bytes.NewBuffer(nil)
17 | 	emptyS := "1aa6c451104c2c1b24ecb66ecb84bde2403c49b1" // marisa-build </dev/null | sha1sum -
18 | 
19 | 	normalWd := []string{"asd", "bnm", "cvb", "dfg"} // for n in asd bnm cvb dfg; do echo $n; done | marisa-build | sha1sum -
20 | 	normalBuf := bytes.NewBuffer(nil)
21 | 	normalS := "bdf9be48216379734fa0256263467ba6ab2e0931"
22 | 
23 | 	t.Run("WriteAll", func(t *testing.T) {
24 | 		t.Run("Error", func(t *testing.T) {
25 | 			err := WriteAll(new(errIO), normalWd)
26 | 			t.Logf("err=%v", err)
27 | 			if v := "MARISA_IO_ERROR"; err == nil || !strings.Contains(err.Error(), v) {
28 | 				t.Errorf("expected err to contain `%v`, got `%v`", v, err)
29 | 			}
30 | 		})
31 | 		t.Run("Empty", func(t *testing.T) {
32 | 			ss := sha1.New()
33 | 			if err := WriteAll(io.MultiWriter(emptyBuf, ss), nil); err != nil {
34 | 				t.Errorf("unexpected error: %v", err)
35 | 			}
36 | 			t.Logf("sum=%x", ss.Sum(nil))
37 | 			if runtime.GOARCH == "amd64" {
38 | 				if v := hex.EncodeToString(ss.Sum(nil)); v != emptyS {
39 | 					t.Errorf("output sha1 mismatch: expected %s, got %s", emptyS, v)
40 | 				}
41 | 			} else {
42 | 				t.Logf("skipping sha1 check on non-amd64 architecture, as the correct file differs slightly on each one (usually by ~4 bytes)")
43 | 			}
44 | 		})
45 | 		t.Run("Normal", func(t *testing.T) {
46 | 			ss := sha1.New()
47 | 			if err := WriteAll(io.MultiWriter(normalBuf, ss), normalWd); err != nil {
48 | 				t.Errorf("unexpected error: %v", err)
49 | 			}
50 | 			t.Logf("sum=%x", ss.Sum(nil))
51 | 			if runtime.GOARCH == "amd64" {
52 | 				if v := hex.EncodeToString(ss.Sum(nil)); v != normalS {
53 | 					t.Errorf("output sha1 mismatch: expected %s, got %s", normalS, v)
54 | 				}
55 | 			} else {
56 | 				t.Logf("skipping sha1 check on non-amd64 architecture, as the correct file differs slightly on each one (usually by ~4 bytes)")
57 | 			}
58 | 		})
59 | 	})
60 | 	t.Run("ReadAll", func(t *testing.T) {
61 | 		t.Run("Error", func(t *testing.T) {
62 | 			wd, err := ReadAll(new(errIO))
63 | 			if v := "MARISA_IO_ERROR"; err == nil || !strings.Contains(err.Error(), v) {
64 | 				t.Errorf("expected err to contain `%v`, got `%v`", v, err)
65 | 			}
66 | 			t.Logf("err=%v", err)
67 | 			if wd != nil {
68 | 				t.Errorf("expected returned slice to be nil, got %#v", wd)
69 | 			}
70 | 		})
71 | 		t.Run("Empty", func(t *testing.T) {
72 | 			wd, err := ReadAll(emptyBuf)
73 | 			if err != nil {
74 | 				t.Errorf("unexpected error: %v", err)
75 | 			}
76 | 			t.Logf("wd=%+s", wd)
77 | 			if len(wd) != 0 {
78 | 				t.Errorf("expected no words to be returned")
79 | 			}
80 | 		})
81 | 		t.Run("Normal", func(t *testing.T) {
82 | 			wd, err := ReadAll(normalBuf)
83 | 			if err != nil {
84 | 				t.Errorf("unexpected error: %v", err)
85 | 			}
86 | 			t.Logf("wd=%+s", wd)
87 | 			if !reflect.DeepEqual(wd, normalWd) {
88 | 				t.Errorf("expected %#v, got %#v", normalWd, wd)
89 | 			}
90 | 		})
91 | 	})
92 | }
93 | 
// errIO is an io.Reader and io.Writer on which every operation fails with
// "go_test_error" without transferring any bytes.
type errIO struct{}

// Write always fails.
func (_ *errIO) Write(_ []byte) (n int, err error) {
	return 0, errors.New("go_test_error")
}

// Read always fails.
func (_ *errIO) Read(_ []byte) (n int, err error) {
	return 0, errors.New("go_test_error")
}
98 | 


--------------------------------------------------------------------------------
/marisa/shim.go:
--------------------------------------------------------------------------------
  1 | package marisa
  2 | 
  3 | //#cgo CXXFLAGS: -std=c++11
  4 | //#include <stdbool.h>
  5 | //#include <stddef.h>
  6 | import "C"
  7 | 
  8 | import (
  9 | 	"fmt"
 10 | 	"io"
 11 | 	"sync"
 12 | 	"unsafe"
 13 | )
 14 | 
 15 | // shim.go and shim.h (plus _cgo_export.h implicitly), implement a shim to
 16 | // access Go I/O interfaces efficiently, concurrently, cleanly, and safely from
 17 | // C/C++ code. Note that if any C strings are returned by the Go side, they must
 18 | // be freed on the C side.
 19 | 
 20 | // https://golang.org/issue/13656#issuecomment-253600758
 21 | // https://golang.org/cmd/cgo/#hdr-C_references_to_Go
 22 | // https://stackoverflow.com/a/49879469
 23 | 
// iop is the table of registered readers/writers, indexed by iid. It is
// appended to by iopPut, read by iopGet, and has entries cleared by iopDel.
var (
	iopMu sync.RWMutex         // for controlling access to the slice header (i.e. https://stackoverflow.com/a/49879469)
	iop   = []interface{}{nil} // the 0th element is reserved to prevent mistakes
)
 28 | 
 29 | // iopPut adds the io.Reader and/or io.Writer, and returns its new iid. The iid
 30 | // will be valid until iopDel is called, but will never be reused.
 31 | func iopPut(rw interface{}) int {
 32 | 	switch rw.(type) {
 33 | 	case io.Reader, io.Writer:
 34 | 		iopMu.Lock()
 35 | 		iop = append(iop, rw)
 36 | 		iid := len(iop) - 1
 37 | 		iopMu.Unlock()
 38 | 		return iid
 39 | 	default:
 40 | 		panic("not a reader, writer, or both")
 41 | 	}
 42 | }
 43 | 
 44 | // iopGet gets the interface referenced by iid. It will panic if iid has never
 45 | // been issued by iopPut, and will return nil if it has been deleted by iopDel.
 46 | func iopGet(iid int) interface{} {
 47 | 	iopMu.RLock()
 48 | 	if iid <= 0 || iid >= len(iop) {
 49 | 		panic("invalid iid")
 50 | 	}
 51 | 	r := iop[iid]
 52 | 	iopMu.RUnlock()
 53 | 	return r
 54 | }
 55 | 
 56 | // iopDel sets the interface referenced by iid to nil to prevent future usage.
 57 | // It will panic if iid has never been issued by iopPut.
 58 | func iopDel(iid int) {
 59 | 	iopMu.RLock()
 60 | 	if iid <= 0 || iid >= len(iop) {
 61 | 		panic("invalid iid")
 62 | 	}
 63 | 	iop[iid] = nil
 64 | 	iopMu.RUnlock()
 65 | }
 66 | 
 67 | //export go_iop_check
 68 | func go_iop_check(iid C.int, t C.int, out_err **C.char) bool /*C.bool*/ {
 69 | 	var n []string
 70 | 	i := iopGet(int(iid))
 71 | 	if t&(1<<0) != 0 { // go_iop_type::reader
 72 | 		if _, ok := iopGet(int(iid)).(io.Reader); !ok {
 73 | 			n = append(n, "io.Reader")
 74 | 		}
 75 | 	}
 76 | 	if t&(1<<1) != 0 { // go_iop_type::writer
 77 | 		if _, ok := iopGet(int(iid)).(io.Writer); !ok {
 78 | 			n = append(n, "io.Writer")
 79 | 		}
 80 | 	}
 81 | 	if out_err != nil {
 82 | 		if len(n) != 0 {
 83 | 			*out_err = C.CString(fmt.Sprintf("iid %d: underlying type %T does not implement types %s", int(iid), i, n))
 84 | 		} else {
 85 | 			*out_err = nil
 86 | 		}
 87 | 	}
 88 | 	return len(n) == 0
 89 | }
 90 | 
 91 | //export go_iop_read
 92 | func go_iop_read(iid C.int, buf *C.char, buf_n C.size_t, out_err **C.char) C.ptrdiff_t {
 93 | 	*out_err = nil
 94 | 	switch i := iopGet(int(iid)).(type) {
 95 | 	case io.Reader:
 96 | 		n, err := i.Read((*[1 << 28]byte)(unsafe.Pointer(buf))[:int(buf_n):int(buf_n)])
 97 | 		if err == io.EOF {
 98 | 			if n == 0 {
 99 | 				return C.ptrdiff_t(-1)
100 | 			}
101 | 		} else if err != nil {
102 | 			*out_err = C.CString(fmt.Sprintf("go_iop_read: read up to %d bytes from iid %d: %v", buf_n, int(iid), err))
103 | 		}
104 | 		return C.ptrdiff_t(n)
105 | 	case nil:
106 | 		*out_err = C.CString(fmt.Sprintf("go_iop_read: iid %d has been deleted", int(iid)))
107 | 		return C.ptrdiff_t(0)
108 | 	default:
109 | 		*out_err = C.CString(fmt.Sprintf("go_iop_read: iid %d is a %T, not an io.Reader", int(iid), i))
110 | 		return C.ptrdiff_t(0)
111 | 	}
112 | }
113 | 
114 | //export go_iop_write
115 | func go_iop_write(iid C.int, buf *C.char, buf_n C.size_t, out_err **C.char) C.ptrdiff_t {
116 | 	*out_err = nil
117 | 	switch i := iopGet(int(iid)).(type) {
118 | 	case io.Writer:
119 | 		n, err := i.Write((*[1 << 28]byte)(unsafe.Pointer(buf))[:int(buf_n):int(buf_n)])
120 | 		if err == io.EOF {
121 | 			if n == 0 {
122 | 				return C.ptrdiff_t(-1)
123 | 			}
124 | 		} else if err != nil {
125 | 			*out_err = C.CString(fmt.Sprintf("go_iop_write: write up to %d bytes to iid %d: %v", buf_n, int(iid), err))
126 | 		}
127 | 		return C.ptrdiff_t(n)
128 | 	case nil:
129 | 		*out_err = C.CString(fmt.Sprintf("go_iop_write: iid %d has been deleted", int(iid)))
130 | 		return C.ptrdiff_t(0)
131 | 	default:
132 | 		*out_err = C.CString(fmt.Sprintf("go_iop_write: iid %d is a %T, not an io.Writer", int(iid), i))
133 | 		return C.ptrdiff_t(0)
134 | 	}
135 | }
136 | 


--------------------------------------------------------------------------------
/marisa/shim.h:
--------------------------------------------------------------------------------
  1 | #ifndef GO_SHIM_H
  2 | #define GO_SHIM_H
  3 | 
  4 | #ifdef __cplusplus
  5 | #include <cstddef>
  6 | extern "C" {
  7 | #else
  8 | #include <stdbool.h>
  9 | #include <stddef.h>
 10 | #endif
 11 | 
 12 | // go_iop_type represents interfaces an iid may implement.
 13 | enum go_iop_type {
 14 |     reader = 1 << 0, // io.Reader
 15 |     writer = 1 << 1, // io.Writer
 16 | };
 17 | 
// go_iop_check checks if the specified iid implements the specified ORed type
// flags (see go_iop_type). Note that it doesn't have to be checked here, as
// go_iop_* will return an error if it doesn't implement the necessary
// interfaces. If out_err is not NULL, it will be set to an error message,
// which must be freed by the caller, if the iid doesn't implement the
// specified flags (i.e. the return value is false), and to NULL otherwise.
bool go_iop_check(int iid, int t, char **out_err);
 25 | 
 26 | // Note: we use ptrdiff_t over ssize_t for portability (and not size_t because
 27 | // it will return -1 for EOF). Also, note that C++'s std::streamsize uses this
 28 | // internally too, which is a nice advantage.
 29 | 
 30 | // go_iop_read reads from the iid's underlying io.Reader. It has the same
 31 | // semantics as the Go one, but io.EOF is returned as -1. out_err must be a
 32 | // valid pointer to a char pointer. If an error occured, it is set and must be
 33 | // freed by the caller.
 34 | ptrdiff_t go_iop_read(int iid, const char *p, size_t n, char **out_err);
 35 | // go_iop_write writes to the iid's underlying io.Writer. It has the same
 36 | // semantics as the Go one, but io.EOF is returned as -1. out_err must be a
 37 | // valid pointer to a char pointer. If an error occured, it is set and must be
 38 | // freed by the caller.
 39 | ptrdiff_t go_iop_write(int iid, const char *p, size_t n, char **out_err);
 40 | 
 41 | #ifdef __cplusplus
 42 | }
 43 | 
 44 | #include <cstdarg>
 45 | #include <cstdlib>
 46 | #include <iostream>
 47 | #include <stdexcept>
 48 | 
 49 | // https://golang.org/cmd/cgo/#hdr-C_references_to_Go
 50 | // https://en.cppreference.com/w/cpp/io/basic_streambuf <- this describes it better than many of the other sites I found
 51 | 
 52 | namespace go {
 53 | 
 54 | bool dbg(const char* format, ...) {
 55 |     static bool _dbg = getenv("GOSHIMDEBUG") ? getenv("GOSHIMDEBUG")[0] == '1' && getenv("GOSHIMDEBUG")[1] == '\0' : false;
 56 |     if (!_dbg)
 57 |         return false;
 58 |     fprintf(stderr, "GOSHIMDEBUG: ");
 59 |     va_list arg;
 60 |     va_start(arg, format);
 61 |     vfprintf(stderr, format, arg);
 62 |     va_end(arg);
 63 |     fflush(stderr);
 64 |     return true;
 65 | }
 66 | 
 67 | class error : public std::runtime_error {
 68 | public:
 69 |     error(const char* what) : std::runtime_error(what) {
 70 |         go::dbg("new go::error(%s)\n", what);
 71 |     };
 72 | 
 73 |     // check checks an output err pointer and frees+throws it if set.
 74 |     static void check(char* err) {
 75 |         if (!err)
 76 |             return;
 77 |         go::error ex = go::error(err);
 78 |         free(err);
 79 |         throw ex;
 80 |     }
 81 | };
 82 | 
 83 | class iopbuf : public std::basic_streambuf<char> {
 84 |     int iid_;
 85 |     char rbuf_; // single-byte read buffer (i.e. direct access to the io.Reader)
 86 | public:
 87 |     static_assert((std::is_same<iopbuf::char_type, char>::value && std::is_same<iopbuf::traits_type::char_type, char>::value), "Go shim only supports char"); // just to be safe
 88 |     #ifndef __clang__
 89 |     static_assert(iopbuf::traits_type::eof() != iopbuf::traits_type::to_int_type((char) 0xFF), "EOF not distinct from 0xFF"); // this is already specified in the spec, but just to make sure
 90 |     #endif
 91 | 
 92 |     iopbuf(int iid) : iid_(iid) {
 93 |         this->setg(&this->rbuf_, &this->rbuf_ + 1, &this->rbuf_ + 1); // set the buffer, but at the end to force the next read to underflow
 94 |     }
 95 | 
 96 |     iopbuf(int iid, int t) : iopbuf(iid) {
 97 |         char* err = NULL;
 98 |         go_iop_check(iid, t, &err);
 99 |         go::error::check(err);
100 |     }
101 | 
102 |     iopbuf::int_type underflow() override {
103 |         // This is all that's strictly needed for reading. Note that we can't
104 |         // just return the char, and we must set the buffer to point to it to
105 |         // conform to the expected postconditions and prevent unusual bugs from
106 |         // popping up.
107 | 
108 |         char* err = NULL;
109 |         ptrdiff_t n = go_iop_read(this->iid_, &this->rbuf_, 1, &err);
110 |         go::dbg("underflow: go_iop_read(%d, 1) = %td %02x err=%s\n", this->iid_, n, this->rbuf_, err); fflush(stdout);
111 |         go::error::check(err);
112 | 
113 |         this->setg(&this->rbuf_, &this->rbuf_, &this->rbuf_ + (n>0 ? n : 0));   // Update the current byte.
114 |         return this->gptr() == this->egptr()                                    // If the new current pos == past end of buffer, no byte was read (n<=0).
115 |             ? iopbuf::traits_type::eof()                                        // If no byte was read (and no error was thrown earlier), it's an EOF.
116 |             : iopbuf::traits_type::to_int_type(this->rbuf_);                    // Otherwise, return the byte we just read (note: without to_int_type, 0xFF would be sign extended to -1/eof).
117 |     }
118 | 
119 |     std::streamsize xsgetn(iopbuf::char_type* buf, std::streamsize buf_n) override {
120 |         // We can provide a more efficient bulk read implementation than the
121 |         // default one which gets each byte one-by-one in a loop.
122 |         // Note: Remember to test ::underflow by forcing it to use the default
123 |         // implementation: return std::streambuf::xsgetn(buf, buf_n);
124 | 
125 |         std::streamsize t = 0;
126 | 
127 |         ptrdiff_t n = 0;
128 |         char* err = NULL;
129 |         while (t != buf_n && n != -1) {
130 |             n = go_iop_read(this->iid_, buf+t, buf_n-t, &err);
131 |             go::dbg("xsgetn: go_iop_read(%d, %zu) = %td (%td/%td) err=%s\n", this->iid_, buf_n-t, n, t+(n>0 ? n : 0), buf_n, err); fflush(stdout);
132 |             t += n>0 ? n : 0;
133 |             if (t > buf_n)
134 |                 throw go::error("read returned too many bytes!");
135 |             go::error::check(err);
136 |         }
137 | 
138 |         this->rbuf_ = t>0 ? buf[t-1] : 0;                                       // Set the current byte to the last one read, if any.
139 |         this->setg(&this->rbuf_, &this->rbuf_, &this->rbuf_ + (t>0 ? 1 : 0));   // Update the current byte.
140 |         return this->gptr() == this->egptr()                                    // If the new current pos == past end of buffer, no byte was read (n<=0).
141 |             ? iopbuf::traits_type::eof()                                        // If no byte was read (and no error was thrown earlier), it's an EOF
142 |             : t;                                                                // Otherwise, return the number of bytes read.
143 |     }
144 | 
145 |     iopbuf::int_type overflow(iopbuf::int_type c = iopbuf::traits_type::eof()) override {
146 |         // Unlike for reading, we don't have to use a buffer (you can read a
147 |         // byte advancing, but you can't do that kind of thing when writing),
148 |         // so we'll just write it directly. This makes the implementation much
149 |         // simpler, as we're basically just passing the calls to the Go funcs
150 |         // directly.
151 | 
152 |         // Usually, we would flush the buffer if given an EOF instead of a char,
153 |         // but we're not using one, so it's a no-op.
154 |         if (iopbuf::traits_type::eq_int_type(c, iopbuf::traits_type::eof()))
155 |             return 0;
156 | 
157 |         // Since the logic is basically a simplified version of xsputn, just
158 |         // with a single char, it's easier just to call it and implement the
159 |         // bulk of the logic there.
160 |         if (this->xsputn(reinterpret_cast<iopbuf::traits_type::char_type*>(&c), 1) != 1)
161 |             throw go::error("short write"); // we still need to check for a short write
162 |         return c;
163 |     }
164 | 
165 |     std::streamsize xsputn(const iopbuf::char_type* buf, std::streamsize buf_n) override {
166 |         char* err = NULL;
167 |         ptrdiff_t n = go_iop_write(this->iid_, buf, buf_n, &err);
168 |         go::error::check(err);
169 |         if (n == -1)
170 |             throw go::error("EOF while writing to Go writer");
171 |         return n;
172 |     }
173 | };
174 | 
175 | class rwstream : private iopbuf, public std::iostream {
176 | public: rwstream(int iid) : iopbuf(iid, go_iop_type::reader|go_iop_type::writer), std::iostream(this) {}
177 | };
178 | 
179 | class wstream : private iopbuf, public std::ostream {
180 | public: wstream(int iid) : iopbuf(iid, go_iop_type::writer), std::ostream(this) {}
181 | };
182 | 
183 | class rstream : private iopbuf, public std::istream {
184 | public: rstream(int iid) : iopbuf(iid, go_iop_type::reader), std::istream(this) {}
185 | };
186 | 
187 | }
188 | 
189 | #endif
190 | #endif


--------------------------------------------------------------------------------