├── .appveyor.yml ├── .drone.yml ├── .gitattributes ├── .travis.yml ├── LICENSE.md ├── README.md ├── cmd ├── dictgen │ └── main.go └── dictutil │ ├── install.go │ ├── main.go │ ├── pack.go │ ├── prefix.go │ ├── uninstall.go │ └── unpack.go ├── dictgen ├── dictfile.go ├── dictfile_test.go ├── dictgen.go ├── image.go └── image_test.go ├── docs ├── _config.yml ├── _includes │ └── head_custom.html ├── dictgen │ └── index.md ├── dicthtml │ ├── format.md │ ├── index.md │ ├── install.md │ ├── matching.md │ ├── prefixes.md │ ├── v1v2-1.png │ ├── v1v2-2.png │ └── v1v2.md ├── dictutil │ ├── index.md │ ├── install.md │ ├── pack.md │ ├── prefix.md │ ├── uninstall.md │ └── unpack.md ├── examples │ ├── bgl-convert.md │ ├── dictzip-decompile.md │ ├── gotdict-convert.md │ ├── index.md │ └── webster1913-convert.md └── index.md ├── examples ├── bgl-convert │ └── index.html ├── dictzip-decompile │ ├── main.go │ └── parse.go ├── gotdict-convert │ ├── gotdict │ │ └── parser.go │ └── main.go └── webster1913-convert │ ├── main.go │ └── webster1913 │ └── parser.go ├── go.mod ├── go.sum ├── kobodict ├── crypt.go ├── crypt_test.go ├── fs.go ├── fs_test.go ├── marisa.go ├── marisa │ ├── marisa.go │ ├── marisa_cgo.go │ └── marisa_test.go ├── reader.go ├── reader_test.go ├── util.go ├── util_test.go ├── writer.go └── writer_test.go └── marisa ├── libmarisa.cc ├── libmarisa.h ├── libmarisa_generate.go ├── marisa.cc ├── marisa.go ├── marisa_test.go ├── shim.go └── shim.h /.appveyor.yml: -------------------------------------------------------------------------------- 1 | image: ubuntu 2 | version: "{build}" 3 | 4 | environment: 5 | GO111MODULE: on 6 | 7 | install: 8 | - go mod download 9 | 10 | build_script: 11 | - mkdir bin gotdict webster1913 12 | - CGO_ENABLED=1 go build -o ./bin/dictgen ./cmd/dictgen 13 | - CGO_ENABLED=0 go build -o ./bin/gotdict-convert ./examples/gotdict-convert 14 | - CGO_ENABLED=0 go build -o ./bin/webster1913-convert ./examples/webster1913-convert 15 | - curl -#Lo 
./webster1913/webster1913.txt http://www.gutenberg.org/ebooks/29765.txt.utf-8 16 | - curl -#Lo - https://github.com/wjdp/gotdict/archive/6b4d6cdbb1f5d899d418783ab842f487aafa79ec.tar.gz | tar -xzf - --strip-components=1 -C ./gotdict 17 | - ./bin/gotdict-convert -o ./gotdict/gotdict.df -g ./gotdict --images 18 | - ./bin/gotdict-convert -o ./gotdict/gotdict.noimg.df -g ./gotdict 19 | - ./bin/webster1913-convert -o ./webster1913/webster1913.df ./webster1913/webster1913.txt 20 | - ./bin/dictgen -Ibase64 -o ./gotdict/dicthtml-gt.zip ./gotdict/gotdict.df 21 | - ./bin/dictgen -Iremove -o ./gotdict/dicthtml-gt.noimg.zip ./gotdict/gotdict.noimg.df 22 | - ./bin/dictgen -Iremove -o ./webster1913/dicthtml-wb.zip ./webster1913/webster1913.df 23 | 24 | test_script: 25 | - go test -v -cover ./... 26 | - mkdir tmp 27 | - CGO_ENABLED=1 go build -o ./bin/dictutil ./cmd/dictutil 28 | - ./bin/dictutil u -o ./tmp/1 ./gotdict/dicthtml-gt.zip 29 | - ./bin/dictutil u -o ./tmp/2 ./gotdict/dicthtml-gt.noimg.zip 30 | - ./bin/dictutil u -o ./tmp/3 ./webster1913/dicthtml-wb.zip 31 | - ./bin/dictutil p -o ./tmp/1.zip ./tmp/1 32 | - ./bin/dictutil p -o ./tmp/2.zip ./tmp/2 33 | - ./bin/dictutil p -o ./tmp/3.zip ./tmp/3 34 | - sha1sum ./gotdict/dicthtml-gt.zip ./gotdict/dicthtml-gt.noimg.zip ./webster1913/dicthtml-wb.zip 35 | - sha1sum ./tmp/1.zip ./tmp/2.zip ./tmp/3.zip 36 | - cmp ./tmp/1.zip ./gotdict/dicthtml-gt.zip 37 | - cmp ./tmp/2.zip ./gotdict/dicthtml-gt.noimg.zip 38 | - cmp ./tmp/3.zip ./webster1913/dicthtml-wb.zip 39 | 40 | artifacts: 41 | - path: gotdict/gotdict.df 42 | - path: gotdict/gotdict.noimg.df 43 | - path: gotdict/dicthtml-gt.zip 44 | - path: gotdict/dicthtml-gt.noimg.zip 45 | - path: webster1913/webster1913.df 46 | - path: webster1913/dicthtml-wb.zip 47 | 48 | deploy: off 49 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | marisa/libmarisa.cc 
linguist-generated=true 2 | marisa/libmarisa.h linguist-generated=true -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | os: 2 | - osx 3 | 4 | language: go 5 | 6 | go: 7 | - 1.14.x 8 | 9 | env: 10 | GO111MODULE: "on" 11 | 12 | script: 13 | - go run -mod=readonly ./cmd/dictutil --help 14 | - go run -mod=readonly ./cmd/dictgen --help 15 | - go run -mod=readonly ./examples/dictzip-decompile --help 16 | - go run -mod=readonly ./examples/gotdict-convert --help 17 | - go run -mod=readonly ./examples/webster1913-convert --help 18 | - go test -mod=readonly -v ./... 19 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Patrick Gaskin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

dictutil

2 | 3 | [![](https://img.shields.io/github/v/release/pgaskin/dictutil?include_prereleases)](https://github.com/pgaskin/dictutil/releases) [![](https://img.shields.io/drone/build/pgaskin/dictutil/master)](https://cloud.drone.io/pgaskin/dictutil) [![](https://img.shields.io/badge/godoc-reference-blue.svg)](https://pkg.go.dev/mod/github.com/pgaskin/dictutil?tab=versions) [![](https://goreportcard.com/badge/github.com/pgaskin/dictutil)](https://goreportcard.com/report/github.com/pgaskin/dictutil) 4 | 5 | This repository contains a collection of tools and libraries to work with Kobo dictionaries, plus comprehensive documentation of Kobo's dictionary format. 6 | 7 | Unlike previous attempts at working with Kobo dictionaries, dictutil has full support for all features supported by nickel (word prefixes, unicode, variants, images, etc), with a focus on simplicity, correctness (prefix generation and other features are directly tested against libnickel's code and regexps, v1/v2 dictionaries are differentiated), and completeness (most of the research was done by reverse-engineering libnickel). 8 | 9 | Dictutil consists of multiple tools and libraries: 10 | - [**dictutil**](https://pgaskin.net/dictutil/dictutil/) provides commands for installing, removing, unpacking, packing, and performing low-level modifications and tests on Kobo dictionaries. All operations are intended to be correct, lossless, and deterministic. 11 | - [**dictgen**](https://pgaskin.net/dictutil/dictgen/) simplifies creating full-featured dictionaries for Kobo eReaders, with support for images, unicode prefixes, raw html, markdown, and more. 12 | - [**dicthtml**](https://pgaskin.net/dictutil/dicthtml/) documents Kobo's dictionary format and how it works. 13 | - [**examples/gotdict-convert**](https://pgaskin.net/dictutil/examples/gotdict-convert.html) is a working example of using dictutil to convert [GOTDict](https://github.com/wjdp/gotdict) into a Kobo dictionary. 
14 | - [**examples/webster1913-convert**](https://pgaskin.net/dictutil/examples/webster1913-convert.html) is a working example of using dictutil to convert [Project Gutenberg's Webster's Unabridged Dictionary](http://www.gutenberg.org/ebooks/29765.txt.utf-8) into a Kobo dictionary. 15 | - [**examples/dictzip-decompile**](https://pgaskin.net/dictutil/examples/dictzip-decompile.html) is an **experimental** tool to convert a dictzip into a dictfile. 16 | - [**examples/bgl-convert**](https://pgaskin.net/dictutil/examples/bgl-convert.html) is a simple tool to convert Babylon BGL dictionaries to a dictfile. 17 | - *Library:* [**kobodict**](https://pkg.go.dev/github.com/pgaskin/dictutil/kobodict) provides support for reading, writing, encrypting, and decrypting Kobo dictionaries. 18 | - *Library:* [**dictgen**](https://pkg.go.dev/github.com/pgaskin/dictutil/dictgen) provides the functionality of dictgen as a library. 19 | - *Library:* [**marisa**](./marisa) provides a simplified self-contained CGO wrapper for [marisa-trie](https://github.com/s-yata/marisa-trie). 20 | 21 | Dictutil implements [version 2](https://pgaskin.net/dictutil/dicthtml/v1v2.html) of the Kobo dictionary format, which supports firmware versions 4.7.10364+. 22 | 23 | For more information, see the [documentation](https://pgaskin.net/dictutil/). If you just want a quick overview of the utilities provided, continue reading below. 24 | 25 | ## Download 26 | - **Documentation** can be found on the [website](https://pgaskin.net/dictutil/). 27 | - **Tools** (dictutil, dictgen, gotdict-convert, webster1913-convert) can be downloaded from the [releases](https://github.com/pgaskin/dictutil/releases) page. 
28 | - **Pre-built dictionaries** from gotdict-convert and webster1913-convert can be downloaded from [AppVeyor](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts) or from the links below: 29 | - GOTDict *(with images, firmware 4.20.14601+)*: [dictzip (dicthtml-gt.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.df?branch=master&all=false&pr=false) 30 | - GOTDict *(without images)*: [dictzip (dicthtml-gt.noimg.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.noimg.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.noimg.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.noimg.df?branch=master&all=false&pr=false) 31 | - Webster's 1913 Dictionary: [dictzip (dicthtml-wb.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/dicthtml-wb.zip?branch=master&all=false&pr=false), [source dictfile (webster1913.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/webster1913.df?branch=master&all=false&pr=false) 32 | - **API documentation** for the Go libraries can be found on [pkg.go.dev](https://pkg.go.dev/github.com/pgaskin/dictutil). 33 | 34 | ## Usage 35 | See the [documentation](https://pgaskin.net/dictutil/) for more detailed information and examples. 36 | 37 | ### dictutil 38 | 39 | ``` 40 | Usage: dictutil command [options] [arguments] 41 | 42 | Dictutil provides low-level utilities to manipulate Kobo dictionaries (v2). 
43 | 44 | Commands: 45 | install (I) Install a dictzip file 46 | pack (p) Pack a dictzip file 47 | prefix (x) Calculate the prefix for a word 48 | uninstall (U) Uninstall a dictzip file 49 | unpack (u) Unpack a dictzip file 50 | help Show help for all commands 51 | 52 | Options: 53 | -h, --help Show this help text 54 | ``` 55 | 56 | ``` 57 | Usage: dictutil install [options] dictzip 58 | 59 | Options: 60 | -k, --kobo string KOBOeReader path (default: automatically detected) 61 | -l, --locale string Locale name to use (format: ALPHANUMERIC{2}[-ALPHANUMERIC{2}]) (default: detected from filename if in format dicthtml-**.zip) 62 | -n, --name string Custom additional label for dictionary (ignored when replacing built-in dictionaries) (doesn't have any effect on 4.20.14601+) 63 | -b, --builtin string How to handle built-in locales [replace = replace and prevent from syncing] [ignore = replace and leave syncing as-is] (doesn't have any effect on 4.24.15672+) (default "replace") 64 | -B, --no-custom Whether to force installation to .kobo/dict instead of .kobo/custom-dict (4.24.15672+ only) 65 | --use-extra-locales Whether to use ExtraLocales on 4.24.15672+ if not a built-in dictionary (this is not required anymore since 4.24.15672) (4.24.15672+ only) 66 | -h, --help Show this help text 67 | 68 | Note: 69 | If you are not replacing a built-in dictionary and are using a firmware 70 | version before 4.24.15672, the 'Enable searches on extra dictionaries patch' 71 | must be installed or you will not be able to select your custom dictionary. 
72 | ``` 73 | 74 | ``` 75 | Usage: dictutil uninstall [options] locale 76 | 77 | Options: 78 | -k, --kobo string KOBOeReader path (default: automatically detected) 79 | -b, --builtin string How to handle built-in locales [normal = uninstall the same way as the UI] [delete = completely delete the entry (doesn't have any effect on 4.20.14601+)] [restore = download the original dictionary from Kobo again] (doesn't have any effect on 4.24.15672+) (default "normal") 80 | -B, --no-custom Uninstall built-in dictionaries instead of custom ones on 4.24.15672+ 81 | -h, --help Show this help text 82 | ``` 83 | 84 | ``` 85 | Usage: dictutil pack [options] dictdir 86 | 87 | Options: 88 | -o, --output string The output dictzip filename (will be overwritten if it exists) (default "dicthtml.zip") 89 | -c, --crypt string Encrypt the dictzip using the specified encryption method (format: method:keyhex) 90 | -h, --help Show this help text 91 | ``` 92 | 93 | ``` 94 | Usage: dictutil unpack [options] dictzip 95 | 96 | Options: 97 | -o, --output string The output directory (must not exist) (default: the basename of the input without the extension) 98 | -c, --crypt string Decrypt the dictzip (if needed) using the specified encryption method (format: method:keyhex) 99 | -h, --help Show this help text 100 | ``` 101 | 102 | ``` 103 | Usage: dictutil prefix [options] word... 104 | 105 | Options: 106 | -f, --format string The output format (go-slice, go-map, csv, tsv, json-array, json-object) (default "json-array") 107 | -h, --help Show this help text 108 | ``` 109 | 110 | ### dictgen 111 | 112 | ``` 113 | Usage: dictgen [options] dictfile... 
114 | 115 | Options: 116 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "dicthtml.zip") 117 | -c, --crypt string Encrypt the dictzip using the specified encryption method (format: method:keyhex) 118 | -I, --image-method string How to handle images (if an image path is relative, it is loaded from the current dir) (base64 - optimize and encode as base64, embed - add to dictzip, remove) (default "base64") 119 | --remove-footer Add code to prevent the non-applicable dictionary source footer for certain locales from being added after the entry (e.g. if replacing the French dictionary) 120 | -h, --help Show this help text 121 | 122 | If multiple dictfiles (*.df) are provided, they will be merged (duplicate entries are fine; they will be shown in sequential order). To read from stdin, use - as the filename. 123 | 124 | Note that the only usable image method is currently removing them or using base64-encoding (for firmware 4.20.14601+; older versions segfault in the in-book dictionary), as embedded dict:/// image URLs cause the webviews to appear blank (this is a nickel bug). See https://github.com/pgaskin/dictutil/issues/1 for more details. 125 | 126 | See https://pgaskin.net/dictutil/dictgen for more information about the dictfile format. 127 | ``` 128 | 129 | **See [here](https://pgaskin.net/dictutil/dictgen/) for information and examples of the dictfile format.** 130 | 131 | ### gotdict-convert 132 | 133 | ``` 134 | Usage: gotdict-convert [options] 135 | 136 | Options: 137 | -g, --gotdict string The path to the local copy of github.com/wjdp/gotdict. (default "./gotdict") 138 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./gotdict.df") 139 | -I, --images Include images in dictfile 140 | -h, --help Show this help text 141 | 142 | To convert the resulting dictfile into a dictzip, use dictgen. 
143 | ``` 144 | 145 | ### webster1913-convert 146 | 147 | ``` 148 | Usage: webster1913-convert [options] gutenberg_webster1913_path 149 | 150 | Options: 151 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./webster1913.df") 152 | --dump Instead of converting, dump the parsed dictionary to stdout as JSON (for debugging) 153 | -h, --help Show this help text 154 | 155 | Arguments: 156 | gutenberg_webster1913_path is the path to Project Gutenberg's Webster's 1913 dictionary. Use - to read from stdin. 157 | 158 | To convert the resulting dictfile into a dictzip, use dictgen. 159 | ``` 160 | 161 | The original dictionary can be downloaded [here](http://www.gutenberg.org/ebooks/29765.txt.utf-8) or [here](https://github.com/pgaskin/dictserver/raw/master/data/dictionary.txt). 162 | 163 | ### dictzip-decompile 164 | 165 | ``` 166 | Usage: dictzip-decompile [options] dictzip 167 | 168 | Options: 169 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./decompiled.df") 170 | -r, --resources Also extract referenced resources to the current directory (warning: any existing files will be overwritten, so it is recommended to run in an empty directory if enabled) 171 | -h, --help Show this help text 172 | 173 | Arguments: 174 | dictzip is the path to the dictzip to decompile. 175 | 176 | To convert the resulting dictfile into a dictzip, use dictgen. 177 | 178 | Note: The regenerated dictzip from the dictfile may not match exactly, but it will look the same, and certain bugs with prefixes and variants will be implicitly fixed by the conversion process (i.e. variant in wrong file, incorrect prefix, missing words in index file). All output is in raw HTML, not Markdown. 179 | 180 | This is an experimental tool, and the output may not be perfect on complex dictionaries. 
181 | ``` 182 | -------------------------------------------------------------------------------- /cmd/dictgen/main.go: -------------------------------------------------------------------------------- 1 | // Command dictgen is a CLI wrapper around package dictgen. 2 | package main 3 | 4 | import ( 5 | "encoding/hex" 6 | "fmt" 7 | "io" 8 | "os" 9 | "strings" 10 | 11 | _ "image/gif" 12 | _ "image/jpeg" 13 | _ "image/png" 14 | 15 | "github.com/pgaskin/dictutil/dictgen" 16 | "github.com/pgaskin/dictutil/kobodict" 17 | "github.com/spf13/pflag" 18 | 19 | _ "github.com/pgaskin/dictutil/kobodict/marisa" 20 | ) 21 | 22 | var version = "dev" 23 | 24 | func main() { 25 | pflag.CommandLine.SortFlags = false 26 | output := pflag.StringP("output", "o", "dicthtml.zip", "The output filename (will be overwritten if it exists) (- is stdout)") 27 | crypt := pflag.StringP("crypt", "c", "", "Encrypt the dictzip using the specified encryption method (format: method:keyhex)") 28 | imageMethod := pflag.StringP("image-method", "I", "base64", "How to handle images (if an image path is relative, it is loaded from the current dir) (base64 - optimize and encode as base64, embed - add to dictzip, remove)") 29 | removeFooter := pflag.Bool("remove-footer", false, "Add code to prevent the non-applicable dictionary source footer for certain locales from being added after the entry (e.g. if replacing the French dictionary)") 30 | help := pflag.BoolP("help", "h", false, "Show this help text") 31 | pflag.Parse() 32 | 33 | if *help || pflag.NArg() == 0 { 34 | fmt.Fprintf(os.Stderr, "Usage: %s [options] dictfile...\n\nVersion: dictgen %s\n\nOptions:\n%s\nIf multiple dictfiles (*.df) are provided, they will be merged (duplicate entries are fine; they will be shown in sequential order). 
To read from stdin, use - as the filename.\n\nNote that currently, the only usable image method is removing them or using base64-encoding (for firmware 4.20.14601+; older versions segfault in the in-book dictionary if images are enabled), as embedded dict:/// image URLs cause the webviews to appear blank (this is a nickel bug). See https://github.com/pgaskin/dictutil/issues/1 for more details.\n\nSee https://pgaskin.net/dictutil/dictgen for more information about the dictfile format.\n", os.Args[0], version, pflag.CommandLine.FlagUsages()) 35 | os.Exit(0) 36 | return 37 | } 38 | 39 | var e kobodict.Crypter 40 | if *crypt != "" { 41 | if spl := strings.SplitN(*crypt, ":", 2); len(spl) < 2 { 42 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: no ':' found.\n") 43 | os.Exit(2) 44 | return 45 | } else if key, err := hex.DecodeString(spl[1]); err != nil { 46 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: decode hex: %v.\n", err) 47 | os.Exit(2) 48 | return 49 | } else if enc, err := kobodict.NewCrypter(spl[0], key); err != nil { 50 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: initialize encrypter: %v.\n", err) 51 | os.Exit(2) 52 | return 53 | } else { 54 | e = enc 55 | } 56 | } 57 | 58 | var ih dictgen.ImageHandler 59 | switch *imageMethod { 60 | case "base64": 61 | ih = new(dictgen.ImageHandlerBase64) 62 | case "embed": 63 | ih = new(dictgen.ImageHandlerEmbed) 64 | case "remove": 65 | ih = new(dictgen.ImageHandlerRemove) 66 | default: 67 | fmt.Fprintf(os.Stderr, "Error: invalid value for --image-method, see --help for details.") 68 | os.Exit(2) 69 | return 70 | } 71 | 72 | var tdf dictgen.DictFile 73 | 74 | fmt.Fprintf(os.Stderr, "Parsing dictfiles.\n") 75 | var seenStdin bool 76 | for _, fn := range pflag.Args() { 77 | if fn == "-" { 78 | if seenStdin { 79 | fmt.Fprintf(os.Stderr, "Error: stdin can only be specified once.\n") 80 | os.Exit(1) 81 | return 82 | } 83 | seenStdin = true 84 | } 85 | 86 | if err := func() error 
{ 87 | var fr io.Reader 88 | if fn == "-" { 89 | fr = os.Stdin 90 | } else { 91 | f, err := os.OpenFile(fn, os.O_RDONLY, 0) 92 | if err != nil { 93 | return err 94 | } 95 | defer f.Close() 96 | fr = f 97 | } 98 | 99 | if df, err := dictgen.ParseDictFile(fr); err != nil { 100 | return err 101 | } else if err := df.Validate(); err != nil { 102 | return err 103 | } else { 104 | tdf = append(tdf, df...) 105 | } 106 | 107 | return nil 108 | }(); err != nil { 109 | fmt.Fprintf(os.Stderr, "Error: input %#v: %v.\n", fn, err) 110 | os.Exit(1) 111 | return 112 | } 113 | } 114 | 115 | if *removeFooter { 116 | fmt.Fprintf(os.Stderr, "Appending HTML code to remove entry footers (note: you don't need this and should not use it unless you are replacing a dictionary which adds it, such as the French one).\n") 117 | for _, dfe := range tdf { 118 | dfe.PostRawHTML += `` 119 | } 120 | } 121 | 122 | fmt.Fprintf(os.Stderr, "Opening output.\n") 123 | var f io.WriteCloser 124 | switch *output { 125 | case "-": 126 | f = os.Stdout 127 | default: 128 | ff, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 129 | if err != nil { 130 | fmt.Fprintf(os.Stderr, "Error: create dictzip: %v\n", err) 131 | os.Exit(1) 132 | return 133 | } 134 | f = ff 135 | } 136 | 137 | fmt.Fprintf(os.Stderr, "Generating dictzip.\n") 138 | dw := kobodict.NewWriter(f) 139 | dw.SetEncrypter(e) 140 | if e != nil { 141 | fmt.Fprintf(os.Stderr, " Using encryption.\n") 142 | } 143 | if ih != nil { 144 | fmt.Fprintf(os.Stderr, " Using image method: %s.\n", ih.Description()) 145 | } 146 | if err := tdf.WriteDictzip(dw, ih, dictgen.ImageFuncFilesystem); err != nil { 147 | f.Close() 148 | fmt.Fprintf(os.Stderr, "Error: write dictzip: %v\n", err) 149 | os.Exit(1) 150 | return 151 | } else if err := dw.Close(); err != nil { 152 | f.Close() 153 | fmt.Fprintf(os.Stderr, "Error: write dictzip: %v\n", err) 154 | os.Exit(1) 155 | return 156 | } else if err := f.Close(); err != nil { 157 | fmt.Fprintf(os.Stderr, 
"Error: write dictzip: %v\n", err) 158 | os.Exit(1) 159 | return 160 | } 161 | 162 | fmt.Fprintf(os.Stderr, "Successfully wrote %d entries from %d dictfile(s) to dictzip %s.\n", len(tdf), pflag.NArg(), *output) 163 | os.Exit(0) 164 | } 165 | -------------------------------------------------------------------------------- /cmd/dictutil/main.go: -------------------------------------------------------------------------------- 1 | // Command dictutil provides commands for installing, removing, unpacking, 2 | // packing, and performing low-level modifications and tests on Kobo 3 | // dictionaries. 4 | package main 5 | 6 | import ( 7 | "fmt" 8 | "os" 9 | "sort" 10 | 11 | "github.com/spf13/pflag" 12 | 13 | _ "github.com/pgaskin/dictutil/kobodict/marisa" 14 | ) 15 | 16 | var version = "dev" 17 | 18 | var commands []*command 19 | 20 | type command struct { 21 | Name string 22 | Short string 23 | Description string 24 | Main func(args []string, fs *pflag.FlagSet) int 25 | } 26 | 27 | func main() { 28 | sort.Slice(commands, func(i, j int) bool { 29 | return commands[i].Name < commands[j].Name 30 | }) 31 | 32 | cmdMap := map[string]*command{} 33 | for _, cmd := range commands { 34 | for _, v := range []string{cmd.Name, cmd.Short} { 35 | if _, seen := cmdMap[v]; seen { 36 | panic("command already set: " + v) 37 | } 38 | cmdMap[v] = cmd 39 | } 40 | } 41 | 42 | if len(os.Args) < 2 { 43 | globalHelp() 44 | os.Exit(0) 45 | } 46 | 47 | if os.Args[1] == "help" { 48 | globalHelp() 49 | for _, cmd := range commands { 50 | fmt.Printf("\n### Help for %s:\n\n", cmd.Name) 51 | z := os.Args[0] + " " + cmd.Name 52 | cmd.Main([]string{z, "--help"}, pflag.NewFlagSet(z, pflag.ExitOnError)) 53 | } 54 | } else if cmd, ok := cmdMap[os.Args[1]]; !ok { 55 | globalHelp() 56 | os.Exit(0) 57 | } else { 58 | args := append([]string{os.Args[0] + " " + os.Args[1]}, os.Args[2:]...) 
59 | fs := pflag.NewFlagSet(args[0], pflag.ExitOnError) 60 | os.Exit(cmd.Main(args, fs)) 61 | } 62 | } 63 | 64 | func globalHelp() { 65 | fmt.Fprintf(os.Stderr, "Usage: %s command [options] [arguments]\n\nDictutil provides low-level utilities to manipulate Kobo dictionaries (v2).\n\nVersion: dictutil %s\n\nCommands:\n", os.Args[0], version) 66 | for _, cmd := range commands { 67 | fmt.Fprintf(os.Stderr, " %-20s %s\n", fmt.Sprintf("%s (%s)", cmd.Name, cmd.Short), cmd.Description) 68 | } 69 | fmt.Fprintf(os.Stderr, " %-20s %s\n", "help", "Show help for all commands") 70 | fmt.Fprintf(os.Stderr, "\nOptions:\n -h, --help Show this help text\n") 71 | } 72 | -------------------------------------------------------------------------------- /cmd/dictutil/pack.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/hex" 5 | "fmt" 6 | "io/ioutil" 7 | "os" 8 | "path/filepath" 9 | "runtime" 10 | "strings" 11 | 12 | "github.com/pgaskin/dictutil/kobodict" 13 | "github.com/spf13/pflag" 14 | ) 15 | 16 | func init() { 17 | commands = append(commands, &command{Name: "pack", Short: "p", Description: "Pack a dictzip file", Main: packMain}) 18 | } 19 | 20 | func packMain(args []string, fs *pflag.FlagSet) int { 21 | fs.SortFlags = false 22 | output := fs.StringP("output", "o", "dicthtml.zip", "The output dictzip filename (will be overwritten if it exists)") 23 | crypt := fs.StringP("crypt", "c", "", "Encrypt the dictzip using the specified encryption method (format: method:keyhex)") 24 | help := fs.BoolP("help", "h", false, "Show this help text") 25 | fs.Parse(args[1:]) 26 | 27 | if *help || fs.NArg() != 1 { 28 | fmt.Fprintf(os.Stderr, "Usage: %s [options] dictdir\n\nOptions:\n%s", args[0], fs.FlagUsages()) 29 | return 0 30 | } 31 | 32 | var c kobodict.Crypter 33 | if *crypt != "" { 34 | if spl := strings.SplitN(*crypt, ":", 2); len(spl) < 2 { 35 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: no 
':' found.\n") 36 | return 2 37 | } else if key, err := hex.DecodeString(spl[1]); err != nil { 38 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: decode hex: %v.\n", err) 39 | return 2 40 | } else if enc, err := kobodict.NewCrypter(spl[0], key); err != nil { 41 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: initialize encrypter: %v.\n", err) 42 | return 2 43 | } else { 44 | c = enc 45 | } 46 | } 47 | 48 | fn, err := filepath.Abs(fs.Args()[0]) 49 | if err != nil { 50 | fmt.Fprintf(os.Stderr, "Error: resolve input path %#v: %v.\n", fs.Args()[0], err) 51 | return 2 52 | } 53 | 54 | ofn, err := filepath.Abs(*output) 55 | if err != nil { 56 | fmt.Fprintf(os.Stderr, "Error: resolve output path %#v: %v.\n", *output, err) 57 | return 2 58 | } 59 | 60 | if fi, err := os.Stat(fn); err != nil { 61 | fmt.Fprintf(os.Stderr, "Error: inaccessible input dir %#v: %v.\n", fn, err) 62 | return 2 63 | } else if !fi.IsDir() { 64 | fmt.Fprintf(os.Stderr, "Error: input %#v is not a dir.\n", fn) 65 | return 2 66 | } 67 | 68 | fmt.Printf("Creating output temp file\n") 69 | f, err := ioutil.TempFile(filepath.Dir(ofn), "tmp_dicthtml.*.zip") 70 | if err != nil { 71 | fmt.Fprintf(os.Stderr, "Error: create output temp file: %v.\n", err) 72 | return 2 73 | } 74 | defer os.Remove(f.Name()) 75 | defer f.Close() 76 | 77 | fmt.Printf("Packing dictzip.\n") 78 | dw := kobodict.NewWriter(f) 79 | defer dw.Close() 80 | 81 | dw.SetEncrypter(c) 82 | 83 | if err := kobodict.Pack(dw, fn); err != nil { 84 | fmt.Fprintf(os.Stderr, "Error: pack input dir %#v to %#v: %v.\n", fn, ofn, err) 85 | return 1 86 | } 87 | 88 | if err := dw.Close(); err != nil { 89 | fmt.Fprintf(os.Stderr, "Error: pack input dir %#v to %#v: %v.\n", fn, ofn, err) 90 | return 1 91 | } 92 | 93 | fmt.Printf("Renaming output file.\n") 94 | if err := f.Chmod(0644); err != nil && runtime.GOOS != "windows" { 95 | fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err) 96 | return 2 97 | } 98 | if err := 
f.Sync(); err != nil { 99 | fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err) 100 | return 2 101 | } 102 | if err := f.Close(); err != nil { 103 | fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err) 104 | return 2 105 | } 106 | if err := os.Rename(f.Name(), ofn); err != nil { // this will replace existing files properly on Go1.5+ 107 | fmt.Fprintf(os.Stderr, "Error: rename output file: %v.\n", err) 108 | return 2 109 | } 110 | 111 | fmt.Printf("Successfully packed dictdir %#v to dictzip %#v.\n", fn, ofn) 112 | return 0 113 | } 114 | -------------------------------------------------------------------------------- /cmd/dictutil/prefix.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/pgaskin/dictutil/kobodict" 8 | "github.com/spf13/pflag" 9 | ) 10 | 11 | func init() { 12 | commands = append(commands, &command{Name: "prefix", Short: "x", Description: "Calculate the prefix for a word", Main: prefixMain}) 13 | } 14 | 15 | func prefixMain(args []string, fs *pflag.FlagSet) int { 16 | fs.SortFlags = false 17 | format := fs.StringP("format", "f", "json-array", "The output format (go-slice, go-map, csv, tsv, json-array, json-object)") 18 | help := fs.BoolP("help", "h", false, "Show this help text") 19 | fs.Parse(args[1:]) 20 | 21 | if *help || fs.NArg() == 0 { 22 | fmt.Fprintf(os.Stderr, "Usage: %s [options] word...\n\nOptions:\n%s", args[0], fs.FlagUsages()) 23 | return 0 24 | } 25 | 26 | if *format != "go-slice" && *format != "go-map" && *format != "csv" && *format != "tsv" && *format != "json-array" && *format != "json-object" { 27 | fmt.Fprintf(os.Stderr, "Error: invalid format %#v, see --help for more details.\n", *format) 28 | return 2 29 | } 30 | 31 | switch *format { 32 | case "go-slice": 33 | fmt.Printf("[][]string{\n") 34 | case "go-map": 35 | fmt.Printf("map[string]string{\n") 36 | case "csv", "tsv": 37 | break 38 | case 
"json-array": 39 | fmt.Printf("[\n") 40 | case "json-object": 41 | fmt.Printf("{\n") 42 | default: 43 | panic("invalid output format") 44 | } 45 | 46 | for i, word := range fs.Args() { 47 | prefix := kobodict.WordPrefix(word) 48 | last := i == fs.NArg()-1 49 | 50 | switch *format { 51 | case "go-slice": 52 | fmt.Printf("\t{%#v, %#v},\n", word, prefix) 53 | case "go-map": 54 | fmt.Printf("\t%#v: %#v,\n", word, prefix) 55 | case "csv": 56 | fmt.Printf("%s,%s\n", word, prefix) 57 | case "tsv": 58 | fmt.Printf("%s\t%s\n", word, prefix) 59 | case "json-array": 60 | fmt.Printf(" [%#v, %#v]", word, prefix) 61 | if last { 62 | fmt.Printf("\n") 63 | } else { 64 | fmt.Printf(",\n") 65 | } 66 | case "json-object": 67 | fmt.Printf(" %#v: %#v", word, prefix) 68 | if last { 69 | fmt.Printf("\n") 70 | } else { 71 | fmt.Printf(",\n") 72 | } 73 | default: 74 | panic("invalid output format") 75 | } 76 | } 77 | 78 | switch *format { 79 | case "csv", "tsv": 80 | break 81 | case "json-array": 82 | fmt.Printf("]\n") 83 | case "json-object", "go-slice", "go-map": 84 | fmt.Printf("}\n") 85 | default: 86 | panic("invalid output format") 87 | } 88 | 89 | return 0 90 | } 91 | -------------------------------------------------------------------------------- /cmd/dictutil/uninstall.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "database/sql" 7 | "fmt" 8 | "io" 9 | "net/http" 10 | "os" 11 | "path/filepath" 12 | "regexp" 13 | "sort" 14 | "strings" 15 | 16 | "github.com/pgaskin/koboutils/v2/kobo" 17 | "github.com/spf13/pflag" 18 | ) 19 | 20 | func init() { 21 | commands = append(commands, &command{Name: "uninstall", Short: "U", Description: "Uninstall a dictzip file", Main: uninstallMain}) 22 | } 23 | 24 | func uninstallMain(args []string, fs *pflag.FlagSet) int { 25 | fs.SortFlags = false 26 | root := fs.StringP("kobo", "k", "", "KOBOeReader path (default: automatically detected)") 27 | builtin := 
fs.StringP("builtin", "b", "normal", "How to handle built-in locales [normal = uninstall the same way as the UI] [delete = completely delete the entry (doesn't have any effect on 4.20.14601+)] [restore = download the original dictionary from Kobo again] (doesn't have any effect on 4.24.15672+)") 28 | noCustom := fs.BoolP("no-custom", "B", false, "Uninstall built-in dictionaries instead of custom ones on 4.24.15672+") 29 | help := fs.BoolP("help", "h", false, "Show this help text") 30 | fs.Parse(args[1:]) 31 | 32 | if *help || fs.NArg() != 1 { 33 | fmt.Fprintf(os.Stderr, "Usage: %s [options] locale|dicthtml-name.zip\n\nOptions:\n%s\n", args[0], fs.FlagUsages()) 34 | builtinHelp() 35 | return 0 36 | } 37 | 38 | if *builtin != "normal" && *builtin != "delete" && *builtin != "restore" { 39 | fmt.Fprintf(os.Stderr, "Error: invalid built-in dictionary mode %#v, see --help for more details.\n", *builtin) 40 | return 2 41 | } 42 | 43 | kobopath, version, err := findDevice(*root) 44 | if err != nil { 45 | fmt.Fprintf(os.Stderr, "Error: could not detect a Kobo eReader (you can specify one manually with --kobo): %v.\n", err) 46 | return 1 47 | } 48 | 49 | fmt.Printf("Found Kobo eReader at %s with firmware version %s.\n", kobopath, version) 50 | if kobo.VersionCompare(version, "4.7.10364") < 0 { 51 | fmt.Fprintf(os.Stderr, "Error: firmware version too old (v2 dictionaries were only introduced in 4.7.10364).\n") 52 | return 1 53 | } 54 | 55 | fw14601 := kobo.VersionCompare(version, "4.20.14601") >= 0 // https://github.com/pgaskin/kobopatch-patches/issues/49 56 | fw15672 := kobo.VersionCompare(version, "4.24.15672") >= 0 // https://github.com/pgaskin/kobopatch-patches/issues/76 57 | 58 | var dictPath, dictLocale string 59 | if dictLocale = strings.TrimLeft(fs.Args()[0], "-"); dictLocale == "en" { 60 | if fw15672 && !*noCustom { 61 | dictPath = filepath.Join(kobopath, ".kobo", "custom-dict", "dicthtml.zip") 62 | } else { 63 | dictPath = filepath.Join(kobopath, ".kobo", "dict", 
"dicthtml.zip") 64 | } 65 | } else if regexp.MustCompile(`^[a-zA-Z0-9-]+$`).MatchString(dictLocale) { 66 | if fw15672 && !*noCustom { 67 | dictPath = filepath.Join(kobopath, ".kobo", "custom-dict", "dicthtml-"+dictLocale+".zip") 68 | } else { 69 | dictPath = filepath.Join(kobopath, ".kobo", "dict", "dicthtml-"+dictLocale+".zip") 70 | } 71 | } else { 72 | fmt.Fprintf(os.Stderr, "Error: invalid locale name.\n") 73 | return 1 74 | } 75 | dictSuffix := "-" + dictLocale 76 | _, dictBuiltin := builtinDict[dictLocale] 77 | 78 | fmt.Printf("Uninstalling dictionary %#v (locale: %s).\n\n", dictPath, dictLocale) 79 | 80 | fmt.Printf("Updating database.\n") 81 | if fw15672 { 82 | // We won't bother to check the DB anymore since it's been a while since 83 | // 4.20.14601, and everyone who would be confused by the dictionary 84 | // table probaby would have already seen the message. 85 | fmt.Printf(" No need to update dictionary table on 4.24.15672+, skipping.\n") 86 | } else { 87 | if err := func() error { 88 | db, err := sql.Open("sqlite3", filepath.Join(kobopath, ".kobo", "KoboReader.sqlite")) 89 | if err != nil { 90 | return fmt.Errorf("open database: %w", err) 91 | } 92 | defer db.Close() 93 | 94 | if exists, err := func() (bool, error) { 95 | res, err := db.Query(`SELECT name FROM sqlite_master WHERE type="table" AND name="Dictionary";`) 96 | if err != nil { 97 | return false, fmt.Errorf("check dictionary table: %w", err) 98 | } 99 | defer res.Close() 100 | 101 | if !res.Next() { // if no rows are returned, there was an error or the table didn't exist 102 | if err := res.Err(); err != nil { 103 | return false, fmt.Errorf("check dictionary table: %w", err) 104 | } 105 | return false, nil 106 | } 107 | return true, nil 108 | }(); err != nil { 109 | return fmt.Errorf("check dictionary table: %w", err) 110 | } else if exists { 111 | if fw14601 { 112 | fmt.Printf(" Note: the dictionary table is unnecessary and inconsequential in firmware 4.20.14601+ and can be safely 
removed.\n") 113 | } 114 | } else { 115 | if fw14601 { 116 | // show a message to prevent confusion 117 | fmt.Printf(" No need to update dictionary table on 4.20.14601+, skipping.\n") 118 | return nil 119 | } else { 120 | return fmt.Errorf("check dictionary table: not found, and version < 4.20.14123") 121 | } 122 | } 123 | 124 | if !dictBuiltin || *builtin == "delete" { 125 | if res, err := db.Exec("DELETE FROM Dictionary WHERE Suffix = ?", dictSuffix); err != nil { 126 | return fmt.Errorf("delete row from database: %w", err) 127 | } else if ra, _ := res.RowsAffected(); ra == 0 { 128 | fmt.Printf(" Row already removed from database (suffix=%s).\n", dictSuffix) 129 | } else { 130 | fmt.Printf(" Removed row from database (suffix=%s).\n", dictSuffix) 131 | } 132 | } 133 | 134 | if dictBuiltin && *builtin == "normal" { 135 | if _, err := db.Exec("UPDATE Dictionary SET Installed = ? WHERE Suffix = ?", "false", dictSuffix); err != nil { 136 | return fmt.Errorf("update row in database: %w", err) 137 | } else { 138 | fmt.Printf(" Set IsInstalled to false in database for built-in dictionary (suffix=%s).\n", dictSuffix) 139 | } 140 | } 141 | 142 | if dictBuiltin && *builtin == "restore" { 143 | if _, err := db.Exec("UPDATE Dictionary SET Installed = ? 
WHERE Suffix = ?", "true", dictSuffix); err != nil { 144 | return fmt.Errorf("update row in database: %w", err) 145 | } else { 146 | fmt.Printf(" Set IsInstalled to true in database for built-in dictionary (suffix=%s).\n", dictSuffix) 147 | } 148 | } 149 | 150 | if err := db.Close(); err != nil { 151 | return fmt.Errorf("close database: %w", err) 152 | } 153 | 154 | return nil 155 | }(); err != nil { 156 | fmt.Fprintf(os.Stderr, "Error: update database: %v.\n", err) 157 | return 1 158 | } 159 | } 160 | 161 | fmt.Printf("Updating ExtraLocales.\n") 162 | if dictBuiltin { 163 | fmt.Printf(" No need; built-in dictionary.\n") 164 | } else { 165 | if err := func() error { 166 | cfg := filepath.Join(kobopath, ".kobo", "Kobo", "Kobo eReader.conf") 167 | 168 | f, err := os.OpenFile(cfg, os.O_RDONLY, 0) 169 | if err != nil { 170 | return fmt.Errorf("open config file: %w", err) 171 | } 172 | defer f.Close() 173 | 174 | var locales []string 175 | var filtered bool 176 | buf := bytes.NewBuffer(nil) 177 | 178 | fs := bufio.NewScanner(f) 179 | for fs.Scan() { 180 | if bytes.HasPrefix(fs.Bytes(), []byte("ExtraLocales=")) { 181 | for _, loc := range strings.Split(strings.SplitN(fs.Text(), "=", 2)[1], ",") { 182 | loc = strings.TrimSpace(loc) 183 | if loc == dictLocale { 184 | filtered = true 185 | } else { 186 | locales = append(locales, loc) 187 | } 188 | } 189 | continue 190 | } 191 | _, _ = buf.Write(fs.Bytes()) // err is always nil 192 | buf.WriteRune('\n') 193 | } 194 | 195 | if !filtered { 196 | fmt.Printf(" Locale %#v already removed from ExtraLocales (or wasn't there to begin with).\n", dictLocale) 197 | return nil 198 | } 199 | 200 | fmt.Printf(" Removing locale %#v from ExtraLocales.\n", dictLocale) 201 | sort.Strings(locales) 202 | 203 | buf.WriteString("\n[ApplicationPreferences]\n") // this will get merged by Qt 204 | buf.WriteString("ExtraLocales=" + strings.Join(locales, ",")) 205 | 206 | f.Close() 207 | 208 | fo, err := os.OpenFile(cfg+".tmp", 
os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) 209 | if err != nil { 210 | return fmt.Errorf("open new config file: %w", err) 211 | } 212 | defer os.Remove(cfg + ".tmp") 213 | defer fo.Close() 214 | 215 | if _, err := fo.Write(buf.Bytes()); err != nil { 216 | return fmt.Errorf("write new config file: %w", err) 217 | } 218 | 219 | if err := fo.Sync(); err != nil { 220 | return fmt.Errorf("write new config file: %w", err) 221 | } 222 | 223 | if err := fo.Close(); err != nil { 224 | return fmt.Errorf("write new config file: %w", err) 225 | } 226 | 227 | if err := os.Rename(cfg+".tmp", cfg); err != nil { 228 | return fmt.Errorf("rename new config file: %w", err) 229 | } 230 | 231 | return nil 232 | }(); err != nil { 233 | fmt.Fprintf(os.Stderr, "Error: update ExtraLocales: %v.\n", err) 234 | return 1 235 | } 236 | } 237 | 238 | fmt.Printf("Removing dictzip.\n") 239 | if err := os.Remove(dictPath); os.IsNotExist(err) { // this will still remove it if it's readonly on Windows (golang/go@2ffb3e5d905b5622204d199128dec06cefd57790) 240 | fmt.Printf(" Already removed.\n") 241 | } else if err != nil { 242 | fmt.Fprintf(os.Stderr, "Error: remove dictzip: %v.\n", err) 243 | return 1 244 | } else { 245 | fmt.Printf(" Removed.\n") 246 | } 247 | 248 | if *builtin == "restore" { 249 | // TODO: reconsider whether this belongs in uninstall, as: 250 | // - This doesn't update the file size. 251 | // - This doesn't ensure there is actually a DB entry for the restored 252 | // dict. 253 | // - This isn't really uninstalling. 
254 | // - It might not even belong in dictutil at all because the URLs may 255 | // change (and it isn't that hard to manually download a dictionary 256 | // to install it with dictutil install) 257 | 258 | url := "https://kbdownload1-a.akamaihd.net/ereader/dictionaries/v2/" 259 | if fw15672 { 260 | url = "https://kbdownload1-a.akamaihd.net/ereader/dictionaries/v3/" 261 | } 262 | url += filepath.Base(dictPath) 263 | 264 | fmt.Printf("Restoring original dictionary from %#v.\n", url) 265 | 266 | if err := func() error { 267 | resp, err := http.Get(url) 268 | if err != nil { 269 | return fmt.Errorf("get dictionary: %w", err) 270 | } 271 | defer resp.Body.Close() 272 | 273 | if resp.StatusCode != http.StatusOK { 274 | return fmt.Errorf("get dictionary: response status %s", resp.Status) 275 | } 276 | 277 | df, err := os.OpenFile(dictPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 278 | if err != nil { 279 | return fmt.Errorf("open output dictzip: %w", err) 280 | } 281 | defer df.Close() 282 | 283 | if _, err := io.Copy(df, resp.Body); err != nil { 284 | return fmt.Errorf("write output dictzip: %w", err) 285 | } 286 | 287 | if err := df.Close(); err != nil { 288 | return fmt.Errorf("write output dictzip: %w", err) 289 | } 290 | 291 | return nil 292 | }(); err != nil { 293 | fmt.Fprintf(os.Stderr, "Error: download dictionary: %v.\n", err) 294 | return 1 295 | } 296 | } 297 | 298 | fmt.Printf("\nSuccessfully uninstalled dictionary for locale %s.\n", dictLocale) 299 | 300 | return 0 301 | } 302 | -------------------------------------------------------------------------------- /cmd/dictutil/unpack.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/hex" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | "strings" 9 | 10 | "github.com/pgaskin/dictutil/kobodict" 11 | "github.com/spf13/pflag" 12 | ) 13 | 14 | func init() { 15 | commands = append(commands, &command{Name: "unpack", Short: "u", Description: 
"Unpack a dictzip file", Main: unpackMain}) 16 | } 17 | 18 | func unpackMain(args []string, fs *pflag.FlagSet) int { 19 | fs.SortFlags = false 20 | output := fs.StringP("output", "o", "", "The output directory (must not exist) (default: the basename of the input without the extension)") 21 | crypt := fs.StringP("crypt", "c", "", "Decrypt the dictzip (if needed) using the specified encryption method (format: method:keyhex)") 22 | help := fs.BoolP("help", "h", false, "Show this help text") 23 | fs.Parse(args[1:]) 24 | 25 | if *help || fs.NArg() != 1 { 26 | fmt.Fprintf(os.Stderr, "Usage: %s [options] dictzip\n\nOptions:\n%s", args[0], fs.FlagUsages()) 27 | return 0 28 | } 29 | 30 | var c kobodict.Crypter 31 | if *crypt != "" { 32 | if spl := strings.SplitN(*crypt, ":", 2); len(spl) < 2 { 33 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: no ':' found.\n") 34 | return 2 35 | } else if key, err := hex.DecodeString(spl[1]); err != nil { 36 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: decode hex: %v.\n", err) 37 | return 2 38 | } else if enc, err := kobodict.NewCrypter(spl[0], key); err != nil { 39 | fmt.Fprintf(os.Stderr, "Error: invalid format for --encrypt: initialize encrypter: %v.\n", err) 40 | return 2 41 | } else { 42 | c = enc 43 | } 44 | } 45 | 46 | fn, err := filepath.Abs(fs.Args()[0]) 47 | if err != nil { 48 | fmt.Fprintf(os.Stderr, "Error: resolve input path %#v: %v.\n", fs.Args()[0], err) 49 | return 2 50 | } 51 | 52 | ofn := *output 53 | if ofn == "" { 54 | ofn = strings.TrimSuffix(filepath.Base(fn), filepath.Ext(fn)) 55 | } 56 | 57 | fmt.Printf("Opening input dictzip.\n") 58 | f, err := os.Open(fn) 59 | if err != nil { 60 | fmt.Fprintf(os.Stderr, "Error: open input file %#v: %v.\n", fn, err) 61 | return 1 62 | } 63 | defer f.Close() 64 | 65 | s, err := f.Stat() 66 | if err != nil { 67 | fmt.Fprintf(os.Stderr, "Error: stat input file %#v: %v.\n", fn, err) 68 | return 1 69 | } 70 | 71 | fmt.Printf("Parsing dictzip.\n") 72 | 
dr, err := kobodict.NewReader(f, s.Size()) 73 | if err != nil { 74 | fmt.Fprintf(os.Stderr, "Error: parse input file %#v: %v.\n", fn, err) 75 | return 1 76 | } 77 | dr.SetDecrypter(c) 78 | 79 | fmt.Printf("Unpacking dictzip.\n") 80 | if err := kobodict.Unpack(dr, ofn); err != nil { 81 | fmt.Fprintf(os.Stderr, "Error: unpack input file %#v to %#v: %v.\n", fn, ofn, err) 82 | return 1 83 | } 84 | 85 | fmt.Printf("Successfully unpacked dictzip %#v to dictdir %#v.\n", fn, ofn) 86 | return 0 87 | } 88 | -------------------------------------------------------------------------------- /dictgen/dictfile.go: -------------------------------------------------------------------------------- 1 | package dictgen 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "strings" 8 | "text/template" 9 | ) 10 | 11 | // A DictFile is a high-level representation of a Kobo dictionary. 12 | type DictFile []*DictFileEntry 13 | 14 | // DictFileEntry represents a single entry in the DictFile. 15 | type DictFileEntry struct { 16 | Headword string 17 | Variant []string 18 | 19 | NoHeader bool 20 | HeaderInfo string 21 | 22 | RawHTML bool 23 | Definition string 24 | 25 | PostRawHTML string // will not be parsed or saved, only to be used for runtime additions before generating 26 | 27 | line int // for internal use if parsed, zero otherwise 28 | } 29 | 30 | // ParseDictFile parses a DictFile from it's textual representation (usually 31 | // stored in a file with the extension .df). 
32 | func ParseDictFile(r io.Reader) (DictFile, error) { 33 | var df DictFile 34 | var dfe *DictFileEntry 35 | 36 | br := bufio.NewScanner(r) 37 | br.Buffer(make([]byte, 64*1024), 2048*1024) // start with a 64KiB buffer, but allow up to 2MiB (for dictfiles with long lines of raw HTML) 38 | var line int 39 | 40 | for br.Scan() { 41 | buf := br.Bytes() 42 | line++ 43 | 44 | if len(buf) == 0 { 45 | // if in a block and after the metadata (in the definition), 46 | // preserve the blank line 47 | if dfe != nil && len(dfe.Definition) != 0 { 48 | dfe.Definition += "\n" 49 | } 50 | continue 51 | } 52 | 53 | switch buf[0] { 54 | case '@': 55 | // start another one 56 | dfe = new(DictFileEntry) 57 | 58 | // add the headword and line info 59 | dfe.Headword = strings.TrimSpace(string(buf[1:])) 60 | dfe.line = line 61 | 62 | // but error if the headword is blank (note that duplicates are 63 | // acceptable, and encouraged in some cases; Kobo will merge it; 64 | // try looking up 'be' in the English dictionary) 65 | if len(dfe.Headword) == 0 { 66 | return nil, fmt.Errorf("dictfile: line %d: empty headword after @", line) 67 | } 68 | 69 | // otherwise, add it to the dictfile (remember it's a pointer, it'll 70 | // still get updated) 71 | df = append(df, dfe) 72 | case ':': 73 | // if not in a block (before the first @), return an error 74 | if dfe == nil { 75 | return nil, fmt.Errorf("dictfile: line %d: header info (: or ::) specified before word (@)", line) 76 | } 77 | 78 | // if already after the metadata (in the definition), return an error 79 | if len(dfe.Definition) != 0 { 80 | return nil, fmt.Errorf("dictfile: line %d: header info (: or ::) specified within definition content (prepend a space if this was intended to be part of the definition itself)", line) 81 | } 82 | 83 | // if already seen the header info (a line starting with :) 84 | if dfe.NoHeader || len(dfe.HeaderInfo) != 0 { 85 | return nil, fmt.Errorf("dictfile: line %d: multiple header infos (: or ::) specified in 
definition block", line) 86 | } 87 | 88 | // put the trimmed text in the header info, or disable the header if 89 | // it is :: 90 | if len(buf) >= 2 { 91 | if buf[1] == ':' { 92 | if len(strings.TrimSpace(string(buf[2:]))) != 0 { 93 | return nil, fmt.Errorf("dictfile: line %d: extra data after no header specified (::)", line) 94 | } 95 | dfe.NoHeader = true 96 | } else { 97 | dfe.HeaderInfo = strings.TrimSpace(string(buf[1:])) 98 | } 99 | } else { 100 | dfe.HeaderInfo = "" 101 | } 102 | case '&': 103 | // if not in a block, error 104 | if dfe == nil { 105 | return nil, fmt.Errorf("dictfile: line %d: variant (&) specified before word (@)", line) 106 | } 107 | 108 | // if already after the metadata (in the definition), error 109 | if len(dfe.Definition) != 0 { 110 | return nil, fmt.Errorf("dictfile: line %d: variant (&) specified within definition content (prepend a space if this was intended to be part of the definition itself)", line) 111 | } 112 | 113 | // trim the rest of the line (error if nothing left) 114 | v := strings.TrimSpace(string(buf[1:])) 115 | if len(v) == 0 { 116 | return nil, fmt.Errorf("dictfile: line %d: no word after variant specifier (&)", line) 117 | } 118 | 119 | // and add it to the variant list 120 | dfe.Variant = append(dfe.Variant, v) 121 | default: 122 | // if not in a block, error 123 | if dfe == nil { 124 | return nil, fmt.Errorf("dictfile: line %d: definition specified before word (@)", line) 125 | } 126 | 127 | // append the line to the definition 128 | dfe.Definition += string(buf) + "\n" 129 | } 130 | } 131 | 132 | // check for read errors 133 | if err := br.Err(); err != nil { 134 | return nil, err 135 | } 136 | 137 | // and finally, update the raw html flag and cleanup whitespace 138 | for _, dfe := range df { 139 | dfe.Definition = strings.TrimSpace(dfe.Definition) 140 | 141 | if v := strings.TrimSpace(strings.TrimPrefix(dfe.Definition, "")); v != dfe.Definition { 142 | if strings.HasSuffix(v, "") { 143 | return nil, 
fmt.Errorf("dictfile: entry at line %d: raw HTML definitions are specified with , but SHOULD NOT be a full HTML document ending with ", dfe.line) 144 | } 145 | dfe.RawHTML = true 146 | dfe.Definition = v 147 | } else if strings.Contains(dfe.Definition, "") { 148 | return nil, fmt.Errorf("dictfile: entry at line %d: why does the definition contain a tag ... to make it raw HTML, it should be at the very beginning", dfe.line) 149 | } 150 | } 151 | 152 | // note: validation is done separately (and always done before generation) 153 | 154 | return df, nil 155 | } 156 | 157 | // Validate validates the entries in the DictFile. Note that duplicate entries 158 | // are fine, and are encouraged if necessary (Kobo will merge them). 159 | func (df DictFile) Validate() error { 160 | illegal := func(s string, word bool) error { 161 | if word && strings.Contains(s, "\"") { 162 | return fmt.Errorf("must not contain %#v", "\"") 163 | } 164 | for _, c := range []string{ 165 | "{{end -}} 243 | 244 | {{with .Definition}} 245 | {{dfesc .}}{{end -}} 246 | 247 | {{- /* keep trailing newline at end of template */}} 248 | `)) 249 | 250 | func (d DictFileEntry) writeDictFileEntry(w io.Writer) error { 251 | return dictFileEntryTmpl.Execute(w, d) 252 | } 253 | -------------------------------------------------------------------------------- /dictgen/dictfile_test.go: -------------------------------------------------------------------------------- 1 | package dictgen 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "reflect" 8 | "sort" 9 | "strings" 10 | "testing" 11 | ) 12 | 13 | type testcase struct { 14 | What string 15 | 16 | In string 17 | Err error 18 | 19 | Out DictFile 20 | 21 | OutDictFile string 22 | OutKoboHTML string 23 | } 24 | 25 | // TODO(v1): more specific tests 26 | var testcases = []testcase{{ 27 | What: "some of everything", 28 | In: `@ blank 29 | 30 | @ headword 31 | : info 32 | & variant1 33 | &variant2 34 | test 35 | test 36 | 37 | @ custom 38 | & NORMALIZEME 
39 | :: 40 | 41 | custom word: 42 |

test

43 | @ markdown 44 | :-test 45 | 1. Definition point 1. 46 | - Blah 47 | - Blah 48 | 2. Blah blah blah. 49 | 3. Blah *blah* **blah**! 50 | 51 | Blah blah blah.`, 52 | Out: DictFile{ 53 | {Headword: "blank", Variant: []string(nil), NoHeader: false, HeaderInfo: "", RawHTML: false, Definition: "", line: 1}, 54 | {Headword: "headword", Variant: []string{"variant1", "variant2"}, NoHeader: false, HeaderInfo: "info", RawHTML: false, Definition: "test\ntest", line: 3}, 55 | {Headword: "custom", Variant: []string{"NORMALIZEME"}, NoHeader: true, HeaderInfo: "", RawHTML: true, Definition: "custom word:\n

test

", line: 10}, 56 | {Headword: "markdown", Variant: []string(nil), NoHeader: false, HeaderInfo: "-test", RawHTML: false, Definition: "1. Definition point 1.\n - Blah\n - Blah\n2. Blah blah blah.\n3. Blah *blah* **blah**!\n\nBlah blah blah.", line: 16}, 57 | }, 58 | OutDictFile: `@ blank 59 | 60 | @ headword 61 | : info 62 | & variant1 63 | & variant2 64 | test 65 | test 66 | 67 | @ custom 68 | :: 69 | & NORMALIZEME 70 | 71 | custom word: 72 |

test

73 | 74 | @ markdown 75 | : -test 76 | 1. Definition point 1. 77 | - Blah 78 | - Blah 79 | 2. Blah blah blah. 80 | 3. Blah *blah* **blah**! 81 | 82 | Blah blah blah. 83 | 84 | `, 85 | OutKoboHTML: `

blank

custom word: 86 |

test

headword info

test 87 | test

markdown -test

    88 |
  1. Definition point 1. 89 | 90 |
      91 |
    • Blah
    • 92 |
    • Blah
    • 93 |
  2. 94 |
  3. Blah blah blah.
  4. 95 |
  5. Blah blah blah!
  6. 96 |
97 | 98 |

Blah blah blah.

`, 99 | }} 100 | 101 | func TestDictFile(t *testing.T) { 102 | for _, tc := range testcases { 103 | t.Logf("case %#v", tc.What) 104 | 105 | df, err := ParseDictFile(strings.NewReader(tc.In)) 106 | if tc.Err == nil && err != nil { 107 | t.Fatalf("case %#v: parse dictfile: unexpected error: %v", tc.What, err) 108 | } else if tc.Err != nil && err == nil { 109 | t.Fatalf("case %#v: parse dictfile: expected error (%v)", tc.What, tc.Err) 110 | } else if tc.Err != nil && tc.Err.Error() != err.Error() { 111 | t.Fatalf("case %#v: parse dictfile: expected error (%v), got: %v", tc.What, tc.Err, err) 112 | } 113 | 114 | exp, err := json.MarshalIndent(tc.Out, "| ", " ") 115 | if err != nil { 116 | panic(err) 117 | } 118 | 119 | act, err := json.MarshalIndent(df, "| ", " ") 120 | if err != nil { 121 | panic(err) 122 | } 123 | 124 | if !reflect.DeepEqual(exp, act) { 125 | for _, dfe := range df { 126 | fmt.Printf("%#v,\n", dfe) 127 | } 128 | t.Fatalf("case %#v: expected:\n%s\n\ngot:\n%s", tc.What, exp, act) 129 | } 130 | 131 | buf := bytes.NewBuffer(nil) 132 | if err := df.WriteDictFile(buf); err != nil { 133 | t.Fatalf("case %#v: write dictfile: unexpected error: %v", tc.What, err) 134 | } else if tc.OutDictFile != buf.String() { 135 | fmt.Printf("expected:\n`%s`\n\ngot:\n`%s`", tc.OutDictFile, buf.String()) 136 | t.Fatalf("case %#v: unexpected dictfile output", tc.What) 137 | } 138 | 139 | pdf, err := ParseDictFile(buf) 140 | if err != nil { 141 | t.Fatalf("case %#v: reparse written dictfile: unexpected error: %v", tc.What, err) 142 | } 143 | sort.Slice(pdf, func(i, j int) bool { 144 | return pdf[i].Headword < pdf[j].Headword 145 | }) 146 | edf := df[:] 147 | sort.Slice(edf, func(i, j int) bool { 148 | return edf[i].Headword < edf[j].Headword 149 | }) 150 | if jpdf, err := json.Marshal(pdf); err != nil { 151 | panic(pdf) 152 | } else if jedf, err := json.Marshal(edf); err != nil { 153 | panic(pdf) 154 | } else if !reflect.DeepEqual(jpdf, jedf) { 155 | t.Fatalf("case %#v: 
reparse written dictfile: differs from original (orig:%s) (reparsed:%s)", tc.What, jedf, jpdf) 156 | } 157 | 158 | buf.Reset() 159 | if err := df.WriteKoboHTML(buf); err != nil { 160 | t.Fatalf("case %#v: write kobo html: unexpected error: %v", tc.What, err) 161 | } else if tc.OutKoboHTML != buf.String() { 162 | fmt.Printf("expected:\n`%s`\n\ngot:\n`%s`", tc.OutKoboHTML, buf.String()) 163 | t.Fatalf("case %#v: unexpected kobo html output", tc.What) 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /dictgen/dictgen.go: -------------------------------------------------------------------------------- 1 | // Package dictgen simplifies creating full-featured dictionaries for Kobo 2 | // eReaders, with support for images, unicode prefixes, raw html, markdown, and 3 | // more. 4 | // 5 | // A marisa implementation must be provided by 6 | // github.com/pgaskin/kobodict/marisa or a custom one for this package to work. 7 | package dictgen 8 | 9 | import ( 10 | "bytes" 11 | "fmt" 12 | "io" 13 | "sort" 14 | "strings" 15 | "text/template" 16 | 17 | "github.com/pgaskin/dictutil/kobodict" 18 | "github.com/russross/blackfriday/v2" 19 | ) 20 | 21 | // WriteDictzip writes the dictfile to a kobodict.Writer, which should not have 22 | // been used yet. The writer is not closed automatically. If the ImageHandler 23 | // requires a file to be opened (i.e. not ImageHandlerRemove), the provided 24 | // ImageFunc will be called. 
25 | func (df DictFile) WriteDictzip(dw *kobodict.Writer, ih ImageHandler, img ImageFunc) error { 26 | var prefixes []string 27 | prefixed := df.Prefixed() 28 | for pfx := range prefixed { 29 | prefixes = append(prefixes, pfx) 30 | } 31 | sort.Strings(prefixes) 32 | 33 | hbuf := bytes.NewBuffer(nil) 34 | for _, pfx := range prefixes { 35 | for _, dfe := range prefixed[pfx] { 36 | if err := dw.AddWord(dfe.Headword); err != nil { 37 | return fmt.Errorf("add word %#v: %w", dfe.Headword, err) 38 | } 39 | for _, v := range dfe.Variant { 40 | if err := dw.AddWord(v); err != nil { 41 | return fmt.Errorf("add variant %#v: %w", v, err) 42 | } 43 | } 44 | } 45 | hbuf.Reset() 46 | if err := prefixed[pfx].WriteKoboHTML(hbuf); err != nil { 47 | return fmt.Errorf("generate dicthtml for %s: %w", pfx, err) 48 | } else if buf, err := transformHTMLImages(ih, dw, hbuf.Bytes(), img); err != nil { 49 | return fmt.Errorf("generate dicthtml for %s: transform images: %w", pfx, err) 50 | } else if hw, err := dw.CreateDicthtml(pfx); err != nil { 51 | return fmt.Errorf("write dicthtml for %s: %w", pfx, err) 52 | } else if _, err = hw.Write(buf); err != nil { 53 | return fmt.Errorf("write dicthtml for %s: %w", pfx, err) 54 | } 55 | } 56 | 57 | return nil 58 | } 59 | 60 | // Prefixed shards the DictFile into the different word prefixes. The original 61 | // DictFile is unchanged, but the entries are still pointers to the originals 62 | // (i.e. the result will become out of date if you modify the entries). 63 | // 64 | // The DictFile is not validated. 65 | // 66 | // If a variamt has a different prefix, the entire entry is duplicated as 67 | // necessary. 
68 | func (df DictFile) Prefixed() map[string]DictFile { 69 | prefixed := map[string]DictFile{} 70 | for _, dfe := range df { 71 | pfx := map[string]bool{} 72 | 73 | pfx[kobodict.WordPrefix(dfe.Headword)] = true 74 | for _, v := range dfe.Variant { 75 | pfx[kobodict.WordPrefix(v)] = true 76 | } 77 | 78 | for p := range pfx { 79 | prefixed[p] = append(prefixed[p], dfe) 80 | } 81 | } 82 | return prefixed 83 | } 84 | 85 | // WriteKoboHTML validates the DictFile and writes it to w in the dicthtml 86 | // format. 87 | func (df DictFile) WriteKoboHTML(w io.Writer) error { 88 | if err := df.Validate(); err != nil { 89 | return err 90 | } 91 | 92 | // must be sorted for proper matching 93 | dfs := df[:] 94 | sort.Slice(dfs, func(i int, j int) bool { 95 | return dfs[i].Headword < dfs[j].Headword 96 | }) 97 | 98 | if _, err := w.Write([]byte("")); err != nil { 99 | return err 100 | } 101 | for _, dfe := range dfs { 102 | if err := dfe.writeKoboHTML(w); err != nil { 103 | return err 104 | } 105 | } 106 | if _, err := w.Write([]byte("")); err != nil { 107 | return err 108 | } 109 | 110 | return nil 111 | } 112 | 113 | // note: we don't want the html/template escaping, this isn't actually proper 114 | // html, and also, the whitespaces in the end tags should stay EXACTLY as is 115 | // (yes, I know there is a space before the end of the a but not the variant) to 116 | // provide the best possible matches against the regexps Kobo uses. Also, the 117 | // output should not have any newlines. Also, keep in mind headwords can have 118 | // unescaped html tags in it, and they will be rendered properly by Kobo. 
119 | var koboHTMLTmpl = template.Must(template.New("").Funcs(template.FuncMap{ 120 | "md": func(md string) string { 121 | return strings.TrimSpace(string(blackfriday.Run([]byte(md)))) 122 | }, 123 | "normhw": func(headword string) string { 124 | return kobodict.NormalizeWordReference(headword, false) 125 | }, 126 | "normv": func(variant string) string { 127 | return kobodict.NormalizeWordReference(variant, true) 128 | }, 129 | }).Parse(` 130 | {{- /* trim */ -}} 131 | 132 | 133 | {{- if .NoHeader -}} 134 |
135 | {{- else -}} 136 |

{{.Headword}}{{with .HeaderInfo}} {{.}}{{end}}

137 | {{- end -}} 138 | 139 | {{- range .Variant -}} 140 | 141 | {{- end -}} 142 | 143 | {{- with .Definition -}} 144 | {{- if $.RawHTML -}} 145 | {{.}} 146 | {{- else -}} 147 | {{md .}} 148 | {{- end -}} 149 | {{- end -}} 150 | {{- with .PostRawHTML -}} 151 | {{.}} 152 | {{- end -}} 153 | 154 | 155 | {{- /* trim */ -}} 156 | `)) 157 | 158 | func (d DictFileEntry) writeKoboHTML(w io.Writer) error { 159 | return koboHTMLTmpl.Execute(w, d) 160 | } 161 | -------------------------------------------------------------------------------- /dictgen/image.go: -------------------------------------------------------------------------------- 1 | package dictgen 2 | 3 | import ( 4 | "bytes" 5 | "crypto/sha1" 6 | "encoding/base64" 7 | "fmt" 8 | "image" 9 | "io" 10 | "math" 11 | "os" 12 | "path/filepath" 13 | "regexp" 14 | "strings" 15 | 16 | "github.com/disintegration/imaging" 17 | "github.com/pgaskin/dictutil/kobodict" 18 | ) 19 | 20 | // ImageFunc reads an image from the path (it may be absolute or relative) src, 21 | // and returns an io.Reader for the image contents. If the returned reader 22 | // implements io.Closer, it will automatically be called after the image has 23 | // been processed. 24 | type ImageFunc func(src string) (io.Reader, error) 25 | 26 | // ImageFuncFilesystem loads an image from the filesystem. If src is relative, 27 | // it is resolved relative to the current dir. 28 | func ImageFuncFilesystem(src string) (io.Reader, error) { 29 | rsrc, err := filepath.Abs(src) 30 | if err != nil { 31 | return nil, fmt.Errorf("resolve path %#v: %w", src, err) 32 | } 33 | f, err := os.Open(rsrc) 34 | if err != nil { 35 | return nil, fmt.Errorf("open image file %#v (resolved from %#v): %w", rsrc, src, err) 36 | } 37 | return f, nil // f will be closed by transformHTMLImages 38 | } 39 | 40 | // ImageHandler transforms images referenced in a DictFile. 
41 | type ImageHandler interface { 42 | // Transform transforms an image read from ir (src is the original value of 43 | // the img tag's src attribute), and returns a new value for that attribute. As a special case, if an empty string is 44 | // returned and the error is nil, the image tag is removed entirely. In 45 | // addition, custom CSS (which must not contain any double quotes) can be 46 | // returned to be set on the img tag. 47 | Transform(src string, ir io.Reader, dw *kobodict.Writer) (nsrc string, css string, err error) 48 | 49 | // Description returns a human-readable description of what the handler does. 50 | Description() string 51 | } 52 | 53 | // ImageHandlerRemove removes images from the dicthtml. 54 | type ImageHandlerRemove struct{} 55 | 56 | // Transform implements ImageHandler. 57 | func (*ImageHandlerRemove) Transform(string, io.Reader, *kobodict.Writer) (string, string, error) { 58 | return "", "", nil 59 | } 60 | 61 | // Description implements ImageHandler. 62 | func (*ImageHandlerRemove) Description() string { 63 | return "remove images" 64 | } 65 | 66 | // ImageHandlerEmbed adds the images to the dictzip without any additional 67 | // modifications. Usually, this would be the best choice, but unfortunately, 68 | // it is too buggy as of firmware 4.19.14123. 69 | type ImageHandlerEmbed struct{} 70 | 71 | // Transform implements ImageHandler. 
72 | func (*ImageHandlerEmbed) Transform(src string, ir io.Reader, dw *kobodict.Writer) (string, string, error) { 73 | if !strings.HasSuffix(src, ".jpg") && !strings.HasSuffix(src, ".gif") { 74 | return "", "", fmt.Errorf("ImageHandlerEmbed: unsupported image file %s: extension must be .jpg or .gif when embedding", src) 75 | } 76 | 77 | // derive a deterministic, usually-unique filename from the SHA-1 of the src path (not of the image contents), keeping the original extension 78 | fn := fmt.Sprintf("%x%s", sha1.Sum([]byte(src)), filepath.Ext(src)) 79 | if !dw.Exists(fn) { // CreateFile will error if it already exists, and since the name is derived from the same src path we're pretty confident the file is identical anyways 80 | if iw, err := dw.CreateFile(fn); err != nil { 81 | return "", "", fmt.Errorf("ImageHandlerEmbed: create dictfile entry %#v: %w", fn, err) 82 | } else if _, err := io.Copy(iw, ir); err != nil { 83 | return "", "", fmt.Errorf("ImageHandlerEmbed: copy image to dictfile: %w", err) 84 | } 85 | } 86 | return "dict:///" + fn, "", nil 87 | } 88 | 89 | // Description implements ImageHandler. 90 | func (*ImageHandlerEmbed) Description() string { 91 | return "add to dictzip as-is (warning: this causes entries to appear blank due to a bug in nickel as of firmware 4.20.14601)" 92 | } 93 | 94 | // ImageHandlerBase64 optimizes the image and encodes it as base64. This is the 95 | // most compatible option, but it comes at the expense of space and speed. In 96 | // addition, if there are too many images, it can lead to nickel running out of 97 | // memory when parsing the dictionary (and sickel should reboot it). 98 | // 99 | // In addition, it adds CSS to fix sizing issues (by default, images appear 100 | // really small when rendered in the dictionary due to default styling). 101 | // 102 | // This is currently the recommended option for adding images. 103 | // 104 | // You must import image/* yourself for format support. 
105 | type ImageHandlerBase64 struct { 106 | // Images will be resized to fit within these dimensions, while preserving 107 | // aspect ratio. 
If not specified, the default is 1000x1000. 108 | MaxSize image.Point 109 | // NoGrayscale will prevent images from being grayscaled. 110 | NoGrayscale bool 111 | // JPEGQuality sets the JPEG quality for the encoded images. If not set, it 112 | // defaults to 60. 113 | JPEGQuality int 114 | } 115 | // params returns the effective settings with defaults applied (1000 for a max dimension below 1, 60 for a zero JPEG quality). 116 | func (ih *ImageHandlerBase64) params() (maxWidth, maxHeight int, noGrayscale bool, jpegQuality int) { 117 | mw, mh := float64(ih.MaxSize.X), float64(ih.MaxSize.Y) 118 | if mw < 1 { 119 | mw = 1000 120 | } 121 | if mh < 1 { 122 | mh = 1000 123 | } 124 | ng := ih.NoGrayscale 125 | jq := ih.JPEGQuality 126 | if jq == 0 { 127 | jq = 60 128 | } 129 | return int(mw), int(mh), ng, jq 130 | } 131 | 132 | // Transform implements ImageHandler. 133 | func (ih *ImageHandlerBase64) Transform(src string, ir io.Reader, dw *kobodict.Writer) (string, string, error) { 134 | mw, mh, ng, jq := ih.params() 135 | 136 | // decode the image 137 | img, err := imaging.Decode(ir) 138 | if err != nil { 139 | return "", "", fmt.Errorf("ImageHandlerBase64: decode image: %w", err) 140 | } 141 | 142 | // resize it to fit within mw x mh, preserving aspect ratio (NOTE(review): sf > 1 for images smaller than the limits, so those are upscaled; confirm this is intended) 143 | ow, oh := float64(img.Bounds().Dx()), float64(img.Bounds().Dy()) 144 | sf := math.Min(float64(mw)/ow, float64(mh)/oh) 145 | img = imaging.Resize(img, int(ow*sf), int(oh*sf), imaging.Lanczos) 146 | 147 | // make it grayscale, unless NoGrayscale was requested (ng is the NoGrayscale flag, so the check must be negated) 148 | if !ng { 149 | img = imaging.Grayscale(img) 150 | } 151 | 152 | // encode the image 153 | buf := bytes.NewBuffer(nil) 154 | bw := base64.NewEncoder(base64.StdEncoding, buf) 155 | if err := imaging.Encode(bw, img, imaging.JPEG, imaging.JPEGQuality(jq)); err != nil { 156 | return "", "", fmt.Errorf("ImageHandlerBase64: encode new image to dictfile: %w", err) 157 | } 158 | _ = bw.Close() // flushes the final partial base64 block into buf 159 | 160 | // generate the css 161 | css := fmt.Sprintf("width:%dpx;height:%dpx;max-width:100%%;margin:1em auto;page-break-before:auto;object-fit:scale-down;object-position:center", img.Bounds().Dx(), img.Bounds().Dy()) 162 | 163 | // build the URL 164 | return 
"data:image/jpeg;base64," + buf.String(), css, nil 165 | } 166 | 167 | // Description implements ImageHandler. 168 | func (ih *ImageHandlerBase64) Description() string { 169 | mw, mh, ng, jq := ih.params() 170 | return fmt.Sprintf("optimize and encode as base64 data URL (max_width=%d, max_height=%d, grayscale=%t, jpeg_quality=%d) (warning: this causes segfaults in the in-book dictionary due to a bug in nickel with firmware versions below 4.20.14601)", mw, mh, !ng, jq) 171 | } 172 | // imgTagRe matches an img tag with a src attribute; groups: (1) the tag opening, (2) everything up to and including the opening quote of src, (3) the src value, (4) the rest of the tag. 173 | var imgTagRe = regexp.MustCompile(`(<img)(\s+(?:[^>]*\s+)?src\s*=\s*['"]+)([^'"]+)(['"][^>]*>)`) 174 | 175 | // transformHTMLImages transforms img tags in the specified HTML, using 176 | // img to open the referenced paths. If a reader returned by img implements io.Closer, 177 | // it will be closed automatically. Img tags whose src is a data URL are 178 | // skipped. 179 | // 180 | // The dictwriter may be used during this process, so callers should not rely on 181 | // any entries opened before calling this. 182 | func transformHTMLImages(ih ImageHandler, dw *kobodict.Writer, html []byte, img ImageFunc) ([]byte, error) { 183 | nhtml := html[:] 184 | for _, m := range imgTagRe.FindAllSubmatch(html, -1) { 185 | t, a, b, src, c := m[0], m[1], m[2], m[3], m[4] 186 | if bytes.HasPrefix(src, []byte("data:")) { 187 | continue 188 | } 189 | ir, err := img(string(src)) 190 | if err != nil { 191 | return nil, fmt.Errorf("transform image %#v: open file: %w", string(src), err) 192 | } 193 | nsrc, css, err := ih.Transform(string(src), ir, dw) 194 | if err != nil { 195 | if c, ok := ir.(io.Closer); ok { 196 | c.Close() 197 | } 198 | return nil, fmt.Errorf("transform image %#v: transform image: %w", string(src), err) 199 | } 200 | if c, ok := ir.(io.Closer); ok { 201 | c.Close() 202 | } 203 | var nstyle string 204 | if len(css) != 0 { 205 | nstyle = " style=\"" + css + "\"" 206 | } 207 | if len(nsrc) == 0 { 208 | nhtml = bytes.Replace(nhtml, t, nil, 1) 209 | } else { 210 | nhtml = bytes.Replace(nhtml, t, 
[]byte(string(a)+nstyle+string(b)+nsrc+string(c)), 1) 211 | } 212 | } 213 | return nhtml, nil 214 | } 215 | -------------------------------------------------------------------------------- /dictgen/image_test.go: -------------------------------------------------------------------------------- 1 | package dictgen 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestImgTagRe(t *testing.T) { 9 | inHTML := ` 10 | 11 | 12 | asd 13 | asd 14 | asd 17 | ` 18 | exImg := [][]string{ 19 | {``}, 20 | {``}, 21 | {``}, 22 | {``}, 23 | {``}, 26 | } 27 | 28 | acMatch := imgTagRe.FindAllStringSubmatch(inHTML, -1) 29 | acImg := make([][]string, len(acMatch)) 30 | for i, m := range acMatch { 31 | acImg[i] = m[1:] 32 | } 33 | 34 | if !reflect.DeepEqual(exImg, acImg) { 35 | t.Errorf("Expected %#v, got %#v.", exImg, acImg) 36 | } 37 | } 38 | 39 | // TODO(v1): test the image handlers, especially the one which does the replacements 40 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | title: dictutil 2 | remote_theme: pmarsceill/just-the-docs 3 | url: https://pgaskin.net 4 | baseurl: /dictutil 5 | description: Tools, notes, and other stuff related to Kobo dictionaries. 6 | search_enabled: false 7 | aux_links: 8 | Download: 9 | - http://github.com/pgaskin/dictutil/releases/latest 10 | MobileRead: 11 | - https://www.mobileread.com/forums/showthread.php?t=327854 12 | GitHub: 13 | - http://github.com/pgaskin/dictutil 14 | heading_anchors: true 15 | footer_content: Copyright © 2020 Patrick Gaskin. 
16 | -------------------------------------------------------------------------------- /docs/_includes/head_custom.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/dictgen/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: dictgen 4 | has_children: false 5 | --- 6 | 7 | # dictgen 8 | 9 | This section contains documentation for dictgen, a high-level tool to create Kobo dictionaries. 10 | {: .fs-6 .fw-300 } 11 | 12 | ## Usage 13 | 14 | ``` 15 | Usage: dictgen [options] dictfile... 16 | 17 | Options: 18 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "dicthtml.zip") 19 | -c, --crypt string Encrypt the dictzip using the specified encryption method (format: method:keyhex) 20 | -I, --image-method string How to handle images (if an image path is relative, it is loaded from the current dir) (base64 - optimize and encode as base64, embed - add to dictzip, remove) (default "base64") 21 | --remove-footer Add code to prevent the non-applicable dictionary source footer for certain locales from being added after the entry (e.g. if replacing the French dictionary) 22 | -h, --help Show this help text 23 | 24 | If multiple dictfiles (*.df) are provided, they will be merged (duplicate entries are fine; they will be shown in sequential order). To read from stdin, use - as the filename. 25 | 26 | Note that currently, the only usable image method is removing them or using base64-encoding (for firmware 4.20.14601+; older versions segfault in the in-book dictionary if images are enabled), as embedded dict:/// image URLs cause the webviews to appear blank (this is a nickel bug). See https://github.com/pgaskin/dictutil/issues/1 for more details. 
27 | 28 | See https://pgaskin.net/dictutil/dictgen for more information about the dictfile format. 29 | ``` 30 | 31 | ## Example usage 32 | 33 | **Building a dictzip for a dictfile:** 34 | 35 | ``` 36 | dictgen my-dictionary.df 37 | ``` 38 | 39 | If you are using Windows, you can also drag-and-drop a dictfile onto dictgen.exe. 40 | 41 | **Merging multiple dictfiles into a single dictzip:** 42 | 43 | ``` 44 | dictgen my-dictionary.df another.df 45 | ``` 46 | 47 | If you are using Windows, you can also drag-and-drop multiple dictfiles onto dictgen.exe. 48 | 49 | **Building a dictzip with images removed:** 50 | 51 | ``` 52 | dictgen -I remove my-dictionary.df 53 | ``` 54 | 55 | **Specifying a custom output filename:** 56 | 57 | ``` 58 | dictgen -o dicthtml-df.zip my-dictionary.df 59 | ``` 60 | 61 | ## Dictfile format 62 | Dictgen uses a simple, but feature-complete format for representing Kobo dictionaries. 63 | 64 | A dictfile (with the file extension `.df`) is a plain-text file consisting of multiple entries. 65 | 66 | Each entry represents a single definition. There can be more than one entry per word. An entry is denoted by a line starting with `@ ` followed by the headword. The headword can contain spaces, capital letters, and so on. 67 | 68 | After the headword, zero or more header lines can be added. To add additional variants which will be matched, use `& ` followed by the word variant. The variant can be anything which could be used in a headword. This can be specified more than once, but only one variant can be specified for each `& `. Another header type is word information, denoted by a `: `. If specified, the text following it is appended after the bolded headword on the same line (see the English built-in dictionary for an example; it has things like `-verb` and the pronunciation information here). If you want to have complete control over how the entry is displayed, use `::` (without anything following it) instead of `: `. 
This will remove the default bolded headword at the top of the generated entry. 69 | 70 | After the header lines, you can include the body of the entry. By default, this uses [Markdown](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) for formatting. If you want to include raw HTML, prepend the HTML with `` (don't include a closing tag). This can span multiple lines, and will continue until the next entry or end of file. 71 | 72 | In addition, you can include GIF and JPEG images in the body using the usual Markdown or HTML syntax. If the image path is relative (i.e. not a full path), it is resolved relative to the directory you run dictgen from. 73 | 74 | You can also include custom CSS (per-entry) by including it between the `` tags. This is supported in both HTML and Markdown mode. 75 | 76 | ## Dictfile reference 77 | 78 | - `@ HEADWORD`: Start a new entry. The headword doesn't have to be unique, and can contain spaces. 79 | - Header 80 | - `: WORD_INFO` or `::` *(optional)*: Add extra word info after the headword, or remove it entirely. 81 | - `& VARIANT` *(optional)*: Add an additional word to match. Follows the same rules as the headword. Can be repeated multiple times. 82 | - Body 83 | - `MARKDOWN` or ` RAW_HTML`: Include a definition written in Markdown or raw HTML code. 84 | 85 | ## Examples 86 | 87 | ### Simplest 88 | 89 | ``` 90 | @ word 91 | Definition here. 92 | @ word 1 93 | Definition 1 here. 94 | @ test 95 | Blah blah blah. 96 | ``` 97 | 98 | ### Simple 99 | 100 | ``` 101 | @ no 102 | - No means no... 103 | 104 | @ NO 105 | - A different definition for nitric oxide. 106 | - Blah blah blah. 107 | 108 | @ go 109 | & went 110 | & going 111 | 1. This definition is matched by three different words. 112 | 2. It's also numbered rather than bulleted. 113 | - With some sub-items. 114 | - And another. 115 | 116 | An image: 117 | 118 | ![](image.jpg) 119 | 120 | @ test 121 | : this appears beside the headword 122 | Blah blah blah. 
123 | ``` 124 | 125 | ### Full 126 | 127 | ``` 128 | @ word 129 | This is the definition of a word. 130 | 131 | @ word 2 132 | This is the definition of the second word. 133 | 134 | @ water 135 | & H2O 136 | 1. You can also use lists in Markdown. 137 | 2. And **bold text** or *italic text*. 138 | - Sub-items are also supported. 139 | 140 | @ test 141 | : -noun 142 | Blah blah blah. 143 | 144 | @ test 145 | : -verb 146 | Blah blah blah. 147 | 148 | @ custom 149 | :: 150 | **This is a custom word header!** 151 | 152 | And the definition here: 153 | - Blah blah blah. 154 | - Blah blah blah. 155 | 156 | @ images 157 | Embedding an image (relative paths): 158 | 159 | ![](image.jpg) 160 | 161 | Embedding an image (Linux/macOS style paths): 162 | 163 | ![](/path/to/image.jpg) 164 | 165 | Embedding an image (Windows style paths): 166 | 167 | ![](C:/path/to/image.jpg) 168 | 169 | 170 | @ raw-html 171 |

This definition contains raw html.

172 | 173 |

You can split it into multiple lines for readability.

174 | 175 |
    176 |
  • You can also use all HTML tags.
  • 177 |
  • This text has a dark background
  • 178 |
  • This text is styled with CSS classes.
  • 179 |
180 | 181 | 186 | ``` 187 | -------------------------------------------------------------------------------- /docs/dicthtml/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: dicthtml 4 | has_children: true 5 | --- 6 | 7 | # dicthtml 8 | 9 | This section contains documentation and notes about Kobo's dictionary format. 10 | {: .fs-6 .fw-300 } -------------------------------------------------------------------------------- /docs/dicthtml/install.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Installing custom dictionaries 4 | parent: dicthtml 5 | --- 6 | 7 | # Installing custom dictionaries 8 | Sideloading custom dictionaries is easy, but slightly finicky. 9 | 10 | ## Using dictutil 11 | You can easily install dictionaries using dictutil. First, if you are not replacing a built-in dictionary, enable and install the **Enable searches on extra dictionaries** [patch](https://pgaskin.net/kobopatch-patches). Then, follow the [instructions for using the install command](../dictutil/install.html). 12 | 13 | You can uninstall custom dictionaries (including reverting overwritten built-in ones) using the [uninstall command](../dictutil/uninstall.html). 14 | 15 | ## Manual installation 16 | 1. Enable and install the **Enable searches on extra dictionaries** [patch](https://pgaskin.net/kobopatch-patches). 17 | 2. Copy the dictionary to `KOBOeReader/.kobo/dict/dicthtml-LOCALE.zip`, where **LOCALE** is a string consisting of 2 lowercase alphanumeric characters. It does not have to be a valid locale. 18 | 3. If using a firmware version 4.20.14601 or newer, mark the file as read-only (in Windows Explorer, or `chmod 444 dicthtml-LOCALE.zip`) to prevent nickel from overwriting it during the sync process. 19 | 4. 
If using a firmware version older than 4.20.14601, open `KOBOeReader/.kobo/KoboReader.sqlite` in a SQLite3 editor, and add a row to the Dictionary table with the following values: 20 | - **Suffix:** `-LOCALE`, where **LOCALE** is the locale code you chose earlier. This is used when constructing filenames. 21 | - **Name:** `Extra:_LOCALE LABEL`, where **LOCALE** is the locale code you chose earlier, and **LABEL** is a custom label (it can have spaces in it). 22 | - **Installed:** `true`. This one is self-explanatory. 23 | - **Size:** `SIZE`, where *SIZE* is the size of the dictzip in bytes. This is displayed in the dictionary settings, but is unused otherwise, so it's fine if it isn't accurate as long as it is a valid number. For built-in dictionaries with `IsSynced` set, it is used to check for updates. 24 | - **IsSynced:** `false`. This is used to see if the sync process should attempt to sync the specified dictionary. If true, the `Size` column is checked against the expected size of the latest version (from the dictionary download server), and if it does not match, the new dictionary is downloaded over it. 25 | 5. Open `KOBOeReader/.kobo/Kobo/Kobo eReader.conf`, and add a line like `ExtraLocales=LOCALE` in the `ApplicationPreferences` section. If it already exists, add your locale code to it and keep the items separated by a comma and a space (e.g. `ExtraLocales=a1, a2`). 26 | 6. Eject your eReader and test the dictionary. 27 | - If the dictionary is unselectable, ensure you followed the steps correctly, especially regarding the locale codes. 28 | - If the dictionary says that the word wasn't found, or just acts unusually in general, ensure the dictionary file is valid. 29 | 30 | ## About locale names and patches 31 | The reason why the patch is required is due to a bug in the firmware. When you choose an entry from the dictionary dropdown, it tries to find a locale name matching it (which it uses to construct the filename for the dicthtml). 
Kobo has a hard-coded list of supported built-in locales, and supports adding extra ones using the **ApplicationPreferences->ExtraLocales** config file option (a comma separated list of locale codes). These locales have an automatically generated name of "Extra: LOCALE". 32 | 33 | But, this is where the bug occurs. To support translation dictionaries, the dictionary selector will split the name by spaces, and only check against the first element. This is perfectly fine for one-word locale names (i.e. all the built-in ones) For custom locales, it will try to match **Extra:**, which doesn't exist, so it will default to the English dictionary. Thus, to fix this, the "Extra: " prefix used for the custom locales needs to be changed to one without a space. The patch replaces the space with an underscore. This bug does have one benefit though: since only stuff before the first space is considered, you can have a custom label after it. 34 | 35 | ## Alternative method 36 | It is also possible to install custom dictionaries by replacing an existing built-in installed dictionary in `KOBOeReader/.kobo/dict`. To prevent it from being overwritten during a sync, set the `IsSynced` column to `false` for it in the DB on firmware versions older than 4.20.14601, otherwise, mark it read-only. 37 | 38 | ## About changes in firmware 4.20.14601 39 | 40 | In short: 41 | 42 | - **Same:** Nickel will still attempt to sync all dictionaries, including sideloaded ones, unless IsSynced is false. 43 | - **New:** IsSynced can't be changed anymore due to the dictionary table being removed. 44 | - **New:** Nickel will avoid overwriting dictionary files if they are marked read-only, and will instead write `"dicthtml-LOCALE" marked as read-only.. skipping` to the log in the `sync` category. Note that this functionality has been around since at least 4.10.11655, but the database needed to be modified anyways, so there wasn't much point to using it (and nobody noticed it either). 
45 | - **Same:** Nickel still generates locale names by default with `Extra: LOCALE`. 46 | - **New:** Nickel doesn't read the dictionary table anymore, so the name in it is ignored. In addition, entries in the table won't change anything even if it is still present. 47 | - **New:** The built-in dictionaries are hard-coded, rather than writing them to the db during migrations and reading from it at runtime. 48 | - **Same:** Nickel still has the bug where the locale splitting is messed up, so the `Extra: LOCALE` names are inherently broken. 49 | - **Same:** The matching can be fixed by replacing `Extra: ` with `Extra:_` (or anything not containing Unicode whitespace). 50 | - **New:** The database doesn't need to be changed anymore in addition to the patch, as the names are generated dynamically using the same string. 51 | - **Therefore:** If the dictionary table is present, it can safely be removed. 52 | - **Therefore:** The steps required to install custom dictionaries are now (note that these have already been incorporated into the instructions above, they are just here for convenience): 53 | - Copy the dictzip and mark it read-only. 54 | - Add it to ExtraLocales if it is not a built-in locale. 55 | - Use the patch to replace `Extra: ` in libnickel with any other string (same length or shorter with a null byte at the end), but does not contain a space (` `). 56 | 57 | See [#49](https://github.com/pgaskin/kobopatch-patches/issues/49) for more information. 58 | 59 | ## Issues with the read-only method for preventing dictionaries from being overwritten 60 | There have been reports of the read-only property (see [#6](https://github.com/pgaskin/dictutil/issues/6) and the threads on MobileRead for more details) not having an effect since at least 4.20.14622. This seems to be due to other checks in the code (for IsSynced and the file size) preventing the read-only one from actually being checked under some conditions. 
Additionally, some people have had problems marking the dictionary as read-only to begin with (this doesn't seem to be an issue on Linux). 61 | 62 | For now, you can use this [patch](https://pgaskin.net/kobopatch-patches) (for kobopatch v0.15.0, which is included in patches v60+) to prevent all dictionaries from being synced. It should work on most recent firmware versions starting from 4.22.15190. 63 | 64 | ```yaml 65 | Never sync dictionaries: 66 | - Enabled: no 67 | - BaseAddress: {Sym: "SyncDictionariesCommand::prepareDownloadList()"} 68 | - ReplaceBytes: {Offset: 922, FindH: 0CD5, ReplaceH: 0CE0} #permissions 69 | - ReplaceBytes: {Offset: 900, FindH: FFF6CAAE, ReplaceInstNOP: true} #size 70 | - ReplaceBytes: {Offset: 866, FindH: 3FF4DBAE, ReplaceInstNOP: true} #isSynced 71 | ``` 72 | 73 | For versions 4.20.14601 to 4.21.15015, use this patch instead: 74 | 75 | ```yaml 76 | Never sync dictionaries: 77 | - Enabled: no 78 | - BaseAddress: {Sym: "SyncDictionariesCommand::prepareDownloadList()"} 79 | - ReplaceBytes: {Offset: 1048, FindH: 0CD5, ReplaceH: 0CE0} #permissions 80 | - ReplaceBytes: {Offset: 1026, FindH: FFF68DAE, ReplaceInstNOP: true} #size 81 | - ReplaceBytes: {Offset: 992, FindH: 3FF49EAE, ReplaceInstNOP: true} #isSynced 82 | ``` 83 | -------------------------------------------------------------------------------- /docs/dicthtml/matching.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Matching words 4 | parent: dicthtml 5 | --- 6 | 7 | # Matching words 8 | TODO 9 | -------------------------------------------------------------------------------- /docs/dicthtml/prefixes.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Prefixes 4 | parent: dicthtml 5 | --- 6 | 7 | # Prefixes 8 | Kobo dictionaries are sharded by a prefix derived from the headword. 
9 | 10 | The information in this document is based on reverse engineering DictionaryParser::htmlForWord. 11 | 12 | **Note:** Kobo will only look in the file matching the word's prefix, so if a variant has a different prefix, it must be duplicated into each matching file (note that duplicate words aren't an issue). 13 | 14 | **Note:** This document only covers the algorithm used for non-Japanese (Kanji) dictionaries. 15 | 16 | ## Prefix algorithm 17 | Prefixes are calculated using the following steps. Note that "character" refers to a single Unicode code point, not a byte. 18 | 19 | 1. Trim the word at the first null byte, if any (i.e. treat it as a C string). 20 | 2. Discard everything but the first two characters. 21 | 3. Convert the characters to lowercase using the Unicode case mapping rules. 22 | 4. Trim all whitespace characters on the left and right sides. 23 | 5. If the string is empty, return "11". 24 | 6. If the first of the remaining characters is in the Unicode Cyrillic character class, return them as-is. 25 | 7. Right-pad the remaining characters to 2 characters long using "`a`"s. 26 | 8. If either of the first two characters are not in the Unicode Letter character class, return "11". 27 | 9. Return the characters as-is. 28 | 29 | ## Examples 30 | 31 | 32 | 33 | | Word | Prefix | Notes | 34 | | --- | --- | --- | 35 | | "`test`" | "`te`" | | 36 | | "`a`" | "`aa`" | | 37 | | "`Èe`" | "`èe`" | The word is made lowercase using unicode rules (i.e. accented characters are included). | 38 | | "`multiple words`" | "`mu`" | | 39 | | "`àççèñts`" | "`àç`" | | 40 | | "`à`" | "`àa`" | | 41 | | "`ç`" | "`ça`" | | 42 | | "" | "`11`" | | 43 | | "` `" | "`11`" | Space trimming is done after taking the first 2 characters. | 44 | | "` x`" | "`xa`" | | 45 | | "` 123`" | "`11`" | | 46 | | "`x 23`" | "`xa`" | | 47 | | "`д `" | "`д`" | "д" is a Cyrillic character, and it's the first character of the word (after trimming spaces), so it isn't padded with "a"s. 
| 48 | | "`дaд`" | "`дa`" | | 49 | | "`未未`" | "`未未`" | | 50 | | "`未`" | "`未a`" | Even though "未" is a two-byte character, it is a single unicode rune (and the characters are counted, not bytes). | 51 | | "` 未`" | "`11`" | Space trimming is done after taking the first 2 characters. | 52 | | "` 未`" | "`未a`" | The two-byte "未" character isn't split up when taking the first 2 characters. | 53 | 54 | ## Testing 55 | You can test Kobo's prefix algorithm directly using [dictword-test](https://github.com/pgaskin/kobo-mods/tree/master/dictword-test/). 56 | 57 | If you just want an easy way to generate prefixes for words, use the [dictutil prefix](../dictutil/prefix.html) command. 58 | 59 | ## Sample implementation 60 | Here is the Go implementation used in dictutil: 61 | 62 | ```go 63 | func WordPrefix(word string) string { 64 | pfx := []rune(word) 65 | 66 | for i, c := range pfx { 67 | if i >= 2 || c == '\x00' { // limit to 2 chars, also cut at null 68 | pfx = pfx[:i] // trim up to current char 69 | break 70 | } 71 | pfx[i] = unicode.ToLower(c) // this includes accented chars 72 | } 73 | 74 | for len(pfx) != 0 { 75 | if unicode.IsSpace(pfx[0]) { 76 | pfx = pfx[1:] // trim left space 77 | } else { 78 | break 79 | } 80 | } 81 | 82 | for len(pfx) != 0 { 83 | if unicode.IsSpace(pfx[len(pfx)-1]) { 84 | pfx = pfx[:len(pfx)-1] // trim right space 85 | } else { 86 | break 87 | } 88 | } 89 | 90 | if len(pfx) == 0 { 91 | return "11" // if empty, return "11" 92 | } 93 | 94 | if !unicode.Is(unicode.Cyrillic, pfx[0]) { 95 | for len(pfx) < 2 { 96 | pfx = append(pfx, 'a') // pad right with 'a's to 2 chars 97 | } 98 | if !unicode.IsLetter(pfx[0]) || !unicode.IsLetter(pfx[1]) { 99 | return "11" // if either of the first 2 chars is not a letter, return "11" 100 | } 101 | } 102 | 103 | return string(pfx) 104 | } 105 | ``` 106 | -------------------------------------------------------------------------------- /docs/dicthtml/v1v2-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgaskin/dictutil/6708cff9a06dbd088ec2267a2314028a9a00b5a7/docs/dicthtml/v1v2-1.png -------------------------------------------------------------------------------- /docs/dicthtml/v1v2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgaskin/dictutil/6708cff9a06dbd088ec2267a2314028a9a00b5a7/docs/dicthtml/v1v2-2.png -------------------------------------------------------------------------------- /docs/dicthtml/v1v2.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Dicthtml v1/v2 4 | parent: dicthtml 5 | --- 6 | 7 | # Dicthtml v1/v2 8 | In firmware 4.7.10364 (December 2017), Kobo introduced a new version of the dictionaries. 9 | 10 | The v1 dictionaries are served from download.kobobooks.com/ereader/dictionaries/dicthtml\*.zip, while the v2 dictionaries are served from download.kobobooks.com/ereader/dictionaries/v2/dicthtml\*.zip. 11 | 12 | While the v1 dictionaries are still available (probably for the Kobo Mini, which is still on 3.19.5761), they will not fully work on newer firmware versions due to the prefix changes. 13 | 14 | I haven't looked at the exact details about v1 dictionaries, but the main change seems to be the rules for computing prefixes for words with accents. 15 | 16 | ## Prefix changes 17 | 18 | ![](v1v2-1.png) 19 | 20 | The primary change in v2 was the removal of the last step of prefix calculation - converting all non-ascii characters to `1`s. Note that this step is done after checking that the first two characters are all Unicode letters (which include accented letters), hence why the prefix wouldn't be `11` (which is used if any of the first 2 characters are not Unicode letters). 
21 | 22 | ## Built-in dictionary fixes 23 | 24 | In addition, Kobo fixed some bugs with the dictionaries themselves. In v1, a few dictionaries were missing `` tags around some words, presumably because the conversion code was buggy and the input format was undocumented/unstructured. 25 | 26 | ![](v1v2-2.png) 27 | 28 | As illustrated by the diff above, some words weren't separated properly and a few line breaks were missing in v1. -------------------------------------------------------------------------------- /docs/dictutil/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: dictutil 4 | has_children: true 5 | --- 6 | 7 | # dictutil 8 | 9 | This section contains documentation for dictutil, a tool to manipulate Kobo dictionaries. 10 | {: .fs-6 .fw-300 } 11 | 12 | ``` 13 | Usage: dictutil command [options] [arguments] 14 | 15 | Dictutil provides low-level utilities to manipulate Kobo dictionaries (v2). 16 | 17 | Commands: 18 | install (I) Install a dictzip file 19 | pack (p) Pack a dictzip file 20 | prefix (x) Calculate the prefix for a word 21 | uninstall (U) Uninstall a dictzip file 22 | unpack (u) Unpack a dictzip file 23 | help Show help for all commands 24 | 25 | Options: 26 | -h, --help Show this help text 27 | ``` -------------------------------------------------------------------------------- /docs/dictutil/install.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Install 4 | parent: dictutil 5 | --- 6 | 7 | # Install 8 | 9 | ## Usage 10 | 11 | ``` 12 | Usage: dictutil install [options] dictzip 13 | 14 | Options: 15 | -k, --kobo string KOBOeReader path (default: automatically detected) 16 | -l, --locale string Locale name to use (format: ALPHANUMERIC{2}[-ALPHANUMERIC{2}]) (default: detected from filename if in format dicthtml-**.zip) 17 | -n, --name string Custom additional label for dictionary (ignored when 
replacing built-in dictionaries) (doesn't have any effect on 4.20.14601+) 18 | -b, --builtin string How to handle built-in locales [replace = replace and prevent from syncing] [ignore = replace and leave syncing as-is] (doesn't have any effect on 4.24.15672+) (default "replace") 19 | -B, --no-custom Whether to force installation to .kobo/dict instead of .kobo/custom-dict (4.24.15672+ only) 20 | --use-extra-locales Whether to use ExtraLocales on 4.24.15672+ if not a built-in dictionary (this is not required anymore since 4.24.15672) (4.24.15672+ only) 21 | -h, --help Show this help text 22 | 23 | Note: 24 | If you are not replacing a built-in dictionary and are using a firmware 25 | version before 4.24.15672, the 'Enable searches on extra dictionaries patch' 26 | must be installed or you will not be able to select your custom dictionary. 27 | ``` 28 | 29 | ## Examples 30 | 31 | **Install a dictionary with the locale in the filename (dicthtml-\*\*.zip):** 32 | 33 | ```sh 34 | dictutil install dicthtml-aa.zip 35 | ``` 36 | 37 | **Install a dictionary with a different locale:** 38 | 39 | ```sh 40 | dictutil install --locale aa mydictionary.zip 41 | ``` 42 | 43 | **Install a dictionary on a specific Kobo:** 44 | 45 | ```sh 46 | dictutil install --kobo /path/to/KOBOeReader dicthtml-aa.zip 47 | ``` 48 | 49 | **Install a dictionary with a custom label (4.19.14123 and older):** 50 | 51 | ```sh 52 | dictutil install --name "My Dictionary" dicthtml-aa.zip 53 | ``` 54 | 55 | ## Details 56 | See [installing dictionaries](../dicthtml/install.html) for more details on how this works. 
57 | -------------------------------------------------------------------------------- /docs/dictutil/pack.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Pack 4 | parent: dictutil 5 | --- 6 | 7 | # Pack 8 | 9 | ## Usage 10 | 11 | ``` 12 | Usage: dictutil pack [options] dictdir 13 | 14 | Options: 15 | -o, --output string The output dictzip filename (will be overwritten if it exists) (default "dicthtml.zip") 16 | -c, --crypt string Encrypt the dictzip using the specified encryption method (format: method:keyhex) 17 | -h, --help Show this help text 18 | ``` 19 | 20 | ## Examples 21 | 22 | **Pack a dictdir:** 23 | 24 | ```sh 25 | dictutil pack /path/to/dictdir 26 | # the output is written to dicthtml.zip 27 | ``` 28 | 29 | **Pack a dictdir to a specific filename:** 30 | 31 | ```sh 32 | dictutil pack --output "dicthtml-aa.zip" /path/to/dictdir 33 | ``` 34 | 35 | ## Input format 36 | The input dictdir is the same as the output of [dictutil unpack](./unpack.html). 37 | -------------------------------------------------------------------------------- /docs/dictutil/prefix.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Prefix 4 | parent: dictutil 5 | --- 6 | 7 | # Prefix 8 | 9 | ## Usage 10 | 11 | ``` 12 | Usage: dictutil prefix [options] word... 
13 | 14 | Options: 15 | -f, --format string The output format (go-slice, go-map, csv, tsv, json-array, json-object) (default "json-array") 16 | -h, --help Show this help text 17 | ``` 18 | 19 | ## Examples 20 | 21 | **Get the prefix for a word:** 22 | 23 | ```sh 24 | dictutil prefix "word" 25 | ``` 26 | 27 | **Get the prefix for multiple words:** 28 | 29 | ```sh 30 | dictutil prefix "word1" "word2" "word3" 31 | ``` 32 | 33 | **Get the prefix for multiple words as CSV:** 34 | 35 | ```sh 36 | dictutil prefix --format csv "word1" "word2" "word3" 37 | ``` 38 | -------------------------------------------------------------------------------- /docs/dictutil/uninstall.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Uninstall 4 | parent: dictutil 5 | --- 6 | 7 | # Uninstall 8 | 9 | ## Usage 10 | 11 | ``` 12 | Usage: dictutil uninstall [options] locale 13 | 14 | Options: 15 | -k, --kobo string KOBOeReader path (default: automatically detected) 16 | -b, --builtin string How to handle built-in locales [normal = uninstall the same way as the UI] [delete = completely delete the entry (doesn't have any effect on 4.20.14601+)] [restore = download the original dictionary from Kobo again] (doesn't have any effect on 4.24.15672+) (default "normal") 17 | -B, --no-custom Uninstall built-in dictionaries instead of custom ones on 4.24.15672+ 18 | -h, --help Show this help text 19 | ``` 20 | 21 | ## Examples 22 | 23 | **Uninstall a dictionary:** 24 | 25 | ```sh 26 | dictutil uninstall aa 27 | ``` 28 | 29 | **Restore an overwritten built-in dictionary:** 30 | 31 | ```sh 32 | dictutil uninstall --builtin restore fr 33 | ``` 34 | 35 | **Completely delete a built-in dictionary:** 36 | 37 | ```sh 38 | dictutil uninstall --builtin delete fr 39 | ``` 40 | 41 | Note: You can restore the dictionary by manually downloading it and using [dictutil install](./install.html).
42 | 43 | ## Details 44 | Uninstall does the following steps: 45 | 46 | 1. If the DB entry for the dictionary exists: 47 | - Built-in (normal): Set `Installed` to `false`. 48 | - Built-in (delete): Remove the row for the suffix. 49 | - Built-in (restore): Set `Installed` to `true`. 50 | - Extra: Remove the row for the suffix. 51 | 2. If the dictionary is not built-in and there is an `ExtraLocales` entry for the locale in the `.kobo/Kobo/Kobo eReader.conf`, remove it. 52 | 3. With the dictzip: 53 | - Built-in (normal): Delete it if it exists. 54 | - Built-in (delete): Delete it if it exists. 55 | - Built-in (restore): Delete it if it exists, then download it again from Kobo. 56 | - Extra: Delete it if it exists. 57 | -------------------------------------------------------------------------------- /docs/dictutil/unpack.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Unpack 4 | parent: dictutil 5 | --- 6 | 7 | # Unpack 8 | 9 | ## Usage 10 | 11 | ``` 12 | Usage: dictutil unpack [options] dictzip 13 | 14 | Options: 15 | -o, --output string The output directory (must not exist) (default: the basename of the input without the extension) 16 | -c, --crypt string Decrypt the dictzip (if needed) using the specified encryption method (format: method:keyhex) 17 | -h, --help Show this help text 18 | ``` 19 | 20 | ## Examples 21 | 22 | **Unpack a dictionary:** 23 | 24 | ```sh 25 | dictutil unpack dicthtml.zip 26 | # The output is written to ./dicthtml 27 | ``` 28 | 29 | ```sh 30 | dictutil unpack dicthtml-fr.zip 31 | # The output is written to ./dicthtml-fr 32 | ``` 33 | 34 | **Unpack a dictionary to a custom directory:** 35 | 36 | ``` 37 | dictutil unpack --output mydictionary dicthtml.zip 38 | ``` 39 | 40 | ## Details 41 | An unpacked dictdir contains: 42 | 43 | - `words`: The parsed marisa word list (newline-separated). 44 | - `*.html`: The ungzipped dicthtml files. 
45 | - `*`: Any additional files as-is. 46 | -------------------------------------------------------------------------------- /docs/examples/bgl-convert.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: bgl-convert 4 | parent: examples 5 | --- 6 | 7 | # bgl-convert 8 | Converts Babylon BGL dictionaries into dictfiles for use with dictgen. 9 | 10 | Paste the BGL text in the box below to convert it: 11 | 12 | 13 | 14 | Example BGL: 15 | 16 | ``` 17 | ### metadata trimmed for brevity 18 | ### ... 19 | 20 | headword 21 | Definition with html tags. 22 | 23 | headword1|variant1|variant2 24 | The second definition. Blah 25 | blah blah blah. 26 | 27 | 28 | ``` 29 | -------------------------------------------------------------------------------- /docs/examples/dictzip-decompile.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: dictzip-decompile 4 | parent: examples 5 | --- 6 | 7 | # dictzip-decompile 8 | This is an **experimental** tool to convert a dictzip into a dictfile. The output may not be perfect for complex dictionaries. The output should be perfect for dictionaries generated by Penelope. 9 | 10 | ## Usage 11 | 12 | ``` 13 | Usage: dictzip-decompile [options] dictzip 14 | 15 | Options: 16 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./decompiled.df") 17 | -r, --resources Also extract referenced resources to the current directory (warning: any existing files will be overwritten, so it is recommended to run in an empty directory if enabled) 18 | -h, --help Show this help text 19 | 20 | Arguments: 21 | dictzip is the path to the dictzip to decompile. 22 | 23 | To convert the resulting dictfile into a dictzip, use dictgen. 
24 | 25 | Note: The regenerated dictzip from the dictfile may not match exactly, but it will look the same, and certain bugs with prefixes and variants will be implicitly fixed by the conversion process (i.e. variant in wrong file, incorrect prefix, missing words in index file). All output is in raw HTML, not Markdown. 26 | 27 | This is an experimental tool, and the output may not be perfect on complex dictionaries. 28 | ``` 29 | 30 | ## Example uses 31 | - Fixing prefixes or missing variants in dictzips generated by other tools (recompiling the dictfile will automatically fix the prefixes and variants). 32 | - Upgrading a v1 dictzip to v2 (same as above). 33 | - Decompiling a dictzip to merge it with another. 34 | - Converting a previously-created dictzip to a dictfile to make it easier to improve. 35 | - Converting StarDict dictionaries by converting to a dictzip using Penelope, then to a dictfile using this tool. 36 | 37 | ## Notes 38 | The following dictzip generators have enhanced decompilation support: 39 | 40 | - **Penelope:** The output should be perfect. 41 | - **Kobo (en, a few others):** The output should be mostly perfect, but there are a few missing edge cases. Variants (`&`) and header info (`:`) are extracted in addition to the entry content. 42 | - **Kobo (fr):** The output should be mostly perfect, but there are a few missing edge cases. Variants (`&`) and header info (`:`) are extracted in addition to the entry content. 43 | - **dictgen:** The output should be very close to the original dictfile (it has been tested with the output of gotdict-convert and webster1913-convert). With gotdict-convert, the only difference when the decompiled dictzip's dictfile was recompiled was the casing of a few entries in the words index. Even so, this should not be used unless the original dictfile has been lost. In addition, the original Markdown code and images are not recovered. 
Variants (`&`) and header info (`:` / `::`) are extracted in addition to the entry content. 44 | 45 | Other dictzips only have the headword (`@`) and variants (`&`) extracted, and the content is included as-is as raw HTML without support for other dictfile features. 46 | -------------------------------------------------------------------------------- /docs/examples/gotdict-convert.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: gotdict-convert 4 | parent: examples 5 | --- 6 | 7 | # gotdict-convert 8 | This tool converts [gotdict](https://github.com/wjdp/gotdict) to a dictfile for conversion into a Kobo dictzip. 9 | 10 | Images are supported on firmware 4.20.14601+. 11 | 12 | ## Download 13 | Pre-built dictionaries can be downloaded from the following links: 14 | - GOTDict *(with images, firmware 4.20.14601+)*: [dictzip (dicthtml-gt.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.df?branch=master&all=false&pr=false) 15 | - GOTDict *(without images)*: [dictzip (dicthtml-gt.noimg.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/dicthtml-gt.noimg.zip?branch=master&all=false&pr=false), [source dictfile (gotdict.noimg.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/gotdict/gotdict.noimg.df?branch=master&all=false&pr=false) 16 | 17 | You can use [dictutil](../dictutil/install.html) to install the dictionaries, or see [here](../dicthtml/install.html) for manual installation instructions. 18 | 19 | ## Usage 20 | 21 | ``` 22 | Usage: gotdict-convert [options] 23 | 24 | Version: dev 25 | 26 | Options: 27 | -g, --gotdict string The path to the local copy of github.com/wjdp/gotdict. 
(default "./gotdict") 28 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./gotdict.df") 29 | -I, --images Include images in dictfile 30 | -h, --help Show this help text 31 | 32 | To convert the resulting dictfile into a dictzip, use dictgen. 33 | ``` 34 | 35 | You can also use the parser as a [Go library](https://pkg.go.dev/github.com/pgaskin/dictutil/examples/gotdict-convert/gotdict). 36 | -------------------------------------------------------------------------------- /docs/examples/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: examples 4 | has_children: true 5 | --- 6 | 7 | # Examples 8 | 9 | This section contains some tools which make use of dictutil. 10 | {: .fs-6 .fw-300 } -------------------------------------------------------------------------------- /docs/examples/webster1913-convert.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: webster1913-convert 4 | parent: examples 5 | --- 6 | 7 | # webster1913-convert 8 | This tool converts [Project Gutenberg's Webster's Unabridged Dictionary](http://www.gutenberg.org/ebooks/29765.txt.utf-8) into a dictfile for conversion into a Kobo dictzip. 9 | 10 | ## Download 11 | Pre-built dictionaries can be downloaded from the following links: 12 | - Webster's 1913 Dictionary: [dictzip (dicthtml-wb.zip)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/dicthtml-wb.zip?branch=master&all=false&pr=false), [source dictfile (webster1913.df)](https://ci.appveyor.com/api/projects/pgaskin/dictutil/artifacts/webster1913/webster1913.df?branch=master&all=false&pr=false) 13 | 14 | You can use [dictutil](../dictutil/install.html) to install the dictionaries, or see [here](../dicthtml/install.html) for manual installation instructions. 
15 | 16 | ## Usage 17 | 18 | ``` 19 | Usage: webster1913-convert [options] gutenberg_webster1913_path 20 | 21 | Options: 22 | -o, --output string The output filename (will be overwritten if it exists) (- is stdout) (default "./webster1913.df") 23 | --dump Instead of converting, dump the parsed dictionary to stdout as JSON (for debugging) 24 | -h, --help Show this help text 25 | 26 | Arguments: 27 | gutenberg_webster1913_path is the path to Project Gutenberg's Webster's 1913 dictionary. Use - to read from stdin. 28 | 29 | To convert the resulting dictfile into a dictzip, use dictgen. 30 | ``` 31 | 32 | The source dictionary can be downloaded [here](http://www.gutenberg.org/ebooks/29765.txt.utf-8) or [here](https://github.com/pgaskin/dictserver/raw/master/data/dictionary.txt). 33 | 34 | You can also use the parser as a [Go library](https://pkg.go.dev/github.com/pgaskin/dictutil/examples/webster1913-convert/webster1913). 35 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Documentation 4 | nav_order: 1 5 | --- 6 | 7 | # Dictutil 8 | {: .fs-9 } 9 | 10 | A collection of documentation and tools for working with Kobo dictionaries. 11 | {: .fs-6 .fw-300 } 12 | 13 | [Download](https://github.com/pgaskin/dictutil/releases){: .btn .btn-primary .fs-5 .mb-4 .mb-md-0 .mr-2 } [dicthtml](./dicthtml/){: .btn .fs-5 .mb-4 .mb-md-0 } [dictgen](./dictgen/){: .btn .fs-5 .mb-4 .mb-md-0 } [dictutil](./dictutil/){: .btn .fs-5 .mb-4 .mb-md-0 } 14 | 15 | --- 16 | 17 | **Prebuilt dictionaries:** 18 | 19 | [GOTDict](./examples/gotdict-convert.html#download){: .btn .fs-3 .mb-1 .mb-md-0 } [Webster's 1913 Unabridged Dictionary](./examples/webster1913-convert.html#download){: .btn .fs-3 .mb-1 .mb-md-0 } 20 | 21 | --- 22 | 23 | These tools are designed to work with v2 dictionaries (4.7.10364+). 
24 | 25 | ## Getting started 26 | If you're interested in creating dictionaries, look at the [dictgen documentation](./dictgen/). If you're interested in installing or manipulating existing dictionaries, see the [dictutil documentation](./dictutil/). Otherwise, see the [dicthtml documentation](./dicthtml/) for more information about the Kobo dictionary format. 27 | 28 | ## dicthtml 29 | These pages are some notes I've made about the Kobo dictionary format based on reverse engineering the firmware and the official dictionaries. 30 | 31 | - **[Format](./dicthtml/format.html):** About the Kobo dictionary format. 32 | - **[Prefixes](./dicthtml/prefixes.html):** Details about prefix calculation. 33 | - **[v1/v2 dictionaries](./dicthtml/v1v2.html):** Changes between v1/v2 dictionaries. 34 | - **[Installing custom dictionaries](./dicthtml/install.html):** Notes about sideloading dictionaries. 35 | 36 | ## dictutil 37 | dictutil is a low-level tool to unpack, pack, and perform other operations on Kobo dictzips. 38 | 39 | - **[Dictutil](./dictutil/)** 40 | - **[Install](./dictutil/install.html):** Install a dictzip. 41 | - **[Uninstall](./dictutil/uninstall.html):** Uninstall a dictzip. 42 | - **[Pack](./dictutil/pack.html):** Pack a dictzip from a dictdir. 43 | - **[Unpack](./dictutil/unpack.html):** Unpack a dictzip into a dictdir. 44 | - **[Prefix](./dictutil/prefix.html):** Calculate the dicthtml prefix for a word. 45 | 46 | ## dictgen 47 | dictgen is an easy-to-use tool/library to generate Kobo dictionaries from scratch or use in conversion scripts. It deals with all the unusual bits (e.g. variant capitalization, prefix generation, etc) for you and gives warnings when it can't. 48 | 49 | - **[Dictgen](./dictgen#usage)** 50 | - **[Dictfile format](./dictgen#dictfile-format)** 51 | 52 | ## examples 53 | These are some tools which make use of dictutil to convert actual dictionaries. 
54 | 55 | - **[gotdict-convert](./examples/gotdict-convert.html):** Converts [github.com/wjdp/gotdict](https://github.com/wjdp/gotdict) to a dictfile. 56 | - **[webster1913-convert](./examples/webster1913-convert.html):** Converts [Project Gutenberg's Webster's Unabridged Dictionary](http://www.gutenberg.org/ebooks/29765.txt.utf-8) to a dictfile. 57 | - **[dictzip-decompile](./examples/dictzip-decompile.html):** An **experimental** tool to convert a dictzip into a dictfile. 58 | - **[bgl-convert](./examples/bgl-convert.html):** A simple tool to convert Babylon BGL dictionaries to a dictfile. 59 | 60 | ## other 61 | 62 | - **[dictword-test](https://github.com/pgaskin/kobo-mods/tree/master/dictword-test):** Calculates word prefixes using libnickel. 63 | - **[marisa](https://github.com/pgaskin/dictutil/tree/master/marisa):** Marisa bindings for Go. 64 | -------------------------------------------------------------------------------- /examples/bgl-convert/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | BGL Converter 6 | 7 | 8 | 9 | 59 | 60 |
61 | 62 | 63 |
64 | 65 | 114 | -------------------------------------------------------------------------------- /examples/dictzip-decompile/main.go: -------------------------------------------------------------------------------- 1 | // Command dictzip-decompile converts a dictzip into a dictfile. The regenerated 2 | // dictzip from the dictfile may not match exactly, but it will look the same, 3 | // and certain bugs with prefixes and variants will be implicitly fixed by the 4 | // conversion process (i.e. variant in wrong file, incorrect prefix, missing 5 | // words in index file). All output is in raw HTML, not Markdown. 6 | // 7 | // This is an experimental tool, and the output may not be perfect on complex 8 | // dictionaries. 9 | package main 10 | 11 | import ( 12 | "fmt" 13 | "io" 14 | "os" 15 | 16 | "github.com/pgaskin/dictutil/kobodict" 17 | "github.com/spf13/pflag" 18 | 19 | _ "github.com/pgaskin/dictutil/kobodict/marisa" 20 | ) 21 | 22 | var version = "dev" 23 | 24 | func main() { 25 | pflag.CommandLine.SortFlags = false 26 | output := pflag.StringP("output", "o", "."+string(os.PathSeparator)+"decompiled.df", "The output filename (will be overwritten if it exists) (- is stdout)") 27 | resources := pflag.BoolP("resources", "r", false, "Also extract referenced resources to the current directory (warning: any existing files will be overwritten, so it is recommended to run in an empty directory if enabled)") 28 | help := pflag.BoolP("help", "h", false, "Show this help text") 29 | pflag.Parse() 30 | 31 | if *help || pflag.NArg() != 1 { 32 | fmt.Fprintf(os.Stderr, "Usage: %s [options] dictzip\n\nVersion: dictzip-decompile %s\n\nOptions:\n%s\nArguments:\n dictzip is the path to the dictzip to decompile.\n\nTo convert the resulting dictfile into a dictzip, use dictgen.\n\nNote: The regenerated dictzip from the dictfile may not match exactly, but it will look the same, and certain bugs with prefixes and variants will be implicitly fixed by the conversion process (i.e. 
variant in wrong file, incorrect prefix, missing words in index file). All output is in raw HTML, not Markdown.\n\nThis is an experimental tool, and the output may not be perfect on complex dictionaries.\n", os.Args[0], version, pflag.CommandLine.FlagUsages()) 33 | if pflag.NArg() != 0 { 34 | os.Exit(2) 35 | } else { 36 | os.Exit(0) 37 | } 38 | return 39 | } 40 | 41 | fn := pflag.Args()[0] 42 | 43 | fmt.Fprintf(os.Stderr, "Opening input dictzip.\n") 44 | f, err := os.Open(fn) 45 | if err != nil { 46 | fmt.Fprintf(os.Stderr, "Error: open input file %#v: %v.\n", fn, err) 47 | os.Exit(1) 48 | return 49 | } 50 | defer f.Close() 51 | 52 | s, err := f.Stat() 53 | if err != nil { 54 | fmt.Fprintf(os.Stderr, "Error: stat input file %#v: %v.\n", fn, err) 55 | os.Exit(1) 56 | return 57 | } 58 | 59 | fmt.Fprintf(os.Stderr, "Parsing dictzip.\n") 60 | dr, err := kobodict.NewReader(f, s.Size()) 61 | if err != nil { 62 | fmt.Fprintf(os.Stderr, "Error: parse input file %#v: %v.\n", fn, err) 63 | os.Exit(1) 64 | return 65 | } 66 | 67 | fmt.Fprintf(os.Stderr, "Decompiling dictzip.\n") 68 | df, err := decompile(dr) 69 | if err != nil { 70 | fmt.Fprintf(os.Stderr, "Error: decompile dictzip %#v: %v.\n", fn, err) 71 | os.Exit(1) 72 | return 73 | } 74 | 75 | if *resources { 76 | fmt.Fprintf(os.Stderr, "Extracting resources.\n") 77 | for _, f := range dr.File { 78 | fmt.Fprintf(os.Stderr, " ./%s\n", f.Name) 79 | if err := func() error { 80 | rc, err := f.Open() 81 | if err != nil { 82 | return fmt.Errorf("open: %w", err) 83 | } 84 | defer rc.Close() 85 | 86 | f, err := os.OpenFile(f.Name, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) 87 | if err != nil { 88 | return fmt.Errorf("create output: %w", err) 89 | } 90 | defer f.Close() 91 | 92 | if _, err := io.Copy(f, rc); err != nil { 93 | return fmt.Errorf("copy: %w", err) 94 | } 95 | 96 | if err := f.Close(); err != nil { 97 | return fmt.Errorf("write output: %w", err) 98 | } 99 | 100 | return nil 101 | }(); err != nil { 102 | 
fmt.Fprintf(os.Stderr, "Error: extract resource %#v: %v.\n", f.Name, err) 103 | os.Exit(1) 104 | return 105 | } 106 | } 107 | } else { 108 | if len(dr.File) != 0 { 109 | fmt.Fprintf(os.Stderr, "Warning: dictfile contains %d resources, but skipping because resource extraction is not enabled (see --help for more details).\n", len(dr.File)) 110 | } 111 | } 112 | 113 | fmt.Fprintf(os.Stderr, "Writing dictfile.\n") 114 | switch *output { 115 | case "-": 116 | if err := df.WriteDictFile(os.Stdout); err != nil { 117 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err) 118 | os.Exit(1) 119 | return 120 | } 121 | default: 122 | f, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 123 | if err != nil { 124 | fmt.Fprintf(os.Stderr, "Error: create dictfile: %v\n", err) 125 | os.Exit(1) 126 | return 127 | } 128 | 129 | if err := df.WriteDictFile(f); err != nil { 130 | f.Close() 131 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err) 132 | os.Exit(1) 133 | return 134 | } 135 | 136 | if err := f.Close(); err != nil { 137 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err) 138 | os.Exit(1) 139 | return 140 | } 141 | } 142 | 143 | fmt.Fprintf(os.Stderr, "Successfully converted %d entries from dictzip %#v to dictfile %s.\n", len(df), fn, *output) 144 | os.Exit(0) 145 | } 146 | -------------------------------------------------------------------------------- /examples/dictzip-decompile/parse.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "crypto/sha1" 6 | "fmt" 7 | "io/ioutil" 8 | "regexp" 9 | "unicode" 10 | 11 | "github.com/pgaskin/dictutil/dictgen" 12 | "github.com/pgaskin/dictutil/kobodict" 13 | ) 14 | 15 | // This isn't exposed as a separate package, as it's subject to change and 16 | // highly specific to dictzip-decompile. 
17 | 18 | // The regexps used to extract data should have a similar level of strictness as 19 | // the ones used by nickel (for simplicity, compatibility, and predictability). 20 | 21 | // decompile decompiles a dictzip into a dictfile. External resources are not 22 | // extracted, and must be done separately. 23 | // 24 | // Duplicate entries (e.g. the ones added by dictgen for fixing broken variants) 25 | // are collapsed into one. They will be expanded again as necessary when the 26 | // dictfile is compiled by dictgen. 27 | func decompile(r *kobodict.Reader) (dictgen.DictFile, error) { 28 | var df dictgen.DictFile 29 | seenEntries := map[[20]byte]struct{}{} 30 | for _, dh := range r.Dicthtml { 31 | if err := func() error { 32 | rc, err := dh.Open() 33 | if err != nil { 34 | return fmt.Errorf("open: %w", err) 35 | } 36 | defer rc.Close() 37 | 38 | buf, err := ioutil.ReadAll(rc) 39 | if err != nil { 40 | return fmt.Errorf("read: %w", err) 41 | } 42 | 43 | es, err := extractEntries(buf) 44 | if err != nil { 45 | return fmt.Errorf("extract entries: %w", err) 46 | } 47 | 48 | for _, e := range es { 49 | ss := sha1.Sum(e) 50 | if _, ok := seenEntries[ss]; ok { 51 | continue 52 | } 53 | seenEntries[ss] = struct{}{} 54 | 55 | de, err := decompileEntry(e) 56 | if err != nil { 57 | return fmt.Errorf("decompile entry %#v: %w", string(e), err) 58 | } 59 | 60 | df = append(df, de) 61 | } 62 | 63 | return nil 64 | }(); err != nil { 65 | return nil, fmt.Errorf("process dicthtml %#v: %w", dh.Name, err) 66 | } 67 | } 68 | return df, nil 69 | } 70 | 71 | // The regexps/vars used by decompileEntry. 72 | var ( 73 | // generator matchers (match the entire entry, split into parts) (match in order) (don't include variants here) 74 | generator1PenelopeRe = regexp.MustCompile(`^(?s)
([^<]+)<\/b>(.+)<\/div>$`) // also: first and second groups must be equal 75 | generator2KoboFrRe = regexp.MustCompile(`^(?s)

|><\/a>)\s*([^<]+)\s*<\/b>\s*(.*?)

\s*(.+)\s*<\/p>$`) // also: 2nd and 3rd (header) group must not contain "
|><\/a>)\s*(.+?)\s*<\/b>\s*(.*?)\s*<\/p>\s*(.+)\s*$`) // also: 2nd and 3rd (header) group must not contain "|><\/a>)`) // this is slightly more lenient than some of Kobo's (it makes the space before the closing optional) 79 | // other matchers 80 | variantsRe = regexp.MustCompile(`(.*?)<\/var>`) 81 | variantsItemRe = regexp.MustCompile(`|><\/variant>)`) 82 | ) 83 | 84 | // decompileEntry parses an entry (it must be trimmed). 85 | func decompileEntry(buf []byte) (*dictgen.DictFileEntry, error) { 86 | var entry dictgen.DictFileEntry 87 | 88 | // Generator-specific enhanced extraction (for making use of dictfile lines 89 | // starting with &, :, etc). 90 | var generatorMatched bool 91 | // -- Penelope: https://github.com/pettarin/penelope/blob/fce6dcfd899d3755ae3a5a3867d7d436105ada56/penelope/format_kobo.py#L167 92 | // e.g.

dfgdfg
Penelope
sdfsdf
93 | if !generatorMatched { 94 | if m := generator1PenelopeRe.FindSubmatch(buf); len(m) != 0 { 95 | headwordIndex, headwordDisplay, contentHTML := m[1], m[2], m[3] 96 | if !bytes.Equal(headwordIndex, headwordDisplay) { 97 | // it's a false positive if those aren't identical 98 | } else { 99 | entry.Headword = string(headwordIndex) 100 | entry.RawHTML = true 101 | entry.Definition = string(contentHTML) 102 | generatorMatched = true 103 | } 104 | } 105 | } 106 | // -- Kobo: based on dicthtml-fr 107 | // e.g.

a-, an-

  1. Élément exprimant la négation ( pas ), ou la privation ( sans ).
  2.    ⇒anormal, apolitique.

108 | if !generatorMatched { 109 | if m := generator2KoboFrRe.FindSubmatch(buf); len(m) != 0 { 110 | headwordIndex, headwordDisplay, headerInfo, contentHTML := m[1], m[2], m[3], m[4] 111 | if bytes.Contains(headwordDisplay, []byte("" 128 | generatorMatched = true 129 | } 130 | } 131 | } 132 | // -- Kobo: based on dicthtml-en, a few others 133 | // e.g.

ab ['ab] -n

  1. an abdominal muscle usu. used in pl.
  2. about

134 | // -- or dictgen 135 | // e.g.

a A (# emph. #).

  1. Etym: [Shortened form of an. AS. an one. See One.] An adjective, commonly called the indefinite article, and signifying one or any, but less emphatically.
  2. "At a birth"; "In a word"; "At a blow". Shak. Note: It is placed before nouns of the singular number denoting an individual object, or a quality individualized, before collective nouns, and also before plural nouns when the adjective few or the phrase great many or good many is interposed; as, a dog, a house, a man; a color; a sweetness; a hundred, a fleet, a regiment; a few persons, a great many days. It is used for an, for the sake of euphony, before words beginning with a consonant sound [for exception of certain words beginning with h, see An]; as, a table, a woman, a year, a unit, a eulogy, a ewe, a oneness, such a one, etc. Formally an was used both before vowels and consonants.
  3. Etym: [Originally the preposition a (an, on).] In each; to or for each; as, "twenty leagues a day", "a hundred pounds a year", "a dollar a yard", etc.
136 | if !generatorMatched { 137 | if m := generator3KoboEnOrDictutilRe.FindSubmatch(buf); len(m) != 0 { 138 | headwordIndex, headwordDisplay, headerInfo, contentHTML := m[1], m[2], m[3], m[4] 139 | if bytes.Contains(headwordDisplay, []byte("

dfkgjdlfjglkdfjg

162 | if !generatorMatched { 163 | entry.NoHeader = true 164 | entry.RawHTML = true 165 | entry.Definition = string(headFallbackIndexWordRe.ReplaceAllFunc(buf, func(src []byte) []byte { 166 | if entry.Headword != "" { 167 | return src // don't continue after the first headword has been found 168 | } 169 | entry.Headword = string(headFallbackIndexWordRe.FindSubmatch(src)[1]) 170 | return nil // remove the entire a tag 171 | })) 172 | if entry.Headword == "" { 173 | return nil, fmt.Errorf("no headword found in %#v", string(buf)) 174 | } 175 | generatorMatched = true 176 | } 177 | 178 | // Add any additional headwords (then remove) (which really shouldn't be there in the first place) as variants. 179 | // i.e. stray
tags (but not if the link has text, because then it's not a headword anymore) 180 | entry.Definition = string(headFallbackIndexWordRe.ReplaceAllFunc([]byte(entry.Definition), func(src []byte) []byte { 181 | entry.Variant = append(entry.Variant, string(headFallbackIndexWordRe.FindSubmatch(src)[1])) 182 | return nil // remove the entire a tag 183 | })) 184 | 185 | // Append (then remove) any variants found in the raw html. 186 | // i.e. tags inside ones 187 | entry.Definition = string(variantsRe.ReplaceAllFunc([]byte(entry.Definition), func(src []byte) []byte { 188 | for _, m := range variantsItemRe.FindAllSubmatch(src, -1) { 189 | entry.Variant = append(entry.Variant, string(m[1])) 190 | } 191 | return nil // remove the entire variant tag 192 | })) 193 | 194 | return &entry, nil 195 | } 196 | 197 | // The regexps/vars used by extractEntries. 198 | var ( 199 | htmlStart = []byte("") 200 | htmlEnd = []byte("") 201 | entryRe = regexp.MustCompile(`(?s)\s*(.+?)\s*<\/w>`) 202 | ) 203 | 204 | // extractEntries gets the trimmed body of each entry in the dicthtml file. 205 | func extractEntries(buf []byte) ([][]byte, error) { 206 | if idx := bytes.Index(buf, htmlStart); idx < 0 { 207 | return nil, fmt.Errorf("missing %s tag", string(htmlStart)) 208 | } else { 209 | buf = buf[idx+len(htmlStart):] 210 | } 211 | 212 | if idx := bytes.LastIndex(buf, htmlEnd); idx < 0 { 213 | return nil, fmt.Errorf("missing %s tag", string(htmlStart)) 214 | } else { 215 | buf = buf[:idx] 216 | } 217 | 218 | var entries [][]byte 219 | 220 | var cur, prev, body []int 221 | prev = []int{0, 0} 222 | for _, m := range entryRe.FindAllSubmatchIndex(buf, -1) { 223 | cur, body = m[0:2][:], m[2:4] 224 | for _, b := range buf[prev[1]:cur[0]] { 225 | // note: even though we might split up multi-byte utf-8 chars 226 | // here, it's fine, as the whitespace should be ascii if any, 227 | // and if there is anything else, it's an issue. 
228 | if !unicode.IsSpace(rune(b)) { 229 | return nil, fmt.Errorf("non-whitespace between word entries (%#v in %#v before %#v)", string(rune(b)), string(buf[prev[1]:cur[0]]), string(buf[cur[0]:cur[1]])) 230 | } 231 | } 232 | prev = cur 233 | entries = append(entries, buf[body[0]:body[1]]) 234 | } 235 | for _, b := range buf[prev[1]:] { 236 | if !unicode.IsSpace(rune(b)) { 237 | return nil, fmt.Errorf("non-whitespace after last word entry (%#v in %#v)", string(rune(b)), string(buf[prev[1]:])) 238 | } 239 | } 240 | 241 | return entries, nil 242 | } 243 | -------------------------------------------------------------------------------- /examples/gotdict-convert/gotdict/parser.go: -------------------------------------------------------------------------------- 1 | // Package gotdict parses GOTDict (https://github.com/wjdp/gotdict). 2 | package gotdict 3 | 4 | import ( 5 | "bytes" 6 | "fmt" 7 | "io/ioutil" 8 | "os" 9 | "path/filepath" 10 | "regexp" 11 | "sort" 12 | "strings" 13 | "unicode" 14 | 15 | "gopkg.in/yaml.v2" 16 | ) 17 | 18 | // Dict represents the Dict. 19 | type Dict []*Def 20 | 21 | // Def represents a definition. 22 | type Def struct { 23 | // Title is the main title of the definition (it may contain spaces) (i.e. Tyrion Lannister). 24 | Title string 25 | // Terms are other forms of the title which should be recognized. 26 | Terms []string 27 | // Type is the record type. Currently, not many entries have one. 28 | Type Type 29 | // Images contains referenced image files. 30 | Images map[string][]byte 31 | // Definition contains the Markdown definition. 32 | Definition string 33 | } 34 | 35 | // Type is a Dict record type. 36 | type Type string 37 | 38 | const ( 39 | // TypeUnknown is used for definitions without a type set (i.e. before types were used). 40 | TypeUnknown Type = "" 41 | // TypeCharacter is a character (e.g. Jon, Tyrion). 42 | TypeCharacter Type = "character" 43 | // TypeHouse is a house (e.g. Lannister, Stark). 
44 | TypeHouse Type = "house" 45 | // TypeEvent is an event in time. 46 | TypeEvent Type = "event" 47 | // TypeCity is a city. 48 | TypeCity Type = "city" 49 | // TypeLocation is a location (e.g. King's Landing). 50 | TypeLocation Type = "location" 51 | // TypeRiver is a river. 52 | TypeRiver Type = "river" 53 | // TypeShip is a ship. 54 | TypeShip Type = "ship" 55 | // TypeWord is an uncommon or ASOIAF-specific word. 56 | TypeWord Type = "word" 57 | ) 58 | 59 | // Parse parses the Dict. If imgdir is an empty string, images are removed. If 60 | // imgref is true, image paths are set to the full filepath rather than reading 61 | // the images to memory. 62 | func Parse(defdir, imgdir string, imgref bool) (Dict, error) { 63 | var dict Dict 64 | 65 | fis, err := ioutil.ReadDir(defdir) 66 | if err != nil { 67 | return nil, err 68 | } 69 | 70 | seen := map[string]*Def{} 71 | for _, fi := range fis { 72 | if filepath.Ext(fi.Name()) != ".mdd" { 73 | continue 74 | } 75 | 76 | buf, err := ioutil.ReadFile(filepath.Join(defdir, fi.Name())) 77 | if err != nil { 78 | return nil, err 79 | } 80 | 81 | var obj struct { 82 | Title string `yaml:"title"` 83 | Terms []string `yaml:"terms"` 84 | Type Type `yaml:"type"` 85 | } 86 | 87 | md, err := unmarshalStrictFrontMatter(buf, &obj) 88 | if err != nil { 89 | return nil, fmt.Errorf("parse %s frontmatter: %w", fi.Name(), err) 90 | } else if obj.Title == "" { 91 | return nil, fmt.Errorf("parse %s frontmatter: title not set", fi.Name()) 92 | } 93 | 94 | def := &Def{} 95 | 96 | obj.Title = strings.TrimSpace(obj.Title) 97 | if odef, ok := seen[obj.Title]; ok { 98 | return nil, fmt.Errorf("parse %s: already seen %#v in other def %#v", fi.Name(), def.Title, odef) 99 | } 100 | seen[obj.Title] = def 101 | def.Title = obj.Title 102 | 103 | for _, term := range obj.Terms { 104 | term = strings.TrimSpace(term) 105 | if odef, ok := seen[term]; ok && term != "Jon Umber" { // it's usually a mistake to have duplicate terms (but remember that dictgen 
will handle them fine) 106 | return nil, fmt.Errorf("parse %s: already seen term %#v in other def %#v", fi.Name(), term, odef) 107 | } 108 | seen[term] = def 109 | def.Terms = append(def.Terms, term) 110 | } 111 | 112 | def.Type = Type(strings.TrimSpace(string(obj.Type))) 113 | def.Images = map[string][]byte{} 114 | def.Definition = string(md) 115 | 116 | if imgdir == "" { 117 | def.Definition = regexp.MustCompile(`(\s*Map on [Nn]ext [Pp]age\.?)|(\s*\(Map on [Nn]ext [Pp]age\.?\))|(!\[[^]]*\]\([^)]+\))`).ReplaceAllLiteralString(def.Definition, "") 118 | } else { 119 | var repl []string 120 | for _, img := range regexp.MustCompile(`!\[[^]]*\]\((images/)?([^)]+)\)`).FindAllStringSubmatch(def.Definition, -1) { 121 | if img[1] == "" { 122 | return nil, fmt.Errorf("parse %s: unknown image path %#v", fi.Name(), img[1]) 123 | } 124 | fn, err := filepath.Abs(filepath.Join(imgdir, img[2])) 125 | if err != nil { 126 | return nil, fmt.Errorf("parse %s: resolve image %#v: %w", fi.Name(), img[1], err) 127 | } 128 | if imgref { 129 | if _, err := os.Stat(fn); err != nil { 130 | return nil, fmt.Errorf("parse %s: stat image %#v: %w", fi.Name(), img[1], err) 131 | } 132 | repl = append(repl, "("+img[1]+img[2]+")", "("+fn+")") 133 | } else { 134 | imgbuf, err := ioutil.ReadFile(fn) 135 | if err != nil { 136 | return nil, fmt.Errorf("parse %s: read image %#v: %w", fi.Name(), img[1], err) 137 | } 138 | def.Images[img[2]] = imgbuf 139 | repl = append(repl, "("+img[1]+img[2]+")", "("+img[2]+")") 140 | } 141 | } 142 | def.Definition = strings.NewReplacer(repl...).Replace(def.Definition) 143 | } 144 | 145 | def.Definition = strings.TrimSpace(def.Definition) 146 | 147 | dict = append(dict, def) 148 | } 149 | 150 | sort.Slice(dict, func(i, j int) bool { 151 | return dict[i].Title < dict[j].Title 152 | }) 153 | 154 | return dict, nil 155 | } 156 | 157 | func unmarshalStrictFrontMatter(buf []byte, v interface{}) (content []byte, err error) { 158 | spl := bytes.SplitN(buf, []byte{'-', '-', 
'-'}, 3) 159 | for _, b := range spl[0] { 160 | if !unicode.IsSpace(rune(b)) { 161 | return buf, nil 162 | } 163 | } 164 | return spl[2], yaml.UnmarshalStrict(spl[1], v) 165 | } 166 | -------------------------------------------------------------------------------- /examples/gotdict-convert/main.go: -------------------------------------------------------------------------------- 1 | // Command gotdict-convert converts GOTDict (https://github.com/wjdp/gotdict) to 2 | // a dictgen dictfile. 3 | package main 4 | 5 | import ( 6 | "fmt" 7 | "os" 8 | "path/filepath" 9 | 10 | "github.com/spf13/pflag" 11 | 12 | "github.com/pgaskin/dictutil/dictgen" 13 | "github.com/pgaskin/dictutil/examples/gotdict-convert/gotdict" 14 | ) 15 | 16 | var version = "dev" 17 | 18 | func main() { 19 | pflag.CommandLine.SortFlags = false 20 | gotdictp := pflag.StringP("gotdict", "g", "."+string(os.PathSeparator)+"gotdict", "The path to the local copy of github.com/wjdp/gotdict.") 21 | output := pflag.StringP("output", "o", "."+string(os.PathSeparator)+"gotdict.df", "The output filename (will be overwritten if it exists) (- is stdout)") 22 | images := pflag.BoolP("images", "I", false, "Include images in the generated dictfile") 23 | help := pflag.BoolP("help", "h", false, "Show this help text") 24 | pflag.Parse() 25 | 26 | if *help || pflag.NArg() != 0 { 27 | fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\nVersion: gotdict-convert %s\n\nOptions:\n%s\nTo convert the resulting dictfile into a dictzip, use dictgen.\n", os.Args[0], version, pflag.CommandLine.FlagUsages()) 28 | os.Exit(0) 29 | return 30 | } 31 | 32 | var img string 33 | if *images { 34 | fmt.Fprintf(os.Stderr, "Parsing gotdict (with images).\n") 35 | img = filepath.Join(*gotdictp, "images") 36 | } else { 37 | fmt.Fprintf(os.Stderr, "Parsing gotdict (no images).\n") 38 | } 39 | 40 | gd, err := gotdict.Parse(filepath.Join(*gotdictp, "_definitions"), img, true) 41 | if err != nil { 42 | fmt.Fprintf(os.Stderr, "Error: parse gotdict: %v\n", 
err) 43 | os.Exit(1) 44 | return 45 | } 46 | 47 | fmt.Fprintf(os.Stderr, "Transforming definitions.\n") 48 | var df dictgen.DictFile 49 | for _, d := range gd { 50 | var hwi string 51 | if d.Type != "" { 52 | hwi = "-" + string(d.Type) 53 | } 54 | 55 | df = append(df, &dictgen.DictFileEntry{ 56 | Headword: d.Title, 57 | HeaderInfo: hwi, 58 | Variant: d.Terms, 59 | Definition: d.Definition, 60 | }) 61 | } 62 | 63 | fmt.Fprintf(os.Stderr, "Writing dictfile.\n") 64 | switch *output { 65 | case "-": 66 | if err := df.WriteDictFile(os.Stdout); err != nil { 67 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err) 68 | os.Exit(1) 69 | return 70 | } 71 | default: 72 | f, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 73 | if err != nil { 74 | fmt.Fprintf(os.Stderr, "Error: create dictfile: %v\n", err) 75 | os.Exit(1) 76 | return 77 | } 78 | 79 | if err := df.WriteDictFile(f); err != nil { 80 | f.Close() 81 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err) 82 | os.Exit(1) 83 | return 84 | } 85 | 86 | if err := f.Close(); err != nil { 87 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err) 88 | os.Exit(1) 89 | return 90 | } 91 | } 92 | 93 | fmt.Fprintf(os.Stderr, "Successfully converted %d entries from gotdict %s to dictfile %s.\n", len(df), *gotdictp, *output) 94 | os.Exit(0) 95 | } 96 | -------------------------------------------------------------------------------- /examples/webster1913-convert/main.go: -------------------------------------------------------------------------------- 1 | // Command webster1913-convert converts Project Gutenberg's Webster's 1913 2 | // Unabridged Dictionary to a dictgen dictfile. 
3 | package main 4 | 5 | import ( 6 | "bytes" 7 | "encoding/json" 8 | "fmt" 9 | "html/template" 10 | "io" 11 | "os" 12 | 13 | "github.com/spf13/pflag" 14 | 15 | "github.com/pgaskin/dictutil/dictgen" 16 | "github.com/pgaskin/dictutil/examples/webster1913-convert/webster1913" 17 | ) 18 | 19 | var version = "dev" 20 | 21 | var deftmpl = template.Must(template.New("").Funcs(template.FuncMap{ 22 | "spldc": func(s string) []string { 23 | for i, c := range s { 24 | if c == '.' || c == ',' || c == '(' { 25 | return []string{s[:i], s[i:]} 26 | } 27 | } 28 | return []string{"", s} 29 | }, 30 | }).Parse(` 31 | {{- with .Etymology}}

{{.}}

{{end -}} 32 | {{- with .Meanings}}
    {{range .}}
  1. {{.Text}}{{with .Example}}

    {{.}}{{end}}
  2. {{end}}
{{end -}} 33 | {{- with .PhraseDefns}}

{{range $n, $v := .}}{{if $n}} {{end}}{{range $x, $y := (spldc $v)}}{{if $x}}{{$y}}{{else}}{{$y}}{{end}}{{end}}{{end}}

{{end -}} 34 | {{- with .Synonyms}}

{{range $n, $v := .}}{{if $n}} {{end}}{{$v}}{{end}}

{{end -}} 35 | {{- with .Extra}}

{{.}}

{{end -}} 36 | `)) 37 | 38 | func main() { 39 | pflag.CommandLine.SortFlags = false 40 | output := pflag.StringP("output", "o", "."+string(os.PathSeparator)+"webster1913.df", "The output filename (will be overwritten if it exists) (- is stdout)") 41 | dump := pflag.Bool("dump", false, "Instead of converting, dump the parsed dictionary to stdout as JSON (for debugging)") 42 | help := pflag.BoolP("help", "h", false, "Show this help text") 43 | pflag.Parse() 44 | 45 | if *help || pflag.NArg() != 1 { 46 | fmt.Fprintf(os.Stderr, "Usage: %s [options] gutenberg_webster1913_path\n\nVersion: webster1913-convert %s\n\nOptions:\n%s\nArguments:\n gutenberg_webster1913_path is the path to Project Gutenberg's Webster's 1913 dictionary. Use - to read from stdin.\n\nTo convert the resulting dictfile into a dictzip, use dictgen.\n", os.Args[0], version, pflag.CommandLine.FlagUsages()) 47 | os.Exit(0) 48 | return 49 | } 50 | 51 | fmt.Fprintf(os.Stderr, "Opening input file.\n") 52 | var r io.Reader 53 | switch v := pflag.Args()[0]; v { 54 | case "-": 55 | r = os.Stdin 56 | default: 57 | f, err := os.Open(v) 58 | if err != nil { 59 | fmt.Fprintf(os.Stderr, "Error: open input %#v: %v\n", v, err) 60 | os.Exit(1) 61 | return 62 | } 63 | defer f.Close() 64 | r = f 65 | } 66 | 67 | fmt.Fprintf(os.Stderr, "Parsing dictionary.\n") 68 | wd, err := webster1913.Parse(r, func(i int, word string) { 69 | if i%1000 == 0 { 70 | fmt.Fprintf(os.Stderr, "[% 5d] %s\n", i, word) 71 | } 72 | }) 73 | if err != nil { 74 | fmt.Fprintf(os.Stderr, "Error: parse webster1913: %v\n", err) 75 | os.Exit(1) 76 | return 77 | } 78 | 79 | if *dump { 80 | fmt.Fprintf(os.Stderr, "Dumping JSON to stdout.\n") 81 | enc := json.NewEncoder(os.Stdout) 82 | enc.SetIndent("", " ") 83 | enc.Encode(wd) 84 | os.Exit(0) 85 | return 86 | } 87 | 88 | fmt.Fprintf(os.Stderr, "Transforming definitions.\n") 89 | var df dictgen.DictFile 90 | dbuf := bytes.NewBuffer(nil) 91 | for _, d := range wd { 92 | dbuf.Reset() 93 | if err := 
deftmpl.Execute(dbuf, d); err != nil { 94 | fmt.Fprintf(os.Stderr, "Error: render definition %#v: %v\n", d, err) 95 | os.Exit(1) 96 | return 97 | } 98 | df = append(df, &dictgen.DictFileEntry{ 99 | Headword: d.Headword, 100 | Variant: d.Variant, 101 | RawHTML: true, 102 | HeaderInfo: d.Info, 103 | Definition: dbuf.String(), 104 | }) 105 | } 106 | 107 | fmt.Fprintf(os.Stderr, "Writing dictfile.\n") 108 | switch *output { 109 | case "-": 110 | if err := df.WriteDictFile(os.Stdout); err != nil { 111 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err) 112 | os.Exit(1) 113 | return 114 | } 115 | default: 116 | f, err := os.OpenFile(*output, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 117 | if err != nil { 118 | fmt.Fprintf(os.Stderr, "Error: create dictfile: %v\n", err) 119 | os.Exit(1) 120 | return 121 | } 122 | 123 | if err := df.WriteDictFile(f); err != nil { 124 | f.Close() 125 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err) 126 | os.Exit(1) 127 | return 128 | } 129 | 130 | if err := f.Close(); err != nil { 131 | fmt.Fprintf(os.Stderr, "Error: write dictfile: %v\n", err) 132 | os.Exit(1) 133 | return 134 | } 135 | } 136 | 137 | fmt.Fprintf(os.Stderr, "Successfully converted %d entries from Webster's 1913 dictionary %#v to dictfile %s.\n", len(df), pflag.Args()[0], *output) 138 | os.Exit(0) 139 | } 140 | -------------------------------------------------------------------------------- /examples/webster1913-convert/webster1913/parser.go: -------------------------------------------------------------------------------- 1 | // Package webster1913 parses Project Gutenberg's Webster's 1913 Unabridged 2 | // Dictionary (http://www.gutenberg.org/ebooks/29765.txt.utf-8). 3 | package webster1913 4 | 5 | import ( 6 | "bufio" 7 | "bytes" 8 | "io" 9 | "regexp" 10 | "runtime/debug" 11 | "strings" 12 | ) 13 | 14 | // Dict represents the parsed dictionary. 15 | type Dict []*Entry 16 | 17 | // Entry is a single dictionary entry. 
type Entry struct {
	// Headword is the lowercased first ';'-separated item of the entry's
	// heading line.
	Headword string
	// Variant holds the remaining heading items, inflected forms extracted
	// from Info, and the names of phrase definitions (all lowercased).
	Variant []string
	// Info is the text between the heading and the first blank line, with
	// the etymology (if any) removed.
	Info string
	// Etymology is the part of the info text following " Etym: ".
	Etymology string
	// Meanings are the numbered or "Defn:" definitions, in order.
	Meanings []*EntryMeaning
	// Synonyms are the items listed under a "Syn." header.
	Synonyms []string
	// PhraseDefns are the " -- Phrase, ..." definitions.
	PhraseDefns []string
	// Extra is unparseable text.
	Extra string
}

// EntryMeaning is a meaning for a dictionary entry.
type EntryMeaning struct {
	// Text is the body of the definition.
	Text string
	// Example is the text following the definition body, if any.
	Example string
}

// The regexps used by Parse to classify each line of the dictionary.
var (
	// entryWordRe matches a heading line (upper-case words separated by ';').
	entryWordRe = regexp.MustCompile(`^[A-Z_ ;-]+$`)
	// numberedDefnStartRe matches the "1. " prefix of a numbered definition.
	numberedDefnStartRe = regexp.MustCompile(`^[0-9]+\.\s*`)
	// singleDefnStartRe matches the "Defn: " prefix of an unnumbered definition.
	singleDefnStartRe = regexp.MustCompile(`^Defn:\s+`)
	// noteStartRe matches the "Note: " prefix of a note.
	noteStartRe = regexp.MustCompile(`^\s*Note:\s+`)
	// synStartRe matches a line consisting only of the "Syn." header. The dot
	// is escaped so that other four-character lines starting with "Syn" are
	// not mistaken for the header.
	synStartRe = regexp.MustCompile(`^Syn\.\s*$`)
	// synItemStartRe matches the " -- " prefix of a synonym item.
	synItemStartRe = regexp.MustCompile(`^\s+--\s+`)
	// phraseDefnStartRe matches the start of a phrase definition; the first
	// group is the phrase itself, the second an optional parenthesized note.
	phraseDefnStartRe = regexp.MustCompile(`^\s+--\s+([A-Za-z ]+?[A-Za-z])\s*(\([^)]+\))?[,.]\s*`)
	// wordInfoFormRe extracts inflected forms (after "p. p.", "vb. n.", or
	// "p. pr.") from an entry's info text.
	wordInfoFormRe = regexp.MustCompile(`(?:p\. p\.|vb\. n\.|p\. pr\.) +([A-Z][a-z]+)[:;.,]`)
)

// state is the parser's position within the current entry.
type state int

const (
	// StateNone is before the first entry.
	StateNone state = iota
	// StateEntryInfo is at the beginning of the entry.
	StateEntryInfo
	// StateEntryExtra is unclassified text in the entry.
	StateEntryExtra
	// StateEntryMeaningText is inside an entry's meaning's text.
	StateEntryMeaningText
	// StateEntryMeaningExample is inside an entry's meaning's example.
	StateEntryMeaningExample
	// StateEntrySynonym is inside an entry's synonym list.
	StateEntrySynonym
	// StateEntryPhraseDefn is inside an entry's phrase definition list.
	StateEntryPhraseDefn
)

// Parse parses Project Gutenberg's Webster's Unabridged Dictionary.
66 | func Parse(r io.Reader, progress func(i int, w string)) (Dict, error) { 67 | var wd Dict 68 | var perr error 69 | sc := bufio.NewScanner(r) 70 | 71 | var state state 72 | var entry *Entry 73 | var meaning *EntryMeaning 74 | var i int 75 | for sc.Scan() { 76 | ln := sc.Bytes() 77 | lnt := bytes.TrimSpace(ln) 78 | blankLine := len(lnt) == 0 79 | 80 | if bytes.HasPrefix(lnt, []byte("*** END")) { 81 | break 82 | } 83 | 84 | if entryWordRe.Match(ln) { 85 | if state == StateNone { 86 | // skip the file header(up to the word "A") 87 | if !bytes.Equal(lnt, []byte{'A'}) { 88 | continue 89 | } 90 | } 91 | if bytes.Count(lnt, []byte{'-'}) != len(lnt) { 92 | // ^ if all dashes, it is a false positive 93 | if entry != nil { 94 | progress(len(wd), entry.Headword) 95 | } 96 | spl := strings.Split(string(bytes.ToLower(ln)), ";") 97 | entry = &Entry{Headword: strings.TrimSpace(spl[0])} 98 | if len(spl) > 1 { 99 | for _, v := range spl[1:] { 100 | if w := strings.TrimSpace(v); w != "" { 101 | entry.Variant = append(entry.Variant, w) 102 | } 103 | } 104 | } 105 | meaning = nil 106 | wd = append(wd, entry) 107 | state = StateEntryInfo 108 | continue 109 | } 110 | } 111 | 112 | switch state { 113 | case StateNone: 114 | // ignore any text before the first entry 115 | case StateEntryInfo: 116 | switch { 117 | case blankLine: 118 | for _, m := range wordInfoFormRe.FindAllStringSubmatch(entry.Info, -1) { 119 | entry.Variant = append(entry.Variant, strings.ToLower(m[1])) 120 | } 121 | // attempt to split into etymology 122 | if spl := strings.SplitN(entry.Info, " Etym: ", 2); len(spl) == 2 { 123 | entry.Info = strings.TrimSpace(spl[0]) 124 | entry.Etymology = strings.TrimSpace(spl[1]) 125 | } 126 | state = StateEntryExtra 127 | default: 128 | entry.Info += " " + string(lnt) 129 | } 130 | case StateEntryExtra: 131 | switch { 132 | case singleDefnStartRe.Match(ln): 133 | meaning = &EntryMeaning{Text: string(singleDefnStartRe.ReplaceAllLiteral(ln, nil))} 134 | entry.Meanings = 
append(entry.Meanings, meaning) 135 | state = StateEntryMeaningText 136 | case numberedDefnStartRe.Match(ln): 137 | meaning = &EntryMeaning{Text: string(numberedDefnStartRe.ReplaceAllLiteral(ln, nil))} 138 | entry.Meanings = append(entry.Meanings, meaning) 139 | state = StateEntryMeaningText 140 | case phraseDefnStartRe.Match(ln): 141 | meaning = nil 142 | entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1)))) 143 | entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1]))) 144 | state = StateEntryPhraseDefn 145 | case blankLine: 146 | // ignore 147 | default: 148 | entry.Extra += " " + string(lnt) 149 | } 150 | case StateEntryMeaningText: 151 | switch { 152 | case synStartRe.Match(ln): 153 | meaning = nil 154 | state = StateEntrySynonym 155 | case singleDefnStartRe.Match(ln): 156 | // if it is in any kind of definition (single/numbered), it is part of it. 157 | meaning.Text += " " + string(singleDefnStartRe.ReplaceAllLiteral(lnt, nil)) 158 | case numberedDefnStartRe.Match(ln): 159 | meaning = &EntryMeaning{Text: string(numberedDefnStartRe.ReplaceAllLiteral(ln, nil))} 160 | entry.Meanings = append(entry.Meanings, meaning) 161 | state = StateEntryMeaningText 162 | case phraseDefnStartRe.Match(ln): 163 | meaning = nil 164 | entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1)))) 165 | entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1]))) 166 | state = StateEntryPhraseDefn 167 | case len(meaning.Text) > 5 && len(lnt) < 55 && bytes.HasSuffix(lnt, []byte{'.'}) && !noteStartRe.Match(ln): 168 | // if there is already some body text, it is not a hard-wrapped 169 | // line, and it ends with a period, and is not a note, then it's 170 | // the last line of the text before the example. 
171 | meaning.Text += " " + string(lnt) 172 | state = StateEntryMeaningExample 173 | case blankLine: 174 | // ignore 175 | default: 176 | meaning.Text += " " + string(lnt) 177 | } 178 | case StateEntryMeaningExample: 179 | switch { 180 | case synStartRe.Match(ln): 181 | meaning = nil 182 | state = StateEntrySynonym 183 | case singleDefnStartRe.Match(ln): 184 | meaning = &EntryMeaning{Text: string(singleDefnStartRe.ReplaceAllLiteral(ln, nil))} 185 | entry.Meanings = append(entry.Meanings, meaning) 186 | state = StateEntryMeaningText 187 | case numberedDefnStartRe.Match(ln): 188 | meaning = &EntryMeaning{Text: string(numberedDefnStartRe.ReplaceAllLiteral(ln, nil))} 189 | entry.Meanings = append(entry.Meanings, meaning) 190 | state = StateEntryMeaningText 191 | case phraseDefnStartRe.Match(ln): 192 | meaning = nil 193 | entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1)))) 194 | entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1]))) 195 | state = StateEntryPhraseDefn 196 | case blankLine: 197 | // ignore 198 | default: 199 | if meaning.Example != "" { 200 | meaning.Example += " " 201 | } 202 | meaning.Example += string(lnt) 203 | } 204 | case StateEntrySynonym: 205 | switch { 206 | case blankLine: 207 | state = StateEntryExtra 208 | case synItemStartRe.Match(ln): 209 | entry.Synonyms = append(entry.Synonyms, string(synItemStartRe.ReplaceAllLiteral(ln, nil))) 210 | case len(entry.Synonyms) == 0: 211 | // there was a "Syn." 
without any valid synonyms under it 212 | state = StateEntryExtra 213 | case phraseDefnStartRe.Match(ln): 214 | meaning = nil 215 | entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1)))) 216 | entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1]))) 217 | state = StateEntryPhraseDefn 218 | default: 219 | entry.Synonyms[len(entry.Synonyms)-1] += " " + string(lnt) 220 | } 221 | case StateEntryPhraseDefn: 222 | switch { 223 | case phraseDefnStartRe.Match(ln): 224 | meaning = nil 225 | entry.PhraseDefns = append(entry.PhraseDefns, string(bytes.TrimSpace(bytes.Replace(lnt, []byte("--"), nil, 1)))) 226 | entry.Variant = append(entry.Variant, string(bytes.ToLower(phraseDefnStartRe.FindSubmatch(ln)[1]))) 227 | state = StateEntryPhraseDefn 228 | case blankLine: 229 | // allow a blank line to end it for reducing the chance of bugs. 230 | state = StateEntryExtra 231 | default: 232 | // phrase definitions are always last, so no need for checking 233 | // for any other state changes (e.g. the start of a numbered 234 | // definition) (and the previous case should deal with any 235 | // edge-cases). 
236 | entry.PhraseDefns[len(entry.PhraseDefns)-1] += " " + string(lnt) 237 | } 238 | } 239 | 240 | if i%10000 == 0 { 241 | debug.FreeOSMemory() // hack to try and limit memory usage 242 | } 243 | i++ 244 | } 245 | 246 | if serr := sc.Err(); serr != nil { 247 | return nil, serr 248 | } 249 | if perr != nil { 250 | return nil, perr 251 | } 252 | return wd, nil 253 | } 254 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/pgaskin/dictutil 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/disintegration/imaging v1.6.2 7 | github.com/mattn/go-sqlite3 v2.0.3+incompatible 8 | github.com/pgaskin/koboutils/v2 v2.1.0 9 | github.com/pmezard/go-difflib v1.0.0 // indirect 10 | github.com/russross/blackfriday/v2 v2.0.1 11 | github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect 12 | github.com/spf13/pflag v1.0.5 13 | gopkg.in/yaml.v2 v2.2.8 14 | ) 15 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/disintegration/imaging v1.6.2 h1:w1LecBlG2Lnp8B3jk5zSuNqd7b4DXhcjwek1ei82L+c= 2 | github.com/disintegration/imaging v1.6.2/go.mod h1:44/5580QXChDfwIclfc/PCwrr44amcmDAg8hxG0Ewe4= 3 | github.com/mattn/go-sqlite3 v2.0.3+incompatible h1:gXHsfypPkaMZrKbD5209QV9jbUTJKjyR5WD3HYQSd+U= 4 | github.com/mattn/go-sqlite3 v2.0.3+incompatible/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= 5 | github.com/pgaskin/koboutils/v2 v2.1.0 h1:J5KzLWvj0zDvoP5aJ7RxWuzFA32CcnD+hqH6tw/3uRE= 6 | github.com/pgaskin/koboutils/v2 v2.1.0/go.mod h1:wTzkDIlsxmUyfwfspGcm0Ap+HOxSUYV0S8kMYrf+0gM= 7 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 8 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 9 | github.com/russross/blackfriday/v2 v2.0.1 
h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= 10 | github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 11 | github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= 12 | github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= 13 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= 14 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 15 | golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 h1:hVwzHzIUGRjiF7EcUjqNxk3NCfkPxbDKRdnNE1Rpg0U= 16 | golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= 17 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 18 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 19 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 20 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= 21 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 22 | -------------------------------------------------------------------------------- /kobodict/crypt.go: -------------------------------------------------------------------------------- 1 | package kobodict 2 | 3 | import ( 4 | "bytes" 5 | "crypto/aes" 6 | "crypto/cipher" 7 | "fmt" 8 | ) 9 | 10 | // Crypter represents a symmetric dictionary encryption method. 11 | type Crypter interface { 12 | Encrypter 13 | Decrypter 14 | } 15 | 16 | // CryptMethodAES represents AES-128-ECB encryption with PKCS#7 padding. 17 | const CryptMethodAES string = "aes" 18 | 19 | // NewCrypter creates the specified type of Crypter with the specified key. 
20 | func NewCrypter(method string, key []byte) (Crypter, error) { 21 | switch method { 22 | case CryptMethodAES: 23 | c, err := newCryptAES(key) 24 | return c, err 25 | default: 26 | return nil, fmt.Errorf("unknown encryption method %#v", method) 27 | } 28 | } 29 | 30 | type cryptAES struct { 31 | b cipher.Block 32 | } 33 | 34 | func newCryptAES(key []byte) (*cryptAES, error) { 35 | if b, err := aes.NewCipher(key); err != nil { 36 | return nil, err 37 | } else { 38 | return &cryptAES{b}, nil 39 | } 40 | } 41 | 42 | // Encrypt implements Encrypter. 43 | func (c *cryptAES) Encrypt(buf []byte) ([]byte, error) { 44 | if dst, err := cryptPKCS7Pad(buf, aes.BlockSize); err != nil { 45 | return nil, err 46 | } else if dst, err = cryptAES128ECBEncrypt(c.b, dst); err != nil { 47 | return nil, err 48 | } else { 49 | return dst, nil 50 | } 51 | } 52 | 53 | // Decrypt implements Decrypter. 54 | func (c *cryptAES) Decrypt(buf []byte) ([]byte, error) { 55 | if dst, err := cryptAES128ECBDecrypt(c.b, buf); err != nil { 56 | return nil, err 57 | } else if dst, err := cryptPKCS7Unpad(dst, aes.BlockSize); err != nil { 58 | return nil, err 59 | } else { 60 | return dst, nil 61 | } 62 | } 63 | 64 | func cryptPKCS7Unpad(src []byte, blockSize int) ([]byte, error) { 65 | if blockSize > 0xFF || blockSize < 0x00 { 66 | return nil, fmt.Errorf("block size %d out of bounds", blockSize) 67 | } else if len(src)%blockSize != 0 || len(src) == 0 { 68 | return nil, fmt.Errorf("data length %d is empty or not a multiple of block size %d", len(src), blockSize) 69 | } 70 | plen := int(src[len(src)-1]) 71 | if len(src) <= plen { 72 | return nil, fmt.Errorf("invalid padding: padding length %d out of bounds", plen) 73 | } 74 | for _, v := range src[len(src)-plen:] { 75 | if int(v) != plen { 76 | return nil, fmt.Errorf("invalid padding: expected %d, got %d", plen, v) 77 | } 78 | } 79 | return src[:len(src)-plen], nil 80 | } 81 | 82 | func cryptPKCS7Pad(src []byte, blockSize int) ([]byte, error) { 83 | if 
blockSize > 0xFF || blockSize < 0x00 { 84 | return nil, fmt.Errorf("block size %d out of bounds", blockSize) 85 | } 86 | plen := blockSize - len(src)%blockSize 87 | return append(src, bytes.Repeat([]byte{byte(plen)}, plen)...), nil 88 | } 89 | 90 | func cryptAES128ECBDecrypt(cb cipher.Block, src []byte) ([]byte, error) { 91 | if len(src)%aes.BlockSize != 0 { 92 | return nil, fmt.Errorf("src not a multiple of block size %d", aes.BlockSize) 93 | } 94 | dst := make([]byte, len(src)) 95 | for i := aes.BlockSize; i <= len(src); i += aes.BlockSize { 96 | cb.Decrypt(dst[i-aes.BlockSize:i], src[i-aes.BlockSize:i]) 97 | } 98 | return dst, nil 99 | } 100 | 101 | func cryptAES128ECBEncrypt(cb cipher.Block, src []byte) ([]byte, error) { 102 | if len(src)%aes.BlockSize != 0 { 103 | return nil, fmt.Errorf("src not a multiple of block size %d", aes.BlockSize) 104 | } 105 | dst := make([]byte, len(src)) 106 | for i := aes.BlockSize; i <= len(src); i += aes.BlockSize { 107 | cb.Encrypt(dst[i-aes.BlockSize:i], src[i-aes.BlockSize:i]) 108 | } 109 | return dst, nil 110 | } 111 | -------------------------------------------------------------------------------- /kobodict/crypt_test.go: -------------------------------------------------------------------------------- 1 | package kobodict 2 | 3 | // TODO(v1) 4 | -------------------------------------------------------------------------------- /kobodict/fs.go: -------------------------------------------------------------------------------- 1 | package kobodict 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "io/ioutil" 8 | "os" 9 | "path/filepath" 10 | "strings" 11 | "unicode/utf8" 12 | ) 13 | 14 | // Unpack is a helper function to unpack the contents of a Reader to a folder 15 | // on-disk. The provided dir must be non-existent. Unpack will not close the 16 | // reader. 
17 | func Unpack(r *Reader, dir string) error { 18 | if _, err := os.Stat(dir); !os.IsNotExist(err) { 19 | return fmt.Errorf("dir %#v already exists", dir) 20 | } 21 | if err := os.Mkdir(dir, 0755); err != nil { 22 | return fmt.Errorf("create dir %#v: %w", dir, err) 23 | } 24 | for _, f := range r.File { 25 | if err := unpackFile(dir, f.Open, f.Name); err != nil { 26 | return fmt.Errorf("unpack file %#v: %w", f.Name, err) 27 | } 28 | } 29 | for _, f := range r.Dicthtml { 30 | if err := unpackFile(dir, f.Open, f.Name); err != nil { 31 | return fmt.Errorf("unpack dicthtml %#v (prefix: %s): %w", f.Name, f.Prefix, err) 32 | } 33 | } 34 | if err := ioutil.WriteFile(filepath.Join(dir, "words"), []byte(strings.Join(r.Word, "\n")), 0644); err != nil { 35 | return fmt.Errorf("write words file: %w", err) 36 | } 37 | return nil 38 | } 39 | 40 | func unpackFile(dir string, open func() (io.ReadCloser, error), name string) error { 41 | fr, err := open() 42 | if err != nil { 43 | return fmt.Errorf("read contents: %w", err) 44 | } 45 | defer fr.Close() 46 | 47 | fw, err := os.OpenFile(filepath.Join(dir, name), os.O_CREATE|os.O_WRONLY|os.O_EXCL, 0644) 48 | if err != nil { 49 | return fmt.Errorf("create output file: %w", err) 50 | } 51 | defer fw.Close() 52 | 53 | if _, err := io.Copy(fw, fr); err != nil { 54 | return fmt.Errorf("write output file: %w", err) 55 | } 56 | 57 | if err := fw.Close(); err != nil { 58 | return fmt.Errorf("write output file: %w", err) 59 | } 60 | 61 | return nil 62 | } 63 | 64 | // Pack is a helper function to pack the contents a folder unpacked using Unpack 65 | // into a Writer. It is assumed that the writer has not been used. The provided 66 | // file will be overwritten if it exists and is a regular file, or created if it 67 | // doesn't exist. Pack will not close the writer. 
68 | func Pack(w *Writer, dir string) error { 69 | if fi, err := os.Stat(filepath.Join(dir, "words")); os.IsNotExist(err) || (err == nil && fi.IsDir()) { 70 | return fmt.Errorf("dir %#v is not an unpacked dictzip (no words file)", dir) 71 | } 72 | 73 | fis, err := ioutil.ReadDir(dir) // note: this is sorted 74 | if err != nil { 75 | return fmt.Errorf("read dir %#v: %w", dir, err) 76 | } 77 | 78 | for _, fi := range fis { 79 | switch { 80 | case fi.IsDir(): 81 | return fmt.Errorf("invalid dir %#v: dirs are not supported", fi.Name()) 82 | case fi.Name() == "words": 83 | continue 84 | case strings.HasSuffix(fi.Name(), ".html"): 85 | if err := func() error { 86 | fr, err := os.OpenFile(filepath.Join(dir, fi.Name()), os.O_RDONLY, 0) 87 | if err != nil { 88 | return fmt.Errorf("open file: %w", err) 89 | } 90 | defer fr.Close() 91 | 92 | tmp := make([]byte, 2) 93 | if _, err := fr.Read(tmp); err != nil { 94 | return fmt.Errorf("read file: %w", err) 95 | } else if tmp[0] == 0x1F && tmp[1] == 0x8B { 96 | return fmt.Errorf("invalid unpacked dicthtml file: already compressed") 97 | } else if _, err := fr.Seek(0, os.SEEK_SET); err != nil { 98 | return fmt.Errorf("read file: %w", err) 99 | } 100 | 101 | fw, err := w.CreateDicthtml(strings.TrimSuffix(fi.Name(), ".html")) 102 | if err != nil { 103 | return fmt.Errorf("create dictzip entry: %w", err) 104 | } 105 | 106 | if _, err := io.Copy(fw, fr); err != nil { 107 | return fmt.Errorf("write file: %w", err) 108 | } 109 | 110 | return nil 111 | }(); err != nil { 112 | return fmt.Errorf("add dicthtml %#v: %w", fi.Name(), err) 113 | } 114 | default: 115 | if err := func() error { 116 | fr, err := os.OpenFile(filepath.Join(dir, fi.Name()), os.O_RDONLY, 0) 117 | if err != nil { 118 | return fmt.Errorf("open file: %w", err) 119 | } 120 | defer fr.Close() 121 | 122 | fw, err := w.CreateFile(strings.TrimSuffix(fi.Name(), ".html")) 123 | if err != nil { 124 | return fmt.Errorf("create dictzip entry: %w", err) 125 | } 126 | 127 | if _, err 
:= io.Copy(fw, fr); err != nil { 128 | return fmt.Errorf("write file: %w", err) 129 | } 130 | 131 | return nil 132 | }(); err != nil { 133 | return fmt.Errorf("add file %#v: %w", fi.Name(), err) 134 | } 135 | } 136 | } 137 | 138 | if err := func() error { 139 | fr, err := os.OpenFile(filepath.Join(dir, "words"), os.O_RDONLY, 0) 140 | if err != nil { 141 | return fmt.Errorf("open words file: %w", err) 142 | } 143 | defer fr.Close() 144 | 145 | sc := bufio.NewScanner(fr) 146 | for sc.Scan() { 147 | if !utf8.Valid(sc.Bytes()) { 148 | return fmt.Errorf("invalid word: %#v", sc.Text()) 149 | } 150 | if word := strings.TrimSpace(sc.Text()); len(word) != 0 { 151 | if err := w.AddWord(word); err != nil { 152 | return fmt.Errorf("add word %#v: %s", word, err) 153 | } 154 | } 155 | } 156 | if sc.Err() != nil { 157 | return fmt.Errorf("read words file: %w", err) 158 | } 159 | 160 | return nil 161 | }(); err != nil { 162 | return fmt.Errorf("add words index: %w", err) 163 | } 164 | 165 | return nil 166 | } 167 | -------------------------------------------------------------------------------- /kobodict/fs_test.go: -------------------------------------------------------------------------------- 1 | package kobodict 2 | 3 | // TODO(v1) 4 | -------------------------------------------------------------------------------- /kobodict/marisa.go: -------------------------------------------------------------------------------- 1 | package kobodict 2 | 3 | import "io" 4 | 5 | // Marisa is used by Reader and Writer for reading/writing Marisa tries. It is 6 | // automatically set on supported platforms if 7 | // github.com/pgaskin/dictutil/kobodict/marisa is imported, but can be 8 | // overridden manually. 9 | var Marisa interface { 10 | MarisaReader 11 | MarisaWriter 12 | } 13 | 14 | // MarisaReader represents a simplified abstraction for reading Marisa tries. 
type MarisaReader interface {
	// ReadAll reads a trie from the reader and returns its words. NewReader
	// uses it to fill Reader.Word from the "words" entry of a dictzip.
	ReadAll(io.Reader) ([]string, error)
}

// MarisaWriter represents a simplified abstraction for writing Marisa tries.
type MarisaWriter interface {
	// WriteAll writes a trie built from the given words to the writer.
	// Writer.Close passes it the sorted, de-duplicated index words.
	WriteAll(io.Writer, []string) error
}

--------------------------------------------------------------------------------
/kobodict/marisa/marisa.go:
--------------------------------------------------------------------------------
// Package marisa is imported with _ to enable marisa for the kobodict, if
// supported. It is in a separate package so functions in kobodict which don't
// require marisa can be used without compiling it. As an alternative to
// importing this package, you can provide your own implementation of marisa in
// kobodict.Marisa. If imported, this package will fail to compile unless marisa
// is available for your GOOS/GOARCH.
package marisa

import "github.com/pgaskin/dictutil/kobodict"

// This is done so it can still be instantiated even if not implemented for the
// current platform (it will be caught when assigning it to kobodict.Marisa),
// named platform for better error messages.
type platform struct{} // the methods are supplied by build-tagged files (see marisa_cgo.go)

// init registers the platform implementation as the marisa backend used by
// package kobodict.
func init() {
	kobodict.Marisa = new(platform) // platform-specific implementation
}

--------------------------------------------------------------------------------
/kobodict/marisa/marisa_cgo.go:
--------------------------------------------------------------------------------
//+build cgo

package marisa

import (
	"io"

	"github.com/pgaskin/dictutil/marisa"
)

// ReadAll implements kobodict.MarisaReader by delegating to marisa.ReadAll.
func (*platform) ReadAll(r io.Reader) (wd []string, err error) {
	return marisa.ReadAll(r)
}

// WriteAll implements kobodict.MarisaWriter by delegating to marisa.WriteAll.
func (*platform) WriteAll(w io.Writer, wd []string) (err error) {
	return marisa.WriteAll(w, wd)
}

--------------------------------------------------------------------------------
/kobodict/marisa/marisa_test.go:
--------------------------------------------------------------------------------
package marisa

import (
	"bytes"
	"crypto/sha1"
	"encoding/hex"
	"io"
	"reflect"
	"runtime"
	"testing"

	"github.com/pgaskin/dictutil/kobodict"
)

// TestMarisa round-trips a small word list through the platform
// implementation and, on amd64, pins the sha1 of the written trie.
func TestMarisa(t *testing.T) {
	// The assertion goes through interface{} so the test still compiles (and
	// is skipped) when platform has no ReadAll/WriteAll methods for this build.
	impl, ok := (interface{})(new(platform)).(interface {
		kobodict.MarisaReader
		kobodict.MarisaWriter
	})
	if !ok {
		t.Skipf("warning: Marisa not supported on platform GOOS=%s GOARCH=%s and must be provided externally", runtime.GOOS, runtime.GOARCH)
	}

	w := []string{
		"asd",
		"dfg",
		"sdf",
	}

	buf := bytes.NewBuffer(nil)
	if err := impl.WriteAll(buf, w); err != nil {
		t.Fatalf("unexpected error when writing trie: %v", err)
	} else if buf.Len() == 0 {
		t.Errorf("written trie is empty")
	}

	ss := sha1.New()

	// Hash the bytes as they are read back, so the written trie itself can be
	// checked further down.
	nw, err := impl.ReadAll(io.TeeReader(buf, ss))
	if err != nil {
		t.Fatalf("unexpected error when reading written trie: %v", err)
	} else if len(nw) == 0 {
		t.Errorf("read trie is empty")
	} else if !reflect.DeepEqual(nw, w) {
		t.Errorf("read tree: expected %+s, got %+s", w, nw)
	}

	if runtime.GOARCH == "amd64" {
		if x, y := hex.EncodeToString(ss.Sum(nil)), "ea7252fc4e86585dea884e4bcb5ce7be90676474"; x != y {
			t.Errorf("trie output is incorrect or non-determinstic, expected sha1 %s, got %s", y, x)
		}
	} else {
		t.Logf("skipping sha1 check on non-amd64 architecture, as the correct file differs slightly on each one (usually by ~4 bytes)")
	}
}

--------------------------------------------------------------------------------
/kobodict/reader.go:
--------------------------------------------------------------------------------
package kobodict

import (
	"archive/zip"
	"bytes"
	"compress/gzip"
	"fmt"
	"io"
	"io/ioutil"
	"strings"
)

// Reader provides access to the contents of a dictzip file.
type Reader struct {
	Word     []string          // words read from the "words" index
	Dicthtml []*ReaderDicthtml // root-level entries ending in ".html"
	File     []*ReaderFile     // every other regular root-level entry
	z        *zip.Reader
	d        Decrypter // optional; set with SetDecrypter
}

// ReaderDicthtml represents a dicthtml file from a Reader.
type ReaderDicthtml struct {
	Name   string // zip entry name, including the ".html" suffix
	Prefix string // Name with the ".html" suffix removed
	f      *zip.File
	r      *Reader
}

// ReaderFile represents a raw file from a Reader (e.g. images).
type ReaderFile struct {
	Name string // zip entry name
	f    *zip.File
	r    *Reader
}

// Decrypter decrypts dicthtml files.
type Decrypter interface {
	// Decrypt decrypts the dicthtml bytes. It will only be called if the
	// dicthtml is not otherwise readable. An error should be returned if the
	// decryption itself encounters an error; the decryptor should not try to
	// judge if the resulting bytes are valid.
	Decrypt([]byte) ([]byte, error)
}

// NewReader returns a new dictzip reader which reads from r, with the given
// file size.
48 | func NewReader(r io.ReaderAt, size int64) (*Reader, error) { 49 | zr, err := zip.NewReader(r, size) 50 | if err != nil { 51 | return nil, fmt.Errorf("open zip: %w", err) 52 | } 53 | 54 | kr := &Reader{ 55 | z: zr, 56 | } 57 | 58 | var found bool 59 | for _, zf := range zr.File { 60 | if zf.Name == "words" { 61 | if fr, err := zf.Open(); err != nil { 62 | return nil, fmt.Errorf("open words index: %w", err) 63 | } else if Marisa == nil { 64 | return nil, fmt.Errorf("no marisa bindings found") 65 | } else if kr.Word, err = Marisa.ReadAll(fr); err != nil { 66 | return nil, fmt.Errorf("read words index: %w", err) 67 | } 68 | found = true 69 | break 70 | } 71 | } 72 | if !found { 73 | return nil, fmt.Errorf("not a dictzip: no words index found") 74 | } 75 | 76 | for _, f := range zr.File { 77 | switch { 78 | case !f.Mode().IsRegular(): 79 | continue 80 | case f.Name == "words": 81 | continue 82 | case strings.Contains(f.Name, "/"): 83 | return nil, fmt.Errorf("read zip: illegal file %#v: contains slash (not in root dir)", f.Name) 84 | case strings.HasSuffix(f.Name, ".html"): 85 | kr.Dicthtml = append(kr.Dicthtml, &ReaderDicthtml{ 86 | Name: f.Name, 87 | Prefix: strings.TrimSuffix(f.Name, ".html"), 88 | f: f, 89 | r: kr, 90 | }) 91 | default: 92 | kr.File = append(kr.File, &ReaderFile{ 93 | Name: f.Name, 94 | f: f, 95 | r: kr, 96 | }) 97 | } 98 | } 99 | 100 | return kr, nil 101 | } 102 | 103 | // SetDecrypter sets the Decrypter used to decrypt encrypted dicthtml files. 104 | func (r *Reader) SetDecrypter(d Decrypter) { 105 | r.d = d 106 | } 107 | 108 | // Open returns an io.ReadCloser which reads the decoded dicthtml file. Multiple 109 | // files can be read at once. 
110 | func (f *ReaderDicthtml) Open() (io.ReadCloser, error) { 111 | enc, err := func() (bool, error) { 112 | fr, err := f.f.Open() 113 | if err != nil { 114 | return false, fmt.Errorf("open zip entry: %v", err) 115 | } 116 | defer fr.Close() 117 | 118 | tmp := make([]byte, 2) 119 | if n, err := fr.Read(tmp); err != nil { 120 | return false, fmt.Errorf("read zip entry: %v", err) 121 | } else if n != len(tmp) { 122 | return false, fmt.Errorf("corrupt dicthtml: too short (%d)", n) 123 | } 124 | 125 | if tmp[0] == 0x1F && tmp[1] == 0x8B { 126 | return false, nil 127 | } 128 | 129 | if f.r.d == nil { 130 | return true, fmt.Errorf("corrupt or encrypted dicthtml: invalid header") 131 | } 132 | 133 | // maybe optimize this later? 134 | if buf, err := ioutil.ReadAll(io.MultiReader(bytes.NewReader(tmp), fr)); err != nil { 135 | return true, fmt.Errorf("read zip entry: %v", err) 136 | } else if dec, err := f.r.d.Decrypt(buf); err != nil { 137 | return true, fmt.Errorf("decrypt dicthtml: %v", err) 138 | } else if dec[0] != 0x1F || dec[1] != 0x8B { 139 | return true, fmt.Errorf("corrupt dicthtml or invalid encryption key: invalid header") 140 | } 141 | return true, nil 142 | }() 143 | if err != nil { 144 | return nil, err 145 | } 146 | 147 | fr, err := f.f.Open() 148 | if err != nil { 149 | return nil, fmt.Errorf("open zip entry: %v", err) 150 | } 151 | 152 | var dr io.Reader 153 | if enc { 154 | if buf, err := ioutil.ReadAll(fr); err != nil { 155 | return nil, fmt.Errorf("read zip entry: %v", err) 156 | } else if dec, err := f.r.d.Decrypt(buf); err != nil { 157 | return nil, fmt.Errorf("decrypt dicthtml: %v", err) 158 | } else if dec[0] != 0x1F || dec[1] != 0x8B { 159 | return nil, fmt.Errorf("corrupt dicthtml or invalid encryption key: invalid header") 160 | } else { 161 | dr = bytes.NewReader(dec) 162 | } 163 | } else { 164 | dr = fr 165 | } 166 | 167 | zr, err := gzip.NewReader(dr) 168 | if err != nil { 169 | return nil, fmt.Errorf("decompress dicthtml: %v", err) 170 | } 
171 | 172 | return &funcReadCloser{ 173 | Reader: zr, 174 | Closer: func() error { 175 | if err := zr.Close(); err != nil { 176 | fr.Close() 177 | return err 178 | } 179 | return fr.Close() 180 | }, 181 | }, nil 182 | } 183 | 184 | // Open returns an io.ReadCloser which reads the contents of the file. Multiple 185 | // files can be read at once. 186 | func (f *ReaderFile) Open() (io.ReadCloser, error) { 187 | return f.f.Open() 188 | } 189 | 190 | type funcReadCloser struct { 191 | io.Reader 192 | Closer func() error 193 | } 194 | 195 | func (f *funcReadCloser) Close() error { 196 | if f.Closer != nil { 197 | return f.Closer() 198 | } 199 | return nil 200 | } 201 | -------------------------------------------------------------------------------- /kobodict/reader_test.go: -------------------------------------------------------------------------------- 1 | package kobodict 2 | 3 | // TODO(v1) 4 | -------------------------------------------------------------------------------- /kobodict/util.go: -------------------------------------------------------------------------------- 1 | // Package kobodict implements reading, writing, and other utilities for Kobo 2 | // dictionaries (v2). 3 | // 4 | // A marisa implementation must be provided by 5 | // github.com/pgaskin/kobodict/marisa or a custom one if Writer or Reader is 6 | // used. 7 | package kobodict 8 | 9 | import ( 10 | "strings" 11 | "unicode" 12 | ) 13 | 14 | // NormalizeWordReference normalizes a word for use in an dicthtml headword 15 | // (
= 2 || c == '\x00' { // limit to 2 chars, also cut at null 49 | pfx = pfx[:i] // trim up to current char 50 | break 51 | } 52 | pfx[i] = unicode.ToLower(c) // this includes accented chars 53 | } 54 | 55 | for len(pfx) != 0 { 56 | if unicode.IsSpace(pfx[0]) { 57 | pfx = pfx[1:] // trim left space 58 | } else { 59 | break 60 | } 61 | } 62 | 63 | for len(pfx) != 0 { 64 | if unicode.IsSpace(pfx[len(pfx)-1]) { 65 | pfx = pfx[:len(pfx)-1] // trim right space 66 | } else { 67 | break 68 | } 69 | } 70 | 71 | if len(pfx) == 0 { 72 | return "11" // if empty, return "11" 73 | } 74 | 75 | if !unicode.Is(unicode.Cyrillic, pfx[0]) { 76 | for len(pfx) < 2 { 77 | pfx = append(pfx, 'a') // pad right with 'a's to 2 chars 78 | } 79 | if !unicode.IsLetter(pfx[0]) || !unicode.IsLetter(pfx[1]) { 80 | return "11" // if neither of the first 2 chars are letters, return "11" 81 | } 82 | } 83 | 84 | return string(pfx) 85 | } 86 | 87 | // wordPrefix gets the prefix of a word for sharding dicthtml files. 88 | // 89 | // This is not to be used with Kanji, as those are handled by a separate 90 | // function for Japanese dictionaries. 91 | // 92 | // The logic is reversed from DictionaryParser::htmlForWord in libnickel. It 93 | // matches it as closely as possible. 94 | func wordPrefix(w string) string { 95 | // w 96 | // QString::toLower() 97 | w = strings.ToLower(w) 98 | 99 | // QString::leftRef(2) 100 | if len(w) > 2 { 101 | w = string([]rune(w)[:2]) 102 | } 103 | 104 | // QString::trimmed() 105 | w = strings.TrimSpace(w) 106 | 107 | // simplify the following code by converting to rune slice 108 | r := []rune(w) 109 | 110 | // A null byte is a valid Unicode character, but in C, it's treated as 111 | // the end of a string. To keep compatibility with libnickel, we need to 112 | // end a string there if necessary. 
113 | for i, c := range r { 114 | if c == '\x00' { 115 | r = r[:i] 116 | break 117 | } 118 | } 119 | 120 | // DictionaryParser::isCyrillic(w[0]) 121 | // skip if true 122 | if !(len(r) != 0 && unicode.Is(unicode.Cyrillic, r[0])) { 123 | // add an 'a' for right padding if not 2 chars 124 | if len(r) != 2 { 125 | r = append(r, 'a') 126 | } 127 | } 128 | 129 | // DictionaryParser::isCyrillic(w[0]) 130 | // skip if != false 131 | switch { 132 | case !(len(r) != 0 && unicode.Is(unicode.Cyrillic, r[0])): 133 | // inlined QChar::isLetter(w[0]), QChar::isLetter(w[1]), unnecessary length check 134 | // skip if both true 135 | if (len(r) >= 1 && unicode.IsLetter(r[0])) && (len(r) >= 2 && unicode.IsLetter(r[1])) { 136 | break 137 | } 138 | fallthrough 139 | case len(r) == 0: 140 | // w = QString::fromLatin1_helper("11"..., 2) 141 | return "11" 142 | } 143 | 144 | return string(r) 145 | } 146 | -------------------------------------------------------------------------------- /kobodict/util_test.go: -------------------------------------------------------------------------------- 1 | package kobodict 2 | 3 | import ( 4 | "strconv" 5 | "testing" 6 | ) 7 | 8 | func TestNormalizeWordReference(t *testing.T) { 9 | for _, tc := range []struct { 10 | v bool 11 | i, o string 12 | }{ 13 | {true, "Asd", "asd"}, 14 | {false, "Asd", "Asd"}, 15 | {true, " Asd", "asd"}, 16 | {false, " Asd", "Asd"}, 17 | {true, " Asd ", "asd"}, 18 | {false, " Asd ", "Asd"}, 19 | {true, " Asd \n", "asd"}, 20 | {false, " Asd \n", "Asd"}, 21 | {true, " Ȃsd \n", "ȃsd"}, 22 | {false, " Ȃsd \n", "Ȃsd"}, 23 | } { 24 | t.Logf("word %#v [variant:%t] (%#v)", tc.i, tc.v, tc.o) 25 | if o := NormalizeWordReference(tc.i, tc.v); o != tc.o { 26 | t.Errorf(" got %#v", o) 27 | } 28 | } 29 | } 30 | 31 | var tcs = []struct{ w, p string }{ 32 | // dicthtml-en 33 | {"test", "te"}, 34 | {"a-", "11"}, 35 | {"-an", "11"}, 36 | {"GB", "gb"}, 37 | 38 | // dicthtml-fr 39 | {"ébahir", "éb"}, 40 | {"à", "àa"}, 41 | {"a1", "11"}, 42 | {"ô", 
"ôa"}, 43 | {"kébab", "ké"}, 44 | {"aérer", "aé"}, 45 | {"living-room", "li"}, 46 | 47 | // dicthtml-ja 48 | // Note, Kanji not currently implemented, so not testing (note, the logic 49 | // is in a separate function, anyways). 50 | // {"あ", "あ"}, 51 | // {"アークとう", "アー"}, 52 | 53 | // generated by dictword-test: spaces 54 | {" x", "xa"}, 55 | {" ", "11"}, 56 | {"x ", "xa"}, 57 | {" ", "11"}, 58 | {" ", "11"}, 59 | {"\t\t", "11"}, 60 | {"\t\f\t", "11"}, 61 | {"x ", "xa"}, 62 | {" xx", "xa"}, 63 | 64 | // generated by dictword-test: spaces where trim/prefix order matters 65 | {" x", "11"}, 66 | {" xy", "11"}, 67 | {" xyz", "11"}, 68 | {"x z", "xa"}, 69 | {"x z", "xa"}, 70 | 71 | // generated by dictword-test: cyrillic 72 | {" д", "д"}, 73 | {"д ", "д"}, 74 | {" ", "11"}, 75 | {" ", "11"}, 76 | {" ", "11"}, 77 | {" дд", "д"}, 78 | {"д ", "д"}, 79 | {"д", "д"}, 80 | {"aд", "aд"}, 81 | {"дa", "дa"}, 82 | {"aдa", "aд"}, 83 | {"дaд", "дa"}, 84 | 85 | // generated by dictword-test: uppercase accented letters 86 | {"Ȅe", "ȅe"}, 87 | {"eȄ", "eȅ"}, 88 | {"Ȅ", "ȅa"}, 89 | {"Ȅ!", "11"}, 90 | 91 | // generated by dictword-test: cjk 92 | {" 未", "未a"}, 93 | {" 未", "11"}, 94 | {"未", "未a"}, 95 | {"未未", "未未"}, 96 | {"x未", "x未"}, 97 | {"未x", "未x"}, 98 | {"xy未", "xy"}, 99 | {"还没", "还没"}, 100 | 101 | // generated by dictword-test: misc 102 | {"!", "11"}, 103 | {"!!", "11"}, 104 | {"!!!", "11"}, 105 | {"x!", "11"}, 106 | {"x!!", "11"}, 107 | {"xx!", "xx"}, 108 | {"xxx!", "xx"}, 109 | {" !", "11"}, 110 | {" !!", "11"}, 111 | {" !!!", "11"}, 112 | {" !", "11"}, 113 | {" !!", "11"}, 114 | {" !!!", "11"}, 115 | {" x!", "xa"}, 116 | {" x!!", "xa"}, 117 | {" xx!", "xa"}, 118 | {" xxx!", "xa"}, 119 | 120 | // synthetic 121 | {"x\x00y", "xa"}, 122 | {"\x00xy", "11"}, 123 | } 124 | 125 | func TestWordPrefix(t *testing.T) { 126 | for _, tc := range tcs { 127 | t.Logf("word %#v (%#v)", tc.w, tc.p) 128 | if p := wordPrefix(tc.w); p != tc.p { 129 | t.Errorf(" got (original version) %#v", p) 130 | } 
131 | if p := WordPrefix(tc.w); p != tc.p { 132 | t.Errorf(" got (simplified version) %#v", p) 133 | } 134 | } 135 | } 136 | 137 | func BenchmarkWordPrefix(b *testing.B) { 138 | for _, tcf := range []struct { 139 | n string 140 | fn func(string) string 141 | }{ 142 | {"Orig/", wordPrefix}, 143 | {"Smpl/", WordPrefix}, 144 | } { 145 | // all test cases 146 | b.Run(tcf.n+"All"+strconv.Itoa(len(tcs)), func(b *testing.B) { 147 | for i := 0; i < b.N; i++ { 148 | for _, tc := range tcs { 149 | tcf.fn(tc.w) 150 | } 151 | } 152 | }) 153 | 154 | // near-worst possible case 155 | b.Run(tcf.n+"Worst", func(b *testing.B) { 156 | for i := 0; i < b.N; i++ { 157 | tcf.fn(" 还д 没") 158 | } 159 | }) 160 | 161 | // normal case 162 | b.Run(tcf.n+"Normal", func(b *testing.B) { 163 | for i := 0; i < b.N; i++ { 164 | tcf.fn("Test") 165 | } 166 | }) 167 | 168 | // best case 169 | b.Run(tcf.n+"Best", func(b *testing.B) { 170 | for i := 0; i < b.N; i++ { 171 | tcf.fn("aa") 172 | } 173 | }) 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /kobodict/writer.go: -------------------------------------------------------------------------------- 1 | package kobodict 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "compress/gzip" 7 | "fmt" 8 | "io" 9 | "sort" 10 | "strings" 11 | ) 12 | 13 | // Writer creates dictzips. It does not do any validation; it only does what it 14 | // is told. It is up to the user to ensure the input is valid. 15 | type Writer struct { 16 | z *zip.Writer 17 | e Encrypter 18 | words map[string]struct{} // doesn't take up space for values 19 | used map[string]struct{} 20 | closed bool 21 | last io.WriteCloser 22 | } 23 | 24 | // Encrypter encrypts dicthtml files. 25 | type Encrypter interface { 26 | // Encrypt encrypts the provided bytes. 27 | Encrypt([]byte) ([]byte, error) 28 | } 29 | 30 | // NewWriter creates a dictzip writer writing to w. 
31 | func NewWriter(w io.Writer) *Writer { 32 | return &Writer{ 33 | z: zip.NewWriter(w), 34 | words: map[string]struct{}{}, 35 | used: map[string]struct{}{}, 36 | } 37 | } 38 | 39 | // AddWord normalizes and adds a word to the index. If the word has already been 40 | // added, it does nothing. 41 | func (w *Writer) AddWord(word string) error { 42 | if w.closed { 43 | return fmt.Errorf("write to closed writer") 44 | } 45 | w.words[strings.TrimSpace(word)] = struct{}{} // index words aren't normalized except for trimming spaces 46 | return nil 47 | } 48 | 49 | // CreateDicthtml adds a dicthtml file for the specified prefix and returns a 50 | // writer which is valid until the next file is created. 51 | func (w *Writer) CreateDicthtml(prefix string) (io.Writer, error) { 52 | if strings.Contains(prefix, "/") { 53 | return nil, fmt.Errorf("invalid prefix: must not contain slashes") 54 | } 55 | if w.closed { 56 | return nil, fmt.Errorf("writer already closed") 57 | } 58 | if w.last != nil { 59 | if err := w.last.Close(); err != nil { 60 | return nil, fmt.Errorf("close last file writer: %w", err) 61 | } 62 | w.last = nil 63 | } 64 | 65 | filename := prefix + ".html" 66 | if _, ok := w.used[filename]; ok { 67 | return nil, fmt.Errorf("file %#v already exists in dictzip", filename) 68 | } 69 | 70 | fw, err := w.z.Create(filename) 71 | if err != nil { 72 | return nil, fmt.Errorf("create zip entry: %w", err) 73 | } 74 | 75 | if w.e != nil { 76 | ew := newEncryptWriter(w.e, fw) 77 | zw := gzip.NewWriter(ew) 78 | 79 | w.last = &funcWriteCloser{ 80 | Writer: zw, 81 | Closer: func() error { 82 | if err := zw.Close(); err != nil { 83 | return err 84 | } 85 | return ew.Close() 86 | }, 87 | } 88 | } else { 89 | w.last = gzip.NewWriter(fw) 90 | } 91 | 92 | w.used[filename] = struct{}{} 93 | return w.last, nil 94 | } 95 | 96 | // CreateFile adds a raw file with the specified name. 
Note that Kobo only 97 | // supports GIF and JPEG files starting with the "GIF" and "JFIF" magic, and the 98 | // treatment of other files is undefined. In addition, subdirectories are not 99 | // supported. The behaviour is undefined if a dicthtml file is added this way. 100 | func (w *Writer) CreateFile(filename string) (io.Writer, error) { 101 | if strings.Contains(filename, "/") || strings.Contains(filename, "\\") { 102 | return nil, fmt.Errorf("invalid filename: must not contain slashes") 103 | } else if strings.Contains(filename, "words") { 104 | return nil, fmt.Errorf("invalid filename: must not be 'words'") 105 | } else if _, ok := w.used[filename]; ok { 106 | return nil, fmt.Errorf("file %#v already exists in dictzip", filename) 107 | } 108 | if w.last != nil { 109 | if err := w.last.Close(); err != nil { 110 | return nil, fmt.Errorf("close last file writer: %w", err) 111 | } 112 | w.last = nil 113 | } 114 | 115 | fw, err := w.z.Create(filename) 116 | if err != nil { 117 | return nil, fmt.Errorf("create zip entry: %w", err) 118 | } 119 | 120 | w.last = &funcWriteCloser{ 121 | Writer: fw, 122 | Closer: nil, 123 | } 124 | w.used[filename] = struct{}{} 125 | return w.last, nil 126 | } 127 | 128 | // Exists checks if a file already exists in the dictzip with the specified name. 129 | func (w *Writer) Exists(fn string) bool { 130 | _, ok := w.used[fn] 131 | return ok 132 | } 133 | 134 | // Close writes the marisa index and the zip footer. The error should not be 135 | // ignored. It does not close the underlying writer. 
136 | func (w *Writer) Close() error { 137 | if w.closed { 138 | return fmt.Errorf("writer already closed") 139 | } 140 | if w.last != nil { 141 | if err := w.last.Close(); err != nil { 142 | return fmt.Errorf("close last file writer: %w", err) 143 | } 144 | w.last = nil 145 | } 146 | 147 | var words []string 148 | for word := range w.words { 149 | words = append(words, word) 150 | } 151 | sort.Strings(words) 152 | 153 | if fw, err := w.z.Create("words"); err != nil { 154 | return fmt.Errorf("create index zip entry: %w", err) 155 | } else if Marisa == nil { 156 | return fmt.Errorf("no marisa bindings found") 157 | } else if err := Marisa.WriteAll(fw, words); err != nil { 158 | return fmt.Errorf("write index: %w", err) 159 | } 160 | 161 | if err := w.z.Close(); err != nil { 162 | return fmt.Errorf("close zip: %w", err) 163 | } 164 | return nil 165 | } 166 | 167 | // SetEncrypter sets the Encrypter used to encrypt dicthtml files. This must be 168 | // will only apply to dicthtml files added after the encrypter is set. 169 | func (w *Writer) SetEncrypter(e Encrypter) { 170 | w.e = e 171 | } 172 | 173 | type encryptWriter struct { 174 | e Encrypter 175 | w io.Writer 176 | b *bytes.Buffer 177 | c bool 178 | } 179 | 180 | func newEncryptWriter(e Encrypter, w io.Writer) io.WriteCloser { 181 | return &encryptWriter{ 182 | e: e, 183 | w: w, 184 | b: bytes.NewBuffer(nil), 185 | c: false, 186 | } 187 | } 188 | 189 | func (e encryptWriter) Write(buf []byte) (n int, err error) { 190 | if e.c { 191 | return 0, fmt.Errorf("write to closed writer") 192 | } 193 | return e.b.Write(buf) 194 | } 195 | 196 | // Close encrypts and writes the buffer to the underlying writer. The error 197 | // should be checked. 
198 | func (e encryptWriter) Close() error { 199 | if e.c { 200 | return fmt.Errorf("writer already closed") 201 | } 202 | if buf, err := e.e.Encrypt(e.b.Bytes()); err != nil { 203 | return fmt.Errorf("encrypt bytes: %w", err) 204 | } else if _, err := e.w.Write(buf); err != nil { 205 | return fmt.Errorf("write encrypted bytes: %w", err) 206 | } 207 | return nil 208 | } 209 | 210 | type funcWriteCloser struct { 211 | io.Writer 212 | Closer func() error 213 | } 214 | 215 | func (f *funcWriteCloser) Close() error { 216 | if f.Closer != nil { 217 | return f.Closer() 218 | } 219 | return nil 220 | } 221 | -------------------------------------------------------------------------------- /kobodict/writer_test.go: -------------------------------------------------------------------------------- 1 | package kobodict 2 | 3 | // TODO(v1) 4 | -------------------------------------------------------------------------------- /marisa/libmarisa_generate.go: -------------------------------------------------------------------------------- 1 | //+build libmarisa_generate 2 | 3 | package main 4 | 5 | import ( 6 | "archive/tar" 7 | "bytes" 8 | "compress/gzip" 9 | "fmt" 10 | "io" 11 | "io/ioutil" 12 | "net/http" 13 | "os" 14 | "path" 15 | "regexp" 16 | "strings" 17 | ) 18 | 19 | func main() { 20 | url := "https://github.com/s-yata/marisa-trie/archive/970b20c141f11d9d7572a6bb8d0488f2e0520e22.tar.gz" 21 | version := "970b20c" 22 | 23 | if files, err := tarball(url); err != nil { 24 | fmt.Fprintf(os.Stderr, "Error: download tarball %#v: %v\n", url, err) 25 | os.Exit(1) 26 | return 27 | } else if err := func() error { 28 | if mr, err := libmarisa(files, version); err != nil { 29 | return err 30 | } else if mf, err := os.OpenFile("libmarisa.cc", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644); err != nil { 31 | return err 32 | } else if _, err := io.Copy(mf, mr); err != nil { 33 | mf.Close() 34 | return err 35 | } else { 36 | return mf.Close() 37 | } 38 | }(); err != nil { 39 | 
fmt.Fprintf(os.Stderr, "Error: generate libmarisa.cc: %v\n", err) 40 | os.Exit(1) 41 | return 42 | } else if err := func() error { 43 | if mr, err := hmarisa(files, version); err != nil { 44 | return err 45 | } else if mf, err := os.OpenFile("libmarisa.h", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644); err != nil { 46 | return err 47 | } else if _, err := io.Copy(mf, mr); err != nil { 48 | mf.Close() 49 | return err 50 | } else { 51 | return mf.Close() 52 | } 53 | }(); err != nil { 54 | fmt.Fprintf(os.Stderr, "Error: generate libmarisa.h: %v\n", err) 55 | os.Exit(1) 56 | return 57 | } 58 | } 59 | func hmarisa(files map[string][]byte, version string) (io.Reader, error) { 60 | marisaH, err := resolve(files, []string{ 61 | "include/marisa.h", 62 | }, "include", "lib") 63 | if err != nil { 64 | return nil, err 65 | } 66 | 67 | fmt.Printf("Generating libmarisa.h\n") 68 | return io.MultiReader( 69 | // A custom header. 70 | strings.NewReader("// AUTOMATICALLY GENERATED, DO NOT EDIT!\n"), 71 | strings.NewReader("// merged from marisa-trie "+version+".\n"), 72 | // Include the license info. 73 | bytes.NewReader([]byte{'\n', '/', '/', ' '}), 74 | bytes.NewReader(bytes.ReplaceAll(files["COPYING.md"], []byte{'\n'}, []byte{'\n', '/', '/', ' '})), 75 | bytes.NewReader([]byte{'\n', '\n'}), 76 | // Include the header. 
77 | bytes.NewReader(marisaH), 78 | ), nil 79 | } 80 | 81 | func libmarisa(files map[string][]byte, version string) (io.Reader, error) { 82 | marisaGrimoireIOLib, err := resolve(files, []string{ 83 | "lib/marisa/grimoire/io/mapper.cc", 84 | "lib/marisa/grimoire/io/reader.cc", 85 | "lib/marisa/grimoire/io/writer.cc", 86 | }, "include", "lib") 87 | if err != nil { 88 | return nil, err 89 | } 90 | 91 | marisaGrimoireTrieLib, err := resolve(files, []string{ 92 | "lib/marisa/grimoire/trie/tail.cc", 93 | "lib/marisa/grimoire/trie/louds-trie.cc", 94 | }, "include", "lib") 95 | if err != nil { 96 | return nil, err 97 | } 98 | 99 | marisaGrimoireVectorLib, err := resolve(files, []string{ 100 | "lib/marisa/grimoire/vector/bit-vector.cc", 101 | }, "include", "lib") 102 | if err != nil { 103 | return nil, err 104 | } 105 | 106 | marisaLib, err := resolve(files, []string{ 107 | "lib/marisa/agent.cc", 108 | "lib/marisa/keyset.cc", 109 | "lib/marisa/trie.cc", 110 | }, "include", "lib") 111 | if err != nil { 112 | return nil, err 113 | } 114 | 115 | fmt.Printf("Generating libmarisa.cc\n") 116 | return io.MultiReader( 117 | // A custom header. 118 | strings.NewReader("// AUTOMATICALLY GENERATED, DO NOT EDIT!\n"), 119 | strings.NewReader("// merged from marisa-trie "+version+".\n"), 120 | // Include the license info. 121 | bytes.NewReader([]byte{'\n', '/', '/', ' '}), 122 | bytes.NewReader(bytes.ReplaceAll(files["COPYING.md"], []byte{'\n'}, []byte{'\n', '/', '/', ' '})), 123 | bytes.NewReader([]byte{'\n', '\n'}), 124 | // Include the warnings from the Makefile.am CXXFLAGS. 125 | // - Note that Clang also recognizes the GCC pragmas. 126 | strings.NewReader("#pragma GCC diagnostic warning \"-Wall\"\n"), 127 | strings.NewReader("#pragma GCC diagnostic warning \"-Weffc++\"\n"), 128 | strings.NewReader("#pragma GCC diagnostic warning \"-Wextra\"\n"), 129 | strings.NewReader("#pragma GCC diagnostic warning \"-Wconversion\"\n"), 130 | // Silence a warning. 
131 | strings.NewReader("#pragma GCC diagnostic ignored \"-Wimplicit-fallthrough=\"\n"), 132 | // Include the libs themselves. 133 | bytes.NewReader(marisaGrimoireIOLib), 134 | bytes.NewReader(marisaGrimoireTrieLib), 135 | bytes.NewReader(marisaGrimoireVectorLib), 136 | bytes.NewReader(marisaLib), 137 | // Show info about the generated file. 138 | strings.NewReader("#line 1 \"libmarisa_generate.go\"\n"), 139 | strings.NewReader("#pragma GCC warning \"Using generated built-in marisa-trie "+version+".\"\n"), 140 | ), nil 141 | } 142 | 143 | func tarball(url string) (map[string][]byte, error) { 144 | fmt.Printf("Downloading tarball from %s\n", url) 145 | 146 | resp, err := http.Get(url) 147 | if err != nil { 148 | return nil, err 149 | } 150 | defer resp.Body.Close() 151 | 152 | zr, err := gzip.NewReader(resp.Body) 153 | if err != nil { 154 | return nil, err 155 | } 156 | 157 | var pfx string 158 | files := map[string][]byte{} 159 | 160 | tr := tar.NewReader(zr) 161 | for { 162 | fh, err := tr.Next() 163 | if err == io.EOF { 164 | break 165 | } else if err != nil { 166 | return nil, err 167 | } 168 | 169 | if fh.Name == "pax_global_header" || fh.FileInfo().IsDir() { 170 | continue 171 | } 172 | 173 | if pfx == "" { 174 | if strings.HasPrefix(fh.Name, "./") { 175 | pfx = "./" + strings.Split(fh.Name, "/")[1] + "/" 176 | } else { 177 | pfx = strings.Split(fh.Name, "/")[0] + "/" 178 | } 179 | } 180 | 181 | if !strings.HasPrefix(fh.Name, pfx) { 182 | return nil, fmt.Errorf("extract file %#v: doesn't have common prefix %#v", fh.Name, pfx) 183 | } 184 | 185 | buf, err := ioutil.ReadAll(tr) 186 | if err != nil { 187 | return nil, fmt.Errorf("extract file %#v: %w", fh.Name, err) 188 | } 189 | 190 | fn := strings.TrimPrefix(fh.Name, pfx) 191 | files[fn] = buf 192 | 193 | fmt.Printf(" [D] %s\n", fn) // downloaded 194 | } 195 | 196 | return files, nil 197 | } 198 | 199 | func resolve(files map[string][]byte, filenames []string, includePath ...string) (resolvedFile []byte, err 
error) { 200 | fmt.Printf("Resolving C* source files %s (against:%s) (I = included, S = preserved because not found, R = skipped because already included)\n", filenames, includePath) 201 | 202 | var resolveFn func(indent string, files map[string][]byte, filename string, buf []byte, done []string, includePath []string) (resolvedFile []byte, err error) 203 | resolveFn = func(indent string, files map[string][]byte, filename string, buf []byte, done []string, includePath []string) (resolvedFile []byte, err error) { 204 | defer func() { 205 | if rerr := recover(); rerr != nil { 206 | resolvedFile, err = nil, rerr.(error) 207 | } 208 | }() 209 | 210 | resolvedFile = regexp.MustCompile(`(?m)^\s*#\s*include\s+["'<][^"'>]+["'>]$`).ReplaceAllFunc(buf, func(importBuf []byte) []byte { 211 | fn := string(regexp.MustCompile(`["'<]([^"'>]+)["'>]`).FindSubmatch(importBuf)[1]) 212 | 213 | for _, ip := range includePath { 214 | ifn := path.Join(ip, fn) 215 | for _, dfn := range done { 216 | if m, _ := path.Match(dfn, ifn); m { 217 | fmt.Printf("%s[R] %s\n", indent, fn) // already included 218 | return nil 219 | } 220 | } 221 | 222 | ibuf, ok := files[ifn] 223 | if ok { 224 | fmt.Printf("%s[I] %s => %s\n", indent, fn, ifn) // include 225 | ibuf, err := resolveFn(indent+" ", files, ifn, ibuf, append(done, ifn), append(includePath, path.Dir(ifn))) 226 | if err != nil { 227 | panic(fmt.Errorf("resolve %#v: %w", ifn, err)) 228 | } 229 | return append(append([]byte{'\n', '\n'}, ibuf...), '\n', '\n') 230 | } 231 | } 232 | 233 | fmt.Printf("%s[S] %s\n", indent, fn) // preserve 234 | return importBuf 235 | }) 236 | 237 | return 238 | } 239 | 240 | for _, fn := range filenames { 241 | if buf, ok := files[fn]; !ok { 242 | return nil, fmt.Errorf("file %#v: not found", fn) 243 | } else if buf, err := resolveFn(" ", files, fn, buf, []string{fn}, append(includePath, path.Dir(fn))); err != nil { 244 | return nil, fmt.Errorf("file %v: %w", fn, err) 245 | } else { 246 | resolvedFile = 
append(resolvedFile, buf...) 247 | resolvedFile = append(resolvedFile, '\n', '\n') 248 | } 249 | } 250 | 251 | return resolvedFile, nil 252 | } 253 | -------------------------------------------------------------------------------- /marisa/marisa.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "libmarisa.h" 7 | #include "shim.h" 8 | 9 | #define catch_go_ex(t, ctx) \ 10 | catch (const t &ex) { \ 11 | const char* b = ctx; \ 12 | char* err = reinterpret_cast( \ 13 | calloc(strlen(b)+strlen(ex.what())+1, sizeof(char))); \ 14 | strcpy(err, b); \ 15 | strcat(err, ex.what()); \ 16 | return err; \ 17 | } 18 | 19 | #define catch_go \ 20 | catch_go_ex(marisa::Exception, "marisa: ") \ 21 | catch_go_ex(go::error, "go shim: ") \ 22 | catch_go_ex(std::runtime_error, "c++ runtime: ") \ 23 | catch_go_ex(std::exception, "c++ error: ") \ 24 | catch (...) { return strdup("marisa: unknown c++ exception"); } \ 25 | return NULL; 26 | 27 | #define go_func extern "C" const char* 28 | 29 | go_func marisa_read_all(int iid, char ***out_wd, size_t *out_wd_sz) { 30 | try { 31 | if (!out_wd || !out_wd_sz) 32 | throw std::runtime_error("parameter is null"); 33 | go::rstream r(iid); 34 | marisa::Trie t; 35 | marisa::read(r, &t); 36 | marisa::Agent a; 37 | a.set_query(""); 38 | *out_wd_sz = 0; 39 | *out_wd = reinterpret_cast(calloc(t.num_keys(), sizeof(char**))); 40 | while (t.predictive_search(a)) { 41 | if (*out_wd_sz == t.num_keys()) 42 | throw std::runtime_error("expected " + std::to_string(t.num_keys()) + " keys, got more"); 43 | memcpy((*out_wd)[(*out_wd_sz)++] = reinterpret_cast(calloc(a.key().length()+1, sizeof(char))), a.key().ptr(), a.key().length()); 44 | } 45 | if (*out_wd_sz != t.num_keys()) 46 | throw std::runtime_error("expected " + std::to_string(t.num_keys()) + " keys, got " + std::to_string(*out_wd_sz)); 47 | } catch_go 48 | } 49 | 50 | go_func marisa_write_all(int iid, const 
char** wd, size_t wd_sz) { 51 | try { 52 | if (wd_sz && !wd) 53 | throw std::runtime_error("parameter is null"); 54 | marisa::Keyset k; 55 | for (size_t i = 0; i < wd_sz; i++) 56 | k.push_back(wd[i]); 57 | marisa::Trie t; 58 | t.build(k); 59 | go::wstream w(iid); 60 | marisa::write(w, t); 61 | } catch_go 62 | } 63 | -------------------------------------------------------------------------------- /marisa/marisa.go: -------------------------------------------------------------------------------- 1 | // Package marisa provides a simplified self-contained CGO wrapper for 2 | // marisa-trie (https://github.com/s-yata/marisa-trie). 3 | package marisa 4 | 5 | //go:generate go run -tags libmarisa_generate libmarisa_generate.go 6 | 7 | //#cgo CPPFLAGS: -Wall 8 | //#cgo LDFLAGS: 9 | //#include 10 | //#include 11 | //const char* marisa_read_all(int iid, char ***out_wd, size_t *out_wd_sz); 12 | //const char* marisa_write_all(int iid, const char** wd, size_t wd_sz); 13 | import "C" 14 | 15 | import ( 16 | "errors" 17 | "io" 18 | "unsafe" 19 | ) 20 | 21 | func ReadAll(r io.Reader) ([]string, error) { 22 | iid := iopPut(r) 23 | var out_wd **C.char 24 | var out_wd_sz C.size_t 25 | err := C.marisa_read_all( 26 | (C.int)(iid), 27 | (***C.char)(unsafe.Pointer(&out_wd)), 28 | (*C.size_t)(unsafe.Pointer(&out_wd_sz)), 29 | ) 30 | iopDel(iid) 31 | return gostrs(out_wd, out_wd_sz), goerr(err) 32 | } 33 | 34 | func WriteAll(w io.Writer, wd []string) error { 35 | iid := iopPut(w) 36 | wd_ptr, wd_sz, wd_free := cstrs(wd) 37 | err := C.marisa_write_all( 38 | (C.int)(iid), 39 | (**C.char)(wd_ptr), 40 | (C.size_t)(wd_sz), 41 | ) 42 | wd_free() 43 | iopDel(iid) 44 | return goerr(err) 45 | } 46 | 47 | func goerr(p *C.char) (err error) { 48 | if p != nil { 49 | err = errors.New(C.GoString(p)) 50 | C.free(unsafe.Pointer(p)) 51 | } 52 | return 53 | } 54 | 55 | func gostrs(p **C.char, n C.size_t) (s []string) { 56 | if p != nil { 57 | s = make([]string, int(n)) 58 | for i, v := range (*[1 << 
28]*C.char)(unsafe.Pointer(p))[:int(n):int(n)] { 59 | s[i] = C.GoString(v) 60 | C.free(unsafe.Pointer(v)) 61 | } 62 | C.free(unsafe.Pointer(p)) 63 | } 64 | return 65 | } 66 | 67 | func cstrs(s []string) (p **C.char, n C.size_t, free func()) { 68 | n = (C.size_t)(len(s)) 69 | if len(s) == 0 { 70 | free = func() {} 71 | return 72 | } 73 | c := make([]*C.char, len(s)) 74 | for i, v := range s { 75 | c[i] = C.CString(v) 76 | } 77 | p = (**C.char)(unsafe.Pointer(&c[0])) 78 | free = func() { 79 | for _, v := range c { 80 | C.free(unsafe.Pointer(v)) 81 | } 82 | } 83 | return 84 | } 85 | -------------------------------------------------------------------------------- /marisa/marisa_test.go: -------------------------------------------------------------------------------- 1 | package marisa 2 | 3 | import ( 4 | "bytes" 5 | "crypto/sha1" 6 | "encoding/hex" 7 | "errors" 8 | "io" 9 | "reflect" 10 | "runtime" 11 | "strings" 12 | "testing" 13 | ) 14 | 15 | func TestTrieIO(t *testing.T) { 16 | emptyBuf := bytes.NewBuffer(nil) 17 | emptyS := "1aa6c451104c2c1b24ecb66ecb84bde2403c49b1" // marisa-build 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "fmt" 10 | "io" 11 | "sync" 12 | "unsafe" 13 | ) 14 | 15 | // shim.go and shim.h (plus _cgo_export.h implicitly), implement a shim to 16 | // access Go I/O interfaces efficiently, concurrently, cleanly, and safely from 17 | // C/C++ code. Note that if any C strings are returned by the Go side, they must 18 | // be freed on the C side. 19 | 20 | // https://golang.org/issue/13656#issuecomment-253600758 21 | // https://golang.org/cmd/cgo/#hdr-C_references_to_Go 22 | // https://stackoverflow.com/a/49879469 23 | 24 | var ( 25 | iopMu sync.RWMutex // for controlling access to the slice header (i.e. https://stackoverflow.com/a/49879469) 26 | iop = []interface{}{nil} // the 0th element is reserved to prevent mistakes 27 | ) 28 | 29 | // iopPut adds the io.Reader and/or io.Writer, and returns its new iid. 
The iid 30 | // will be valid until iopDel is called, but will never be reused. 31 | func iopPut(rw interface{}) int { 32 | switch rw.(type) { 33 | case io.Reader, io.Writer: 34 | iopMu.Lock() 35 | iop = append(iop, rw) 36 | iid := len(iop) - 1 37 | iopMu.Unlock() 38 | return iid 39 | default: 40 | panic("not a reader, writer, or both") 41 | } 42 | } 43 | 44 | // iopGet gets the interface referenced by iid. It will panic if iid has never 45 | // been issued by iopPut, and will return nil if it has been deleted by iopDel. 46 | func iopGet(iid int) interface{} { 47 | iopMu.RLock() 48 | if iid <= 0 || iid >= len(iop) { 49 | panic("invalid iid") 50 | } 51 | r := iop[iid] 52 | iopMu.RUnlock() 53 | return r 54 | } 55 | 56 | // iopDel sets the interface referenced by iid to nil to prevent future usage. 57 | // It will panic if iid has never been issued by iopPut. 58 | func iopDel(iid int) { 59 | iopMu.RLock() 60 | if iid <= 0 || iid >= len(iop) { 61 | panic("invalid iid") 62 | } 63 | iop[iid] = nil 64 | iopMu.RUnlock() 65 | } 66 | 67 | //export go_iop_check 68 | func go_iop_check(iid C.int, t C.int, out_err **C.char) bool /*C.bool*/ { 69 | var n []string 70 | i := iopGet(int(iid)) 71 | if t&(1<<0) != 0 { // go_iop_type::reader 72 | if _, ok := iopGet(int(iid)).(io.Reader); !ok { 73 | n = append(n, "io.Reader") 74 | } 75 | } 76 | if t&(1<<1) != 0 { // go_iop_type::writer 77 | if _, ok := iopGet(int(iid)).(io.Writer); !ok { 78 | n = append(n, "io.Writer") 79 | } 80 | } 81 | if out_err != nil { 82 | if len(n) != 0 { 83 | *out_err = C.CString(fmt.Sprintf("iid %d: underlying type %T does not implement types %s", int(iid), i, n)) 84 | } else { 85 | *out_err = nil 86 | } 87 | } 88 | return len(n) == 0 89 | } 90 | 91 | //export go_iop_read 92 | func go_iop_read(iid C.int, buf *C.char, buf_n C.size_t, out_err **C.char) C.ptrdiff_t { 93 | *out_err = nil 94 | switch i := iopGet(int(iid)).(type) { 95 | case io.Reader: 96 | n, err := i.Read((*[1 << 
28]byte)(unsafe.Pointer(buf))[:int(buf_n):int(buf_n)]) 97 | if err == io.EOF { 98 | if n == 0 { 99 | return C.ptrdiff_t(-1) 100 | } 101 | } else if err != nil { 102 | *out_err = C.CString(fmt.Sprintf("go_iop_read: read up to %d bytes from iid %d: %v", buf_n, int(iid), err)) 103 | } 104 | return C.ptrdiff_t(n) 105 | case nil: 106 | *out_err = C.CString(fmt.Sprintf("go_iop_read: iid %d has been deleted", int(iid))) 107 | return C.ptrdiff_t(0) 108 | default: 109 | *out_err = C.CString(fmt.Sprintf("go_iop_read: iid %d is a %T, not an io.Reader", int(iid), i)) 110 | return C.ptrdiff_t(0) 111 | } 112 | } 113 | 114 | //export go_iop_write 115 | func go_iop_write(iid C.int, buf *C.char, buf_n C.size_t, out_err **C.char) C.ptrdiff_t { 116 | *out_err = nil 117 | switch i := iopGet(int(iid)).(type) { 118 | case io.Writer: 119 | n, err := i.Write((*[1 << 28]byte)(unsafe.Pointer(buf))[:int(buf_n):int(buf_n)]) 120 | if err == io.EOF { 121 | if n == 0 { 122 | return C.ptrdiff_t(-1) 123 | } 124 | } else if err != nil { 125 | *out_err = C.CString(fmt.Sprintf("go_iop_write: write up to %d bytes to iid %d: %v", buf_n, int(iid), err)) 126 | } 127 | return C.ptrdiff_t(n) 128 | case nil: 129 | *out_err = C.CString(fmt.Sprintf("go_iop_write: iid %d has been deleted", int(iid))) 130 | return C.ptrdiff_t(0) 131 | default: 132 | *out_err = C.CString(fmt.Sprintf("go_iop_write: iid %d is a %T, not an io.Writer", int(iid), i)) 133 | return C.ptrdiff_t(0) 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /marisa/shim.h: -------------------------------------------------------------------------------- 1 | #ifndef GO_SHIM_H 2 | #define GO_SHIM_H 3 | 4 | #ifdef __cplusplus 5 | #include 6 | extern "C" { 7 | #else 8 | #include 9 | #include 10 | #endif 11 | 12 | // go_iop_type represents interfaces an iid may implement. 
#ifndef GO_SHIM_H
#define GO_SHIM_H

#ifdef __cplusplus
#include <cstddef>
extern "C" {
#else
#include <stdbool.h>
#include <stddef.h>
#endif

// go_iop_type represents interfaces an iid may implement.
enum go_iop_type {
    reader = 1 << 0, // io.Reader
    writer = 1 << 1, // io.Writer
};

// go_iop_check checks if the specified iid implements the specified ORed type
// flags. Note that it doesn't have to be checked here, as go_iop_* will return
// an error if it doesn't implement the necessary interfaces. If out_err is not
// NULL and the return value is false, it will be set to an error message, which
// must be freed by the caller, if the iid doesn't implement the specified
// flags.
bool go_iop_check(int iid, int t, char **out_err);

// Note: we use ptrdiff_t over ssize_t for portability (and not size_t because
// it will return -1 for EOF). Also, note that C++'s std::streamsize uses this
// internally too, which is a nice advantage.

// go_iop_read reads from the iid's underlying io.Reader into p. It has the
// same semantics as the Go one, but io.EOF is returned as -1. out_err must be
// a valid pointer to a char pointer. If an error occurred, it is set and must
// be freed by the caller.
ptrdiff_t go_iop_read(int iid, char *p, size_t n, char **out_err);
// go_iop_write writes to the iid's underlying io.Writer. It has the same
// semantics as the Go one, but io.EOF is returned as -1. out_err must be a
// valid pointer to a char pointer. If an error occurred, it is set and must be
// freed by the caller.
ptrdiff_t go_iop_write(int iid, const char *p, size_t n, char **out_err);

#ifdef __cplusplus
}

#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <streambuf>
#include <type_traits>

// https://golang.org/cmd/cgo/#hdr-C_references_to_Go
// https://en.cppreference.com/w/cpp/io/basic_streambuf <- this describes it better than many of the other sites I found

namespace go {

// dbg prints a printf-style message to stderr, prefixed with "GOSHIMDEBUG: ",
// if the GOSHIMDEBUG environment variable is exactly "1". It returns whether
// the message was printed. It is inline because this header defines it.
inline bool dbg(const char* format, ...) {
    static const bool enabled = [] {
        const char* v = getenv("GOSHIMDEBUG");
        return v && v[0] == '1' && v[1] == '\0';
    }();
    if (!enabled)
        return false;
    fprintf(stderr, "GOSHIMDEBUG: ");
    va_list arg;
    va_start(arg, format);
    vfprintf(stderr, format, arg);
    va_end(arg);
    fflush(stderr);
    return true;
}

// error is the exception thrown for failures reported by the Go side.
class error : public std::runtime_error {
public:
    error(const char* what) : std::runtime_error(what) {
        go::dbg("new go::error(%s)\n", what);
    };

    // check checks an output err pointer and frees+throws it if set.
    static void check(char* err) {
        if (!err)
            return;
        go::error ex = go::error(err);
        free(err);
        throw ex;
    }
};

// iopbuf is a stream buffer backed by the Go io.Reader and/or io.Writer
// registered as iid. Reads go through a one-byte get area; writes are passed
// straight through without buffering.
class iopbuf : public std::basic_streambuf<char> {
    int iid_;
    char rbuf_; // single-byte read buffer (i.e. direct access to the io.Reader)
public:
    static_assert((std::is_same<iopbuf::char_type, char>::value && std::is_same<iopbuf::traits_type, std::char_traits<char>>::value), "Go shim only supports char"); // just to be safe
    #ifndef __clang__
    static_assert(iopbuf::traits_type::eof() != iopbuf::traits_type::to_int_type((char) 0xFF), "EOF not distinct from 0xFF"); // this is already specified in the spec, but just to make sure
    #endif

    iopbuf(int iid) : iid_(iid), rbuf_(0) {
        this->setg(&this->rbuf_, &this->rbuf_ + 1, &this->rbuf_ + 1); // set the buffer, but at the end to force the next read to underflow
    }

    // Constructs the buffer and throws go::error unless iid implements the
    // interfaces selected by the go_iop_type flags in t.
    iopbuf(int iid, int t) : iopbuf(iid) {
        char* err = NULL;
        go_iop_check(iid, t, &err);
        go::error::check(err);
    }

    iopbuf::int_type underflow() override {
        // This is all that's strictly needed for reading. Note that we can't
        // just return the char, and we must set the buffer to point to it to
        // conform to the expected postconditions and prevent unusual bugs from
        // popping up.

        char* err = NULL;
        ptrdiff_t n = go_iop_read(this->iid_, &this->rbuf_, 1, &err);
        go::dbg("underflow: go_iop_read(%d, 1) = %td %02x err=%s\n", this->iid_, n, static_cast<unsigned>(static_cast<unsigned char>(this->rbuf_)), err ? err : "(null)");
        go::error::check(err);

        this->setg(&this->rbuf_, &this->rbuf_, &this->rbuf_ + (n>0 ? n : 0)); // Update the current byte.
        return this->gptr() == this->egptr()                 // If the new current pos == past end of buffer, no byte was read (n<=0).
            ? iopbuf::traits_type::eof()                     // If no byte was read (and no error was thrown earlier), it's an EOF.
            : iopbuf::traits_type::to_int_type(this->rbuf_); // Otherwise, return the byte we just read (note: without to_int_type, 0xFF would be sign extended to -1/eof).
    }

    std::streamsize xsgetn(iopbuf::char_type* buf, std::streamsize buf_n) override {
        // We can provide a more efficient bulk read implementation than the
        // default one which gets each byte one-by-one in a loop.
        // Note: Remember to test ::underflow by forcing it to use the default
        // implementation: return std::streambuf::xsgetn(buf, buf_n);

        if (buf_n <= 0)
            return 0;

        std::streamsize t = 0;

        // A byte fetched by underflow() but not consumed yet (e.g. after a
        // peek) comes first; skipping it would silently drop it.
        if (this->gptr() < this->egptr()) {
            buf[t++] = *this->gptr();
            this->gbump(1);
        }

        ptrdiff_t n = 0;
        while (t != buf_n && n != -1) {
            char* err = NULL;
            n = go_iop_read(this->iid_, buf+t, static_cast<size_t>(buf_n-t), &err);
            go::dbg("xsgetn: go_iop_read(%d, %td) = %td (%td/%td) err=%s\n", this->iid_, static_cast<ptrdiff_t>(buf_n-t), n, static_cast<ptrdiff_t>(t+(n>0 ? n : 0)), static_cast<ptrdiff_t>(buf_n), err ? err : "(null)");
            t += n>0 ? n : 0;
            if (t > buf_n) {
                free(err);
                throw go::error("read returned too many bytes!");
            }
            go::error::check(err);
        }

        if (t > 0) {
            // Remember the last byte for putback, but mark it as consumed
            // (gptr == egptr) so the next read fetches fresh data instead of
            // returning this byte a second time.
            this->rbuf_ = buf[t-1];
            this->setg(&this->rbuf_, &this->rbuf_ + 1, &this->rbuf_ + 1);
        }

        // xsgetn returns the number of bytes stored, which is 0 (not eof())
        // when the reader was already exhausted.
        return t;
    }

    iopbuf::int_type overflow(iopbuf::int_type c = iopbuf::traits_type::eof()) override {
        // Unlike for reading, we don't have to use a buffer (you can read a
        // byte advancing, but you can't do that kind of thing when writing),
        // so we'll just write it directly. This makes the implementation much
        // simpler, as we're basically just passing the calls to the Go funcs
        // directly.

        // Usually, we would flush the buffer if given an EOF instead of a char,
        // but we're not using one, so it's a no-op.
        if (iopbuf::traits_type::eq_int_type(c, iopbuf::traits_type::eof()))
            return iopbuf::traits_type::not_eof(c);

        // Since the logic is basically a simplified version of xsputn, just
        // with a single char, it's easier just to call it and implement the
        // bulk of the logic there. Convert to a char first: taking the address
        // of the int_type would pick the wrong byte on big-endian targets.
        const iopbuf::char_type ch = iopbuf::traits_type::to_char_type(c);
        if (this->xsputn(&ch, 1) != 1)
            throw go::error("short write"); // we still need to check for a short write
        return c;
    }

    std::streamsize xsputn(const iopbuf::char_type* buf, std::streamsize buf_n) override {
        if (buf_n <= 0)
            return 0;
        char* err = NULL;
        ptrdiff_t n = go_iop_write(this->iid_, buf, static_cast<size_t>(buf_n), &err);
        go::error::check(err);
        if (n == -1)
            throw go::error("EOF while writing to Go writer");
        return n;
    }
};

// rwstream is a std::iostream over an iid which is both a reader and a writer.
class rwstream : private iopbuf, public std::iostream {
public:
    rwstream(int iid) : iopbuf(iid, go_iop_type::reader|go_iop_type::writer), std::iostream(this) {}
};

// wstream is a std::ostream over an iid which is a writer.
class wstream : private iopbuf, public std::ostream {
public:
    wstream(int iid) : iopbuf(iid, go_iop_type::writer), std::ostream(this) {}
};

// rstream is a std::istream over an iid which is a reader.
class rstream : private iopbuf, public std::istream {
public:
    rstream(int iid) : iopbuf(iid, go_iop_type::reader), std::istream(this) {}
};

}

#endif
#endif