├── .gitignore ├── .spelling.hjson ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── aff.go ├── aff_test.go ├── case.go ├── case_test.go ├── cmd ├── gospell │ └── main.go └── sample │ └── sample.go ├── file.go ├── gospell.go ├── notwords.go ├── notwords_test.go ├── plaintext ├── LICENSE ├── Makefile ├── README.md ├── cmd │ └── plaintext │ │ └── main.go ├── golang.go ├── html.go ├── html_test.go ├── identity.go ├── identity_test.go ├── markdown.go ├── markdown_test.go ├── mime.go ├── script.go ├── script_test.go ├── template.go ├── template_test.go └── text.go ├── words.go └── words_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | 3 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 4 | *.o 5 | *.a 6 | *.so 7 | 8 | # Folders 9 | _obj 10 | _test 11 | 12 | # Architecture specific extensions/prefixes 13 | *.[568vq] 14 | [568vq].out 15 | 16 | *.cgo1.go 17 | *.cgo2.c 18 | _cgo_defun.c 19 | _cgo_gotypes.go 20 | _cgo_export.* 21 | 22 | _testmain.go 23 | 24 | *.exe 25 | *.test 26 | *.prof 27 | -------------------------------------------------------------------------------- /.spelling.hjson: -------------------------------------------------------------------------------- 1 | extra: [ 2 | words/LocalDictionary 3 | ] 4 | additions: [ 5 | AFF 6 | DIC 7 | PFX 8 | SFX 9 | hunspell 10 | UTF 11 | WIP 12 | gospell 13 | io 14 | googlesource 15 | ] 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | language: generic 4 | script: 5 | - make -e docker-ci 6 | 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Nick Galbreath 4 | 5 | Permission is hereby 
granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: install lint test 3 | 4 | install: 5 | go get ./... 6 | go install ./... 7 | 8 | lint: 9 | golint ./... 10 | go vet ./... 11 | find . -name '*.go' | xargs gofmt -w -s 12 | 13 | test: 14 | go test ./... 15 | find . -name '*.go' | xargs misspell 16 | find . -name '*.md' | xargs misspell 17 | 18 | clean: 19 | rm -f *~ cmd/gospell/*~ 20 | go clean ./... 
21 | git gc 22 | 23 | ci: install lint test 24 | 25 | docker-ci: 26 | docker run --rm \ 27 | -e COVERALLS_REPO_TOKEN=$COVERALLS_REPO_TOKEN \ 28 | -v $(PWD):/go/src/github.com/client9/gospell \ 29 | -w /go/src/github.com/client9/gospell \ 30 | nickg/golang-dev-docker \ 31 | make ci 32 | 33 | .PHONY: ci docker-ci 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gospell 2 | [![Build Status](https://travis-ci.org/client9/gospell.svg?branch=master)](https://travis-ci.org/client9/gospell) [![Go Report Card](http://goreportcard.com/badge/client9/gospell)](http://goreportcard.com/report/client9/gospell) [![GoDoc](https://godoc.org/github.com/client9/gospell?status.svg)](https://godoc.org/github.com/client9/gospell) [![Coverage](http://gocover.io/_badge/github.com/client9/gospell)](http://gocover.io/github.com/client9/gospell) [![license](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](https://raw.githubusercontent.com/client9/gospell/master/LICENSE) 3 | 4 | pure golang spelling dictionary based on hunspell dictionaries. 5 | 6 | NOTE: I'm not an expert in linguistics nor spelling. Help is very 7 | welcome! 8 | 9 | ### What is hunspell? 10 | 11 | * http://hunspell.github.io 12 | * https://github.com/hunspell 13 | 14 | NOTE: This is not affiliated with Hunspell although if they wanted to 15 | merge it in as an official project, I'd be happy to donate the code 16 | (although it's in no shape to do so right now). 17 | 18 | ### Where can I get English dictionaries? 19 | 20 | 21 | 22 | The world of spelling dictionaries is surprisingly complicated, as 23 | "lists of words" are frequently proprietary and with conflicting 24 | software licenses. 25 | 26 | 27 | ### Kevin Atkinson 28 | 29 | [Kevin Atkinson](http://www.kevina.org) 30 | maintains many open source lists via 31 | the [SCOWL](http://wordlist.aspell.net) project.
The source code and 32 | raw lists are available on 33 | [GitHub `kevina/wordlist`](https://github.com/kevina/wordlist) 34 | 35 | 36 | #### Marco A.G.Pinto 37 | 38 | Marco maintains the released dictionaries for Firefox and Apache Open 39 | Office. The word lists appear to be actively updated. 40 | 41 | https://github.com/marcoagpinto/aoo-mozilla-en-dict 42 | 43 | #### Open Office 44 | 45 | http://extensions.openoffice.org/en/project/english-dictionaries-apache-openoffice 46 | 47 | The downloaded file has a `.oxt` extension but it's a compressed `tar` 48 | file. Extract the files using: 49 | 50 | ``` 51 | mkdir dict-en 52 | cd dict-en 53 | tar -xzf ../dict-en.oxt 54 | ``` 55 | 56 | #### Chromium 57 | 58 | The Chrome/Chromium browser uses Hunspell and its source tree 59 | contains various up-to-date dictionaries, some with additional words. You can view them at 60 | [chromium.googlesource.com](https://chromium.googlesource.com/chromium/deps/hunspell_dictionaries/+/master) 61 | and you can check them out locally via 62 | 63 | ```bash 64 | git clone --depth=1 https://chromium.googlesource.com/chromium/deps/hunspell_dictionaries 65 | ``` 66 | 67 | More information can be found in the [chromium developer guide](https://www.chromium.org/developers/how-tos/editing-the-spell-checking-dictionaries) 68 | -------------------------------------------------------------------------------- /aff.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "regexp" 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | // AffixType is either an affix prefix or suffix 13 | type AffixType int 14 | 15 | // specific Affix types 16 | const ( 17 | Prefix AffixType = iota 18 | Suffix 19 | ) 20 | 21 | // Affix is a rule for affix (adding prefixes or suffixes) 22 | type Affix struct { 23 | Type AffixType // either PFX or SFX 24 | CrossProduct bool 25 | Rules []Rule 26 | } 27 | 28 | // Expand provides all
variations of a given word based on this affix rule 29 | func (a Affix) Expand(word string, out []string) []string { 30 | for _, r := range a.Rules { 31 | if r.matcher != nil && !r.matcher.MatchString(word) { 32 | continue 33 | } 34 | if a.Type == Prefix { 35 | out = append(out, r.AffixText+word) 36 | // TODO is does Strip apply to prefixes too? 37 | } else { 38 | stripWord := word 39 | if r.Strip != "" && strings.HasSuffix(word, r.Strip) { 40 | stripWord = word[:len(word)-len(r.Strip)] 41 | } 42 | out = append(out, stripWord+r.AffixText) 43 | } 44 | } 45 | return out 46 | } 47 | 48 | // Rule is a Affix rule 49 | type Rule struct { 50 | Strip string 51 | AffixText string // suffix or prefix text to add 52 | Pattern string // original matching pattern from AFF file 53 | matcher *regexp.Regexp // matcher to see if this rule applies or not 54 | } 55 | 56 | // DictConfig is a partial representation of a Hunspell AFF (Affix) file. 57 | type DictConfig struct { 58 | Flag string 59 | TryChars string 60 | WordChars string 61 | NoSuggestFlag rune 62 | IconvReplacements []string 63 | Replacements [][2]string 64 | AffixMap map[rune]Affix 65 | CamelCase int 66 | CompoundMin int 67 | CompoundOnly string 68 | CompoundRule []string 69 | compoundMap map[rune][]string 70 | } 71 | 72 | // Expand expands a word/affix using dictionary/affix rules 73 | // This also supports CompoundRule flags 74 | func (a DictConfig) Expand(wordAffix string, out []string) ([]string, error) { 75 | out = out[:0] 76 | idx := strings.Index(wordAffix, "/") 77 | 78 | // not found 79 | if idx == -1 { 80 | out = append(out, wordAffix) 81 | return out, nil 82 | } 83 | if idx == 0 || idx+1 == len(wordAffix) { 84 | return nil, fmt.Errorf("Slash char found in first or last position") 85 | } 86 | // safe 87 | word, keyString := wordAffix[:idx], wordAffix[idx+1:] 88 | 89 | // check to see if any of the flags are in the 90 | // "compound only". 
If so then nothing to add 91 | compoundOnly := false 92 | for _, key := range keyString { 93 | if strings.IndexRune(a.CompoundOnly, key) != -1 { 94 | compoundOnly = true 95 | continue 96 | } 97 | if _, ok := a.compoundMap[key]; !ok { 98 | // the isn't a compound flag 99 | continue 100 | } 101 | // is a compound flag 102 | a.compoundMap[key] = append(a.compoundMap[key], word) 103 | } 104 | 105 | if compoundOnly { 106 | return out, nil 107 | } 108 | 109 | out = append(out, word) 110 | prefixes := make([]Affix, 0, 5) 111 | suffixes := make([]Affix, 0, 5) 112 | for _, key := range keyString { 113 | // want keyString to []?something? 114 | // then iterate over that 115 | af, ok := a.AffixMap[key] 116 | if !ok { 117 | // is it compound flag? 118 | if _, ok := a.compoundMap[key]; ok { 119 | continue 120 | } 121 | // is it a NoSuggest? 122 | if key == a.NoSuggestFlag { 123 | continue 124 | } 125 | // no idea 126 | return nil, fmt.Errorf("unable to find affix key %v", key) 127 | } 128 | if !af.CrossProduct { 129 | out = af.Expand(word, out) 130 | continue 131 | } 132 | if af.Type == Prefix { 133 | prefixes = append(prefixes, af) 134 | } else { 135 | suffixes = append(suffixes, af) 136 | } 137 | } 138 | 139 | // expand all suffixes with out any prefixes 140 | for _, suf := range suffixes { 141 | out = suf.Expand(word, out) 142 | } 143 | for _, pre := range prefixes { 144 | prewords := pre.Expand(word, nil) 145 | out = append(out, prewords...) 
146 | 147 | // now do cross product 148 | for _, suf := range suffixes { 149 | for _, w := range prewords { 150 | out = suf.Expand(w, out) 151 | } 152 | } 153 | } 154 | return out, nil 155 | } 156 | 157 | func isCrossProduct(val string) (bool, error) { 158 | switch val { 159 | case "Y": 160 | return true, nil 161 | case "N": 162 | return false, nil 163 | } 164 | return false, fmt.Errorf("CrossProduct is not Y or N: got %q", val) 165 | } 166 | 167 | // NewDictConfig reads an Hunspell AFF file 168 | func NewDictConfig(file io.Reader) (*DictConfig, error) { 169 | aff := DictConfig{ 170 | Flag: "ASCII", 171 | AffixMap: make(map[rune]Affix), 172 | compoundMap: make(map[rune][]string), 173 | CompoundMin: 3, // default in Hunspell 174 | } 175 | scanner := bufio.NewScanner(file) 176 | for scanner.Scan() { 177 | line := scanner.Text() 178 | parts := strings.Fields(line) 179 | if len(parts) == 0 { 180 | continue 181 | } 182 | switch parts[0] { 183 | case "#": 184 | continue 185 | case "TRY": 186 | if len(parts) != 2 { 187 | return nil, fmt.Errorf("TRY stanza had %d fields, expected 2", len(parts)) 188 | } 189 | aff.TryChars = parts[1] 190 | case "ICONV": 191 | // if only 2 fields, then its the first stanza that just provides a count 192 | // we don't care, as we dynamically allocate 193 | if len(parts) == 2 { 194 | continue 195 | } 196 | if len(parts) != 3 { 197 | return nil, fmt.Errorf("ICONV stanza had %d fields, expected 2", len(parts)) 198 | } 199 | // we have 3 200 | aff.IconvReplacements = append(aff.IconvReplacements, parts[1], parts[2]) 201 | case "REP": 202 | // if only 2 fields, then its the first stanza that just provides a count 203 | // we don't care, as we dynamically allocate 204 | if len(parts) == 2 { 205 | continue 206 | } 207 | if len(parts) != 3 { 208 | return nil, fmt.Errorf("REP stanza had %d fields, expected 2", len(parts)) 209 | } 210 | // we have 3 211 | aff.Replacements = append(aff.Replacements, [2]string{parts[1], parts[2]}) 212 | case 
"COMPOUNDMIN": 213 | if len(parts) != 2 { 214 | return nil, fmt.Errorf("COMPOUNDMIN stanza had %d fields, expected 2", len(parts)) 215 | } 216 | val, err := strconv.ParseInt(parts[1], 10, 64) 217 | if err != nil { 218 | return nil, fmt.Errorf("COMPOUNDMIN stanza had %q expected number", parts[1]) 219 | } 220 | aff.CompoundMin = int(val) 221 | case "ONLYINCOMPOUND": 222 | if len(parts) != 2 { 223 | return nil, fmt.Errorf("ONLYINCOMPOUND stanza had %d fields, expected 2", len(parts)) 224 | } 225 | aff.CompoundOnly = parts[1] 226 | case "COMPOUNDRULE": 227 | if len(parts) != 2 { 228 | return nil, fmt.Errorf("COMPOUNDRULE stanza had %d fields, expected 2", len(parts)) 229 | } 230 | val, err := strconv.ParseInt(parts[1], 10, 64) 231 | if err == nil { 232 | aff.CompoundRule = make([]string, 0, val) 233 | } else { 234 | aff.CompoundRule = append(aff.CompoundRule, parts[1]) 235 | for _, char := range parts[1] { 236 | if _, ok := aff.compoundMap[char]; !ok { 237 | aff.compoundMap[char] = []string{} 238 | } 239 | } 240 | } 241 | case "NOSUGGEST": 242 | if len(parts) != 2 { 243 | return nil, fmt.Errorf("NOSUGGEST stanza had %d fields, expected 2", len(parts)) 244 | } 245 | // should use runes or parse correctly 246 | chars := []rune(parts[1]) 247 | if len(chars) != 1 { 248 | return nil, fmt.Errorf("NOSUGGEST stanza had more than one flag: %q", parts[1]) 249 | } 250 | aff.NoSuggestFlag = chars[0] 251 | case "WORDCHARS": 252 | if len(parts) != 2 { 253 | return nil, fmt.Errorf("WORDCHAR stanza had %d fields, expected 2", len(parts)) 254 | } 255 | aff.WordChars = parts[1] 256 | case "FLAG": 257 | if len(parts) != 2 { 258 | return nil, fmt.Errorf("FLAG stanza had %d, expected 1", len(parts)) 259 | } 260 | aff.Flag = parts[1] 261 | return nil, fmt.Errorf("FLAG stanza not yet supported") 262 | case "PFX", "SFX": 263 | atype := Prefix 264 | if parts[0] == "SFX" { 265 | atype = Suffix 266 | } 267 | 268 | switch len(parts) { 269 | case 4: 270 | cross, err := isCrossProduct(parts[2]) 
271 | if err != nil { 272 | return nil, err 273 | } 274 | // this is a new Affix! 275 | a := Affix{ 276 | Type: atype, 277 | CrossProduct: cross, 278 | } 279 | flag := rune(parts[1][0]) 280 | aff.AffixMap[flag] = a 281 | case 5: 282 | // does this need to be split out into suffix and prefix? 283 | flag := rune(parts[1][0]) 284 | a, ok := aff.AffixMap[flag] 285 | if !ok { 286 | return nil, fmt.Errorf("Got rules for flag %q but no definition", flag) 287 | } 288 | 289 | strip := "" 290 | if parts[2] != "0" { 291 | strip = parts[2] 292 | } 293 | 294 | var matcher *regexp.Regexp 295 | var err error 296 | pat := parts[4] 297 | if pat != "." { 298 | if a.Type == Prefix { 299 | pat = "^" + pat 300 | } else { 301 | pat = pat + "$" 302 | } 303 | matcher, err = regexp.Compile(pat) 304 | if err != nil { 305 | return nil, fmt.Errorf("Unable to compile %s", pat) 306 | } 307 | } 308 | 309 | a.Rules = append(a.Rules, Rule{ 310 | Strip: strip, 311 | AffixText: parts[3], 312 | Pattern: parts[4], 313 | matcher: matcher, 314 | }) 315 | aff.AffixMap[flag] = a 316 | default: 317 | return nil, fmt.Errorf("%s stanza had %d fields, expected 4 or 5", parts[0], len(parts)) 318 | } 319 | default: 320 | // nothing 321 | } 322 | } 323 | 324 | if err := scanner.Err(); err != nil { 325 | return nil, err 326 | } 327 | 328 | return &aff, nil 329 | } 330 | -------------------------------------------------------------------------------- /aff_test.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "reflect" 5 | "strings" 6 | "testing" 7 | ) 8 | 9 | // SmokeTest for AFF parser. Contains a little bit of everything. 10 | // 11 | func TestAFFSmoke(t *testing.T) { 12 | sample := ` 13 | # 14 | 15 | TRY abc 16 | WORDCHARS 123 17 | ICONV 1 18 | ICONV a b 19 | PFX A Y 1 20 | PFX A 0 re . 
21 | SFX D Y 4 22 | SFX D 0 d e 23 | SFX D y ied [^aeiou]y 24 | SFX D 0 ed [^ey] 25 | SFX D 0 ed [aeiou]y 26 | REP 1 27 | REP a ei 28 | COMPOUNDMIN 2 29 | ` 30 | aff, err := NewDictConfig(strings.NewReader(sample)) 31 | if err != nil { 32 | t.Fatalf("Unable to parse sample: %s", err) 33 | } 34 | 35 | if aff.TryChars != "abc" { 36 | t.Errorf("TRY stanza is %s", aff.TryChars) 37 | } 38 | 39 | if aff.WordChars != "123" { 40 | t.Errorf("WORDCHARS stanza is %s", aff.WordChars) 41 | } 42 | 43 | if aff.CompoundMin != 2 { 44 | t.Errorf("COMPOUNDMIN stanza not processed, want 2 got %d", aff.CompoundMin) 45 | } 46 | 47 | if len(aff.IconvReplacements) != 2 { 48 | t.Errorf("Didn't get ICONV replacement") 49 | } else { 50 | if aff.IconvReplacements[0] != "a" || aff.IconvReplacements[1] != "b" { 51 | t.Errorf("Replacement isnt a->b, got %v", aff.IconvReplacements) 52 | } 53 | } 54 | 55 | if len(aff.Replacements) != 1 { 56 | t.Errorf("Didn't get REPlacement") 57 | } else { 58 | pair := aff.Replacements[0] 59 | if pair[0] != "a" || pair[1] != "ei" { 60 | t.Errorf("Replacement isnt [a ie] got %v", pair) 61 | } 62 | } 63 | 64 | if len(aff.AffixMap) != 2 { 65 | t.Errorf("AffixMap is wrong size") 66 | } 67 | a, ok := aff.AffixMap[rune('A')] 68 | if !ok { 69 | t.Fatalf("Didn't get Affix for A") 70 | } 71 | if a.Type != Prefix { 72 | t.Fatalf("A Affix should be PFX %v, got %v", Prefix, a.Type) 73 | } 74 | if !a.CrossProduct { 75 | t.Fatalf("A Affix should be a cross product") 76 | } 77 | 78 | variations := a.Expand("define", nil) 79 | if len(variations) != 1 { 80 | t.Fatalf("Expected 1 variation got %d", len(variations)) 81 | } 82 | if variations[0] != "redefine" { 83 | t.Errorf("Expected %s got %s", "redefine", variations[0]) 84 | } 85 | 86 | a, ok = aff.AffixMap[rune('D')] 87 | if !ok { 88 | t.Fatalf("Didn't get Affix for D") 89 | } 90 | if a.Type != Suffix { 91 | t.Fatalf("Affix D is not a SFX %v", Suffix) 92 | } 93 | if len(a.Rules) != 4 { 94 | t.Fatalf("Affix should have 4 rules, 
got %d", len(a.Rules)) 95 | } 96 | variations = a.Expand("accept", nil) 97 | if len(variations) != 1 { 98 | t.Fatalf("D Affix should have %d rules, got %d", 1, len(variations)) 99 | } 100 | if variations[0] != "accepted" { 101 | t.Errorf("Expected %s got %s", "accepted", variations[0]) 102 | } 103 | } 104 | 105 | func TestExpand(t *testing.T) { 106 | sample := ` 107 | SET UTF-8 108 | TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ' 109 | 110 | REP 2 111 | REP f ph 112 | REP ph f 113 | 114 | PFX A Y 1 115 | PFX A 0 re . 116 | 117 | SFX B Y 2 118 | SFX B 0 ed [^y] 119 | SFX B y ied y 120 | ` 121 | aff, err := NewDictConfig(strings.NewReader(sample)) 122 | if err != nil { 123 | t.Fatalf("Unable to parse sample: %s", err) 124 | } 125 | 126 | cases := []struct { 127 | word string 128 | want []string 129 | }{ 130 | {"hello", []string{"hello"}}, 131 | {"try/B", []string{"try", "tried"}}, 132 | {"work/AB", []string{"work", "worked", "rework", "reworked"}}, 133 | } 134 | for pos, tt := range cases { 135 | got, err := aff.Expand(tt.word, nil) 136 | if err != nil { 137 | t.Errorf("%d: affix expansions error: %s", pos, err) 138 | } 139 | if !reflect.DeepEqual(tt.want, got) { 140 | t.Errorf("%d: affix expansion want %v got %v", pos, tt.want, got) 141 | } 142 | } 143 | } 144 | 145 | func TestCompound(t *testing.T) { 146 | sampleAff := ` 147 | SET UTF-8 148 | COMPOUNDMIN 1 149 | ONLYINCOMPOUND c 150 | COMPOUNDRULE 2 151 | COMPOUNDRULE n*1t 152 | COMPOUNDRULE n*mp 153 | WORDCHARS 0123456789 154 | ` 155 | sampleDic := `23 156 | 0/nm 157 | 0th/pt 158 | 1/n1 159 | 1st/p 160 | 1th/tc 161 | 2/nm 162 | 2nd/p 163 | 2th/tc 164 | 3/nm 165 | 3rd/p 166 | 3th/tc 167 | 4/nm 168 | 4th/pt 169 | 5/nm 170 | 5th/pt 171 | 6/nm 172 | 6th/pt 173 | 7/nm 174 | 7th/pt 175 | 8/nm 176 | 8th/pt 177 | 9/nm 178 | 9th/pt 179 | ` 180 | aff := strings.NewReader(sampleAff) 181 | dic := strings.NewReader(sampleDic) 182 | gs, err := NewGoSpellReader(aff, dic) 183 | if err != nil { 184 | t.Fatalf("Unable to 
create GoSpell: %s", err) 185 | } 186 | 187 | cases := []struct { 188 | word string 189 | spell bool 190 | }{ 191 | {"0", true}, 192 | {"1", true}, 193 | {"2", true}, 194 | {"3", true}, 195 | {"4", true}, 196 | {"5", true}, 197 | {"6", true}, 198 | {"7", true}, 199 | {"8", true}, 200 | {"9", true}, 201 | {"10", true}, 202 | {"21", true}, 203 | {"32", true}, 204 | {"43", true}, 205 | {"54", true}, 206 | {"65", true}, 207 | {"76", true}, 208 | {"87", true}, 209 | {"98", true}, 210 | {"99", true}, 211 | {"1st", true}, 212 | {"21st", true}, 213 | {"11th", true}, 214 | {"1th", false}, 215 | {"12th", true}, 216 | {"2th", false}, 217 | {"13th", true}, 218 | {"3th", false}, 219 | {"3rd", true}, 220 | {"33rd", true}, 221 | {"4th", true}, 222 | {"5th", true}, 223 | {"6th", true}, 224 | {"7th", true}, 225 | {"8th", true}, 226 | {"9th", true}, 227 | {"14th", true}, 228 | {"15th", true}, 229 | {"16th", true}, 230 | {"17th", true}, 231 | {"18th", true}, 232 | {"19th", true}, 233 | {"111", true}, 234 | {"111st", false}, 235 | {"111th", true}, 236 | } 237 | for pos, tt := range cases { 238 | if gs.Spell(tt.word) != tt.spell { 239 | t.Errorf("%d %q was not %v", pos, tt.word, tt.spell) 240 | } 241 | } 242 | } 243 | 244 | func TestSpell(t *testing.T) { 245 | sampleAff := ` 246 | SET UTF-8 247 | WORDCHARS 0123456789 248 | 249 | PFX A Y 1 250 | PFX A 0 re . 
251 | 252 | SFX B Y 2 253 | SFX B 0 ed [^y] 254 | SFX B y ied y 255 | ` 256 | 257 | sampleDic := `4 258 | hello 259 | try/B 260 | work/AB 261 | GB 262 | ` 263 | aff := strings.NewReader(sampleAff) 264 | dic := strings.NewReader(sampleDic) 265 | gs, err := NewGoSpellReader(aff, dic) 266 | if err != nil { 267 | t.Fatalf("Unable to create GoSpell: %s", err) 268 | } 269 | 270 | cases := []struct { 271 | word string 272 | spell bool 273 | }{ 274 | {"hello", true}, 275 | {"try", true}, 276 | {"tried", true}, 277 | {"work", true}, 278 | {"worked", true}, 279 | {"rework", true}, 280 | {"reworked", true}, 281 | {"junk", false}, 282 | {"100", true}, 283 | {"1", true}, 284 | {"100GB", true}, 285 | {"100mi", false}, 286 | {"0xFF", true}, 287 | {"0x12ff", true}, 288 | } 289 | for pos, tt := range cases { 290 | if gs.Spell(tt.word) != tt.spell { 291 | t.Errorf("%d %q was not %v", pos, tt.word, tt.spell) 292 | } 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /case.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "strings" 5 | "unicode" 6 | ) 7 | 8 | // WordCase is an enum of various word casing styles 9 | type WordCase int 10 | 11 | // Various WordCase types.. 
likely to be not correct 12 | const ( 13 | AllLower WordCase = iota 14 | AllUpper 15 | Title 16 | Mixed 17 | Camel 18 | ) 19 | 20 | // CaseStyle returns what case style a word is in 21 | func CaseStyle(word string) WordCase { 22 | hasTitle := false 23 | upperCount := 0 24 | lowerCount := 0 25 | runeCount := 0 26 | 27 | // this iterates over RUNES not BYTES 28 | for _, r := range word { 29 | // ASCII apostrophe doesn't count 30 | // want words like "don't" to have 31 | // upper case forms when adding to dictionary 32 | if r == 0x0027 { 33 | continue 34 | } 35 | runeCount++ 36 | if unicode.IsLower(r) { 37 | lowerCount++ 38 | continue 39 | } 40 | if unicode.IsUpper(r) { 41 | if runeCount == 1 { 42 | hasTitle = true 43 | } 44 | upperCount++ 45 | continue 46 | } 47 | 48 | //??? 49 | } 50 | 51 | switch { 52 | case runeCount == lowerCount: 53 | return AllLower 54 | case runeCount == upperCount: 55 | return AllUpper 56 | case hasTitle && runeCount-1 == lowerCount: 57 | return Title 58 | default: 59 | return Mixed 60 | } 61 | } 62 | 63 | // CaseVariations returns 64 | // If AllUpper or First-Letter-Only is upcased: add the all upper case version 65 | // If AllLower, add the original, the title and upcase forms 66 | // If Mixed, return the original, and the all upcase form 67 | // 68 | func CaseVariations(word string, style WordCase) []string { 69 | switch style { 70 | case AllLower: 71 | return []string{word, strings.ToUpper(word[0:1]) + word[1:], strings.ToUpper(word)} 72 | case AllUpper: 73 | return []string{strings.ToUpper(word)} 74 | default: 75 | return []string{word, strings.ToUpper(word)} 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /case_test.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestCaseStyle(t *testing.T) { 9 | cases := []struct { 10 | word string 11 | want WordCase 12 | }{ 13 | 
{"lower", AllLower}, 14 | {"what's", AllLower}, 15 | {"UPPER", AllUpper}, 16 | {"Title", Title}, 17 | {"CamelCase", Mixed}, 18 | {"camelCase", Mixed}, 19 | } 20 | 21 | for pos, tt := range cases { 22 | got := CaseStyle(tt.word) 23 | if tt.want != got { 24 | t.Errorf("Case %d %q: want %v got %v", pos, tt.word, tt.want, got) 25 | } 26 | } 27 | } 28 | 29 | func TestCaseVariations(t *testing.T) { 30 | cases := []struct { 31 | word string 32 | want []string 33 | }{ 34 | {"that's", []string{"that's", "That's", "THAT'S"}}, 35 | } 36 | for pos, tt := range cases { 37 | got := CaseVariations(tt.word, CaseStyle(tt.word)) 38 | if !reflect.DeepEqual(tt.want, got) { 39 | t.Errorf("Case %d %q: want %v got %v", pos, tt.word, tt.want, got) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /cmd/gospell/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "flag" 6 | "io/ioutil" 7 | "log" 8 | "os" 9 | "path/filepath" 10 | "text/template" 11 | "time" 12 | 13 | "github.com/client9/gospell" 14 | "github.com/client9/gospell/plaintext" 15 | ) 16 | 17 | var ( 18 | stdout *log.Logger // see below in init() 19 | defaultLog *template.Template 20 | defaultWord *template.Template 21 | defaultLine *template.Template 22 | ) 23 | 24 | const ( 25 | defaultLogTmpl = `{{ .Path }}:{{ .LineNum }}:{{ js .Original }}` 26 | defaultWordTmpl = `{{ .Original }}` 27 | defaultLineTmpl = `{{ .Line }}` 28 | ) 29 | 30 | func init() { 31 | // we see it so it doesn't use a prefix or include a time stamp. 
32 | stdout = log.New(os.Stdout, "", 0) 33 | defaultLog = template.Must(template.New("defaultLog").Parse(defaultLogTmpl)) 34 | defaultWord = template.Must(template.New("defaultWord").Parse(defaultWordTmpl)) 35 | defaultLine = template.Must(template.New("defaultLine").Parse(defaultLineTmpl)) 36 | } 37 | 38 | func main() { 39 | format := flag.String("f", "", "use Golang template for log message") 40 | listOnly := flag.Bool("l", false, "only print unknown word") 41 | lineOnly := flag.Bool("L", false, "print line with unknown word") 42 | 43 | // TODO based on OS (Windows vs. Linux) 44 | dictPath := flag.String("path", ".:/usr/local/share/hunspell:/usr/share/hunspell", "Search path for dictionaries") 45 | 46 | // TODO based on environment variable settings 47 | dicts := flag.String("d", "en_US", "dictionaries to load") 48 | 49 | personalDict := flag.String("p", "", "personal wordlist file") 50 | 51 | flag.Parse() 52 | args := flag.Args() 53 | 54 | if *listOnly { 55 | defaultLog = defaultWord 56 | } 57 | 58 | if *lineOnly { 59 | defaultLog = defaultLine 60 | } 61 | 62 | if len(*format) > 0 { 63 | t, err := template.New("custom").Parse(*format) 64 | if err != nil { 65 | log.Fatalf("Unable to compile log format: %s", err) 66 | } 67 | defaultLog = t 68 | } 69 | 70 | affFile := "" 71 | dicFile := "" 72 | for _, base := range filepath.SplitList(*dictPath) { 73 | affFile = filepath.Join(base, *dicts+".aff") 74 | dicFile = filepath.Join(base, *dicts+".dic") 75 | //log.Printf("Trying %s", affFile) 76 | _, err1 := os.Stat(affFile) 77 | _, err2 := os.Stat(dicFile) 78 | if err1 == nil && err2 == nil { 79 | break 80 | } 81 | affFile = "" 82 | dicFile = "" 83 | } 84 | 85 | if affFile == "" { 86 | log.Fatalf("Unable to load %s", *dicts) 87 | } 88 | 89 | log.Printf("Loading %s %s", affFile, dicFile) 90 | timeStart := time.Now() 91 | h, err := gospell.NewGoSpell(affFile, dicFile) 92 | timeEnd := time.Now() 93 | 94 | // note: 10x too slow 95 | log.Printf("Loaded in %v", 
timeEnd.Sub(timeStart)) 96 | if err != nil { 97 | log.Fatalf("%s", err) 98 | } 99 | 100 | if *personalDict != "" { 101 | raw, err := ioutil.ReadFile(*personalDict) 102 | if err != nil { 103 | log.Fatalf("Unable to load personal dictionary %s: %s", *personalDict, err) 104 | } 105 | duplicates, err := h.AddWordList(bytes.NewReader(raw)) 106 | if err != nil { 107 | log.Fatalf("Unable to process personal dictionary %s: %s", *personalDict, err) 108 | } 109 | if len(duplicates) > 0 { 110 | for _, word := range duplicates { 111 | log.Printf("Word %q in personal dictionary already exists in main dictionary", word) 112 | } 113 | } 114 | } 115 | 116 | // stdin support 117 | if len(args) == 0 { 118 | raw, err := ioutil.ReadAll(os.Stdin) 119 | if err != nil { 120 | log.Fatalf("Unable to read Stdin: %s", err) 121 | } 122 | pt, _ := plaintext.NewIdentity() 123 | out := gospell.SpellFile(h, pt, raw) 124 | for _, diff := range out { 125 | diff.Filename = "stdin" 126 | diff.Path = "" 127 | buf := bytes.Buffer{} 128 | defaultLog.Execute(&buf, diff) 129 | // goroutine-safe print to os.Stdout 130 | stdout.Println(buf.String()) 131 | } 132 | } 133 | for _, arg := range args { 134 | // ignore directories 135 | if f, err := os.Stat(arg); err != nil || f.IsDir() { 136 | continue 137 | } 138 | 139 | raw, err := ioutil.ReadFile(arg) 140 | if err != nil { 141 | log.Fatalf("Unable to read %q: %s", arg, err) 142 | } 143 | pt, err := plaintext.ExtractorByFilename(arg) 144 | if err != nil { 145 | continue 146 | } 147 | out := gospell.SpellFile(h, pt, raw) 148 | for _, diff := range out { 149 | diff.Filename = filepath.Base(arg) 150 | diff.Path = arg 151 | buf := bytes.Buffer{} 152 | defaultLog.Execute(&buf, diff) 153 | // goroutine-safe print to os.Stdout 154 | stdout.Println(buf.String()) 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /cmd/sample/sample.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/naoina/toml" 5 | glob "github.com/ryanuber/go-glob" 6 | "io/ioutil" 7 | "log" 8 | "os" 9 | "path/filepath" 10 | "strings" 11 | 12 | "github.com/client9/gospell" 13 | "github.com/client9/gospell/plaintext" 14 | ) 15 | 16 | // Dictionary is the configuration structure 17 | type Dictionary struct { 18 | Language string `json:"language"` // core dictionary 19 | Extra []string // extra word packs 20 | //Wordlist []string // personal word list files 21 | Additions []string // inline word additions 22 | Removals []string 23 | 24 | FileSet []DictionaryFileSet `json:"fileset"` 25 | } 26 | 27 | // FileSet represents options to select or exclude a group of files 28 | type FileSet struct { 29 | Path string 30 | Include []string 31 | Exclude []string 32 | 33 | Matches []string 34 | } 35 | 36 | // DictionaryFileSet extends FileSet to include other information ont 37 | // the file type 38 | type DictionaryFileSet struct { 39 | FileSet 40 | Charset string 41 | Source string 42 | TemplateType string 43 | } 44 | 45 | func (fs *FileSet) visit(path string, info os.FileInfo, err error) error { 46 | if err != nil { 47 | log.Printf("visitor failed on %q: %s", path, err) 48 | return nil 49 | } 50 | included := false 51 | for _, inc := range fs.Include { 52 | if glob.Glob(inc, path) { 53 | included = true 54 | break 55 | } 56 | } 57 | excluded := false 58 | for _, exc := range fs.Exclude { 59 | if strings.Index(path, exc) != -1 { 60 | excluded = true 61 | break 62 | } 63 | } 64 | if included && !excluded && !info.IsDir() { 65 | fs.Matches = append(fs.Matches, path) 66 | //log.Printf("path allowed: %q", path) 67 | return nil 68 | } 69 | if !included && !excluded { 70 | //log.Printf("path ignored: %q", path) 71 | return nil 72 | } 73 | if excluded && info.IsDir() { 74 | //log.Printf("path ignoring directory %q", path) 75 | return filepath.SkipDir 76 | } 77 | 
//log.Printf("Included then excluded: %q", path) 78 | return nil 79 | } 80 | 81 | func main() { 82 | config, err := ioutil.ReadFile(".spelling.toml") 83 | if err != nil { 84 | log.Fatalf("Unable to reading config: %s", err) 85 | } 86 | //log.Printf("JSON: %s", cson.ToJSON([]byte(config))) 87 | s := Dictionary{} 88 | err = toml.Unmarshal([]byte(config), &s) 89 | if err != nil { 90 | log.Printf("out : %+v", s) 91 | log.Fatalf("err = %v", err) 92 | } 93 | if s.Language == "" { 94 | s.Language = "en_US" 95 | } 96 | if s.Language != "en_US" { 97 | log.Fatalf("Only support en_US: got %q", s.Language) 98 | } 99 | gs, err := gospell.NewGoSpell("/usr/local/share/hunspell/en_US.aff", "/usr/local/share/hunspell/en_US.dic") 100 | if err != nil { 101 | log.Fatalf("Unable to load dictionary: %s", err) 102 | } 103 | 104 | for _, wordfile := range s.Extra { 105 | _, err := gs.AddWordListFile(wordfile) 106 | if err != nil { 107 | log.Printf("Unable to read word list %s: %s", wordfile, err) 108 | } 109 | } 110 | 111 | for _, word := range s.Additions { 112 | log.Printf("Adding %q", word) 113 | gs.AddWordRaw(word) 114 | } 115 | 116 | if len(s.FileSet) == 0 { 117 | s.FileSet = append(s.FileSet, DictionaryFileSet{ 118 | FileSet: FileSet{ 119 | Path: ".", 120 | Include: []string{"*"}, 121 | Exclude: []string{".git"}, 122 | }, 123 | }) 124 | } 125 | finalExit := 0 126 | for _, fs := range s.FileSet { 127 | if fs.Path == "" { 128 | fs.Path = "." 
129 | } 130 | filepath.Walk(fs.Path, fs.visit) 131 | for _, filename := range fs.Matches { 132 | raw, err := ioutil.ReadFile(filename) 133 | if err != nil { 134 | log.Printf("Unable to read %q: %s", filename, err) 135 | finalExit = finalExit | 2 136 | continue 137 | } 138 | pt, err := plaintext.ExtractorByFilename(filename) 139 | if err != nil { 140 | continue 141 | } 142 | out := gospell.SpellFile(gs, pt, raw) 143 | for _, diff := range out { 144 | diff.Filename = filepath.Base(filename) 145 | diff.Path = filename 146 | finalExit = finalExit | 1 147 | log.Printf("Got a %s:%d %s", diff.Path, diff.LineNum, diff.Original) 148 | } 149 | } 150 | } 151 | os.Exit(finalExit) 152 | } 153 | -------------------------------------------------------------------------------- /file.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "github.com/client9/gospell/plaintext" 5 | 6 | "strings" 7 | ) 8 | 9 | // Diff represent a unknown word in a file 10 | type Diff struct { 11 | Filename string 12 | Path string 13 | Original string 14 | Line string 15 | LineNum int 16 | } 17 | 18 | // SpellFile is attempts to spell-check a file. This interface is not 19 | // very good so expect changes. 
20 | func SpellFile(gs *GoSpell, ext plaintext.Extractor, raw []byte) []Diff { 21 | out := []Diff{} 22 | 23 | // remove any golang templates 24 | raw = plaintext.StripTemplate(raw) 25 | 26 | // extract plain text 27 | raw = ext.Text(raw) 28 | 29 | // do character conversion "smart quotes" to quotes, etc 30 | // as specified in the Affix file 31 | rawstring := gs.InputConversion(raw) 32 | 33 | // zap URLS 34 | s := RemoveURL(rawstring) 35 | // zap file paths 36 | s = RemovePath(s) 37 | 38 | for linenum, line := range strings.Split(s, "\n") { 39 | // now get words 40 | words := gs.Split(line) 41 | for _, word := range words { 42 | // HACK 43 | word = strings.Trim(word, "'") 44 | if known := gs.Spell(word); !known { 45 | out = append(out, Diff{ 46 | Line: line, 47 | LineNum: linenum + 1, 48 | Original: word, 49 | }) 50 | } 51 | } 52 | } 53 | return out 54 | } 55 | -------------------------------------------------------------------------------- /gospell.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "log" 8 | "os" 9 | "regexp" 10 | "strconv" 11 | "strings" 12 | ) 13 | 14 | // GoSpell is main struct 15 | type GoSpell struct { 16 | Config DictConfig 17 | Dict map[string]struct{} // likely will contain some value later 18 | 19 | ireplacer *strings.Replacer // input conversion 20 | compounds []*regexp.Regexp 21 | splitter *Splitter 22 | } 23 | 24 | // InputConversion does any character substitution before checking 25 | // This is based on the ICONV stanza 26 | func (s *GoSpell) InputConversion(raw []byte) string { 27 | sraw := string(raw) 28 | if s.ireplacer == nil { 29 | return sraw 30 | } 31 | return s.ireplacer.Replace(sraw) 32 | } 33 | 34 | // Split a text into Words 35 | func (s *GoSpell) Split(text string) []string { 36 | return s.splitter.Split(text) 37 | } 38 | 39 | // AddWordRaw adds a single word to the internal dictionary without modifications 40 | // 
returns true if added 41 | // return false is already exists 42 | func (s *GoSpell) AddWordRaw(word string) bool { 43 | _, ok := s.Dict[word] 44 | if ok { 45 | // already exists 46 | return false 47 | } 48 | s.Dict[word] = struct{}{} 49 | return true 50 | } 51 | 52 | // AddWordListFile reads in a word list file 53 | func (s *GoSpell) AddWordListFile(name string) ([]string, error) { 54 | fd, err := os.Open(name) 55 | if err != nil { 56 | return nil, err 57 | } 58 | defer fd.Close() 59 | return s.AddWordList(fd) 60 | } 61 | 62 | // AddWordList adds basic word lists, just one word per line 63 | // Assumed to be in UTF-8 64 | // TODO: hunspell compatible with "*" prefix for forbidden words 65 | // and affix support 66 | // returns list of duplicated words and/or error 67 | func (s *GoSpell) AddWordList(r io.Reader) ([]string, error) { 68 | var duplicates []string 69 | scanner := bufio.NewScanner(r) 70 | for scanner.Scan() { 71 | line := strings.TrimSpace(scanner.Text()) 72 | if len(line) == 0 || line == "#" { 73 | continue 74 | } 75 | for _, word := range CaseVariations(line, CaseStyle(line)) { 76 | if !s.AddWordRaw(word) { 77 | duplicates = append(duplicates, word) 78 | } 79 | } 80 | } 81 | if err := scanner.Err(); err != nil { 82 | return duplicates, err 83 | } 84 | return duplicates, nil 85 | } 86 | 87 | // Spell checks to see if a given word is in the internal dictionaries 88 | // TODO: add multiple dictionaries 89 | func (s *GoSpell) Spell(word string) bool { 90 | //log.Printf("Checking %s", word) 91 | _, ok := s.Dict[word] 92 | if ok { 93 | return true 94 | } 95 | if isNumber(word) { 96 | return true 97 | } 98 | if isNumberHex(word) { 99 | return true 100 | } 101 | 102 | if isNumberBinary(word) { 103 | return true 104 | } 105 | 106 | if isHash(word) { 107 | return true 108 | } 109 | 110 | // check compounds 111 | for _, pat := range s.compounds { 112 | if pat.MatchString(word) { 113 | return true 114 | } 115 | } 116 | 117 | // Maybe a word with units? e.g. 
100GB 118 | units := isNumberUnits(word) 119 | if units != "" { 120 | // dictionary appears to have list of units 121 | if _, ok = s.Dict[units]; ok { 122 | return true 123 | } 124 | } 125 | 126 | // if camelCase and each word e.g. "camel" "Case" is know 127 | // then the word is considered known 128 | if chunks := splitCamelCase(word); len(chunks) > 0 { 129 | if false { 130 | for _, chunk := range chunks { 131 | if _, ok = s.Dict[chunk]; !ok { 132 | return false 133 | } 134 | } 135 | } 136 | return true 137 | } 138 | 139 | return false 140 | } 141 | 142 | // NewGoSpellReader creates a speller from io.Readers for 143 | // Hunspell files 144 | func NewGoSpellReader(aff, dic io.Reader) (*GoSpell, error) { 145 | affix, err := NewDictConfig(aff) 146 | if err != nil { 147 | return nil, err 148 | } 149 | 150 | scanner := bufio.NewScanner(dic) 151 | // get first line 152 | if !scanner.Scan() { 153 | return nil, scanner.Err() 154 | } 155 | line := scanner.Text() 156 | i, err := strconv.ParseInt(line, 10, 64) 157 | if err != nil { 158 | return nil, err 159 | } 160 | 161 | gs := GoSpell{ 162 | Dict: make(map[string]struct{}, i*5), 163 | compounds: make([]*regexp.Regexp, 0, len(affix.CompoundRule)), 164 | splitter: NewSplitter(affix.WordChars), 165 | } 166 | 167 | words := []string{} 168 | for scanner.Scan() { 169 | line := scanner.Text() 170 | words, err = affix.Expand(line, words) 171 | if err != nil { 172 | return nil, fmt.Errorf("Unable to process %q: %s", line, err) 173 | } 174 | 175 | if len(words) == 0 { 176 | //log.Printf("No words for %s", line) 177 | continue 178 | } 179 | 180 | style := CaseStyle(words[0]) 181 | for _, word := range words { 182 | for _, wordform := range CaseVariations(word, style) { 183 | gs.Dict[wordform] = struct{}{} 184 | } 185 | } 186 | } 187 | 188 | if err := scanner.Err(); err != nil { 189 | return nil, err 190 | } 191 | 192 | for _, compoundRule := range affix.CompoundRule { 193 | pattern := "^" 194 | for _, key := range compoundRule { 195 
| switch key { 196 | case '(', ')', '+', '?', '*': 197 | pattern = pattern + string(key) 198 | default: 199 | groups := affix.compoundMap[key] 200 | pattern = pattern + "(" + strings.Join(groups, "|") + ")" 201 | } 202 | } 203 | pattern = pattern + "$" 204 | pat, err := regexp.Compile(pattern) 205 | if err != nil { 206 | log.Printf("REGEXP FAIL= %q %s", pattern, err) 207 | } else { 208 | gs.compounds = append(gs.compounds, pat) 209 | } 210 | 211 | } 212 | 213 | if len(affix.IconvReplacements) > 0 { 214 | gs.ireplacer = strings.NewReplacer(affix.IconvReplacements...) 215 | } 216 | return &gs, nil 217 | } 218 | 219 | // NewGoSpell from AFF and DIC Hunspell filenames 220 | func NewGoSpell(affFile, dicFile string) (*GoSpell, error) { 221 | aff, err := os.Open(affFile) 222 | if err != nil { 223 | return nil, fmt.Errorf("Unable to open aff: %s", err) 224 | } 225 | defer aff.Close() 226 | dic, err := os.Open(dicFile) 227 | if err != nil { 228 | return nil, fmt.Errorf("Unable to open dic: %s", err) 229 | } 230 | defer dic.Close() 231 | h, err := NewGoSpellReader(aff, dic) 232 | return h, err 233 | } 234 | -------------------------------------------------------------------------------- /notwords.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "bytes" 5 | "strings" 6 | ) 7 | 8 | // Functions to remove non-words such as URLs, file paths, etc. 9 | 10 | // This needs auditing as I believe it is wrong 11 | func enURLChar(c rune) bool { 12 | return (c >= 'a' && c <= 'z') || 13 | (c >= 'A' && c <= 'Z') || 14 | (c >= '0' && c <= '9') || 15 | c == '-' || 16 | c == '_' || 17 | c == '\\' || 18 | c == '.' || 19 | c == ':' || 20 | c == ';' || 21 | c == '/' || 22 | c == '~' || 23 | c == '%' || 24 | c == '*' || 25 | c == '$' || 26 | c == '[' || 27 | c == ']' || 28 | c == '?' || 29 | c == '#' || 30 | c == '!' 
31 | } 32 | func enNotURLChar(c rune) bool { 33 | return !enURLChar(c) 34 | } 35 | 36 | // RemoveURL attempts to strip away obvious URLs 37 | // 38 | func RemoveURL(s string) string { 39 | var idx int 40 | 41 | for { 42 | if idx = strings.Index(s, "http"); idx == -1 { 43 | return s 44 | } 45 | 46 | news := s[:idx] 47 | endx := strings.IndexFunc(s[idx:], enNotURLChar) 48 | if endx != -1 { 49 | news = news + " " + s[idx+endx:] 50 | } 51 | s = news 52 | } 53 | } 54 | 55 | // RemovePath attempts to strip away embedded file system paths, e.g. 56 | // /foo/bar or /static/myimg.png 57 | // 58 | // TODO: windows style 59 | // 60 | func RemovePath(s string) string { 61 | out := bytes.Buffer{} 62 | var idx int 63 | for len(s) > 0 { 64 | if idx = strings.IndexByte(s, '/'); idx == -1 { 65 | out.WriteString(s) 66 | break 67 | } 68 | 69 | if idx > 0 { 70 | idx-- 71 | } 72 | 73 | var chclass string 74 | switch s[idx] { 75 | case '/', ' ', '\n', '\t', '\r': 76 | chclass = " \n\r\t" 77 | case '[': 78 | chclass = "]\n" 79 | case '(': 80 | chclass = ")\n" 81 | default: 82 | out.WriteString(s[:idx+2]) 83 | s = s[idx+2:] 84 | continue 85 | } 86 | 87 | endx := strings.IndexAny(s[idx+1:], chclass) 88 | if endx != -1 { 89 | out.WriteString(s[:idx+1]) 90 | out.Write(bytes.Repeat([]byte{' '}, endx)) 91 | s = s[idx+endx+1:] 92 | } else { 93 | out.WriteString(s) 94 | break 95 | } 96 | } 97 | return out.String() 98 | } 99 | -------------------------------------------------------------------------------- /notwords_test.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestRemovePath(t *testing.T) { 8 | cases := []struct { 9 | word string 10 | want string 11 | }{ 12 | {" /foo/bar abc", " abc"}, 13 | {"X/foo/bar abc", "X/foo/bar abc"}, 14 | {"[/foo/bar] abc", "[ ] abc"}, 15 | {"/", "/"}, 16 | } 17 | for pos, tt := range cases { 18 | got := RemovePath(tt.word) 19 | if got != tt.want { 20 | 
t.Errorf("%d want %q got %q", pos, tt.want, got) 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /plaintext/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Nick Galbreath 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /plaintext/Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: install lint test 3 | 4 | install: 5 | go get ./... 6 | go install ./... 7 | 8 | lint: 9 | golint ./... 10 | go vet ./... 11 | find . -name '*.go' | xargs gofmt -w -s 12 | 13 | test: 14 | go test . 15 | misspell *.md *.go 16 | 17 | clean: 18 | rm -f *~ 19 | go clean ./... 
20 | git gc 21 | 22 | ci: install lint test 23 | 24 | docker-ci: 25 | docker run --rm \ 26 | -e COVERALLS_REPO_TOKEN=$COVERALLS_REPO_TOKEN \ 27 | -v $(PWD):/go/src/github.com/client9/plaintext \ 28 | -w /go/src/github.com/client9/plaintext \ 29 | nickg/golang-dev-docker \ 30 | make ci 31 | 32 | .PHONY: ci docker-ci 33 | -------------------------------------------------------------------------------- /plaintext/README.md: -------------------------------------------------------------------------------- 1 | # plaintext 2 | [![Build Status](https://travis-ci.org/client9/plaintext.svg?branch=master)](https://travis-ci.org/client9/plaintext) [![Go Report Card](http://goreportcard.com/badge/client9/plaintext)](http://goreportcard.com/report/client9/plaintext) [![GoDoc](https://godoc.org/github.com/client9/plaintext?status.svg)](https://godoc.org/github.com/client9/plaintext) [![Coverage](http://gocover.io/_badge/github.com/client9/plaintext)](http://gocover.io/github.com/client9/plaintext) [![license](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](https://raw.githubusercontent.com/client9/plaintext/master/LICENSE) 3 | 4 | Extract human languages in plain UTF-8 text from computer code and markup 5 | 6 | The output is (or should be) *line-preserving*, meaning, no new lines are added or subtracted. 7 | 8 | ```html 9 |

10 | foo 11 |

12 | ``` 13 | 14 | becomes 15 | 16 | ```html 17 | 18 | foo 19 | 20 | ``` 21 | 22 | -------------------------------------------------------------------------------- /plaintext/cmd/plaintext/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "io/ioutil" 6 | "log" 7 | "os" 8 | 9 | "github.com/client9/plaintext" 10 | ) 11 | 12 | func main() { 13 | extension := flag.String("s", "", "over-ride file suffix to determine parser") 14 | flag.Parse() 15 | ext := *extension 16 | if ext != "" && ext[0] != '.' { 17 | ext = "." + ext 18 | } 19 | args := flag.Args() 20 | 21 | // stdin support 22 | if len(args) == 0 { 23 | raw, err := ioutil.ReadAll(os.Stdin) 24 | if err != nil { 25 | log.Fatalf("Unable to read Stdin: %s", err) 26 | } 27 | md, err := plaintext.ExtractorByFilename("stdin" + *extension) 28 | if err != nil { 29 | log.Fatalf("Unable to create parser: %s", err) 30 | } 31 | 32 | raw = plaintext.StripTemplate(raw) 33 | os.Stdout.Write(md.Text(raw)) 34 | } 35 | 36 | for _, arg := range args { 37 | raw, err := ioutil.ReadFile(arg) 38 | if err != nil { 39 | log.Fatalf("Unable to read %q: %s", arg, err) 40 | } 41 | md, err := plaintext.ExtractorByFilename(arg + *extension) 42 | if err != nil { 43 | log.Fatalf("Unable to create parser: %s", err) 44 | } 45 | 46 | raw = plaintext.StripTemplate(raw) 47 | os.Stdout.Write(md.Text(raw)) 48 | os.Stdout.Write([]byte{'\n'}) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /plaintext/golang.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "bytes" 5 | "text/scanner" 6 | ) 7 | 8 | // GolangText extracts plaintext from Golang and other similar C or Java like files 9 | // 10 | // Need to study. 
https://godoc.org/github.com/fluhus/godoc-tricks 11 | // Does not process embedded code blocks 12 | // 13 | type GolangText struct { 14 | } 15 | 16 | // NewGolangText creates a new extractor 17 | func NewGolangText() (*GolangText, error) { 18 | return &GolangText{}, nil 19 | } 20 | 21 | // Text satisfies the Extractor interface 22 | // 23 | //ReplaceGo is a specialized routine for correcting Golang source 24 | // files. Currently only checks comments, not identifiers for 25 | // spelling. 26 | // 27 | // Other items: 28 | // - check strings, but need to ignore 29 | // * import "statements" blocks 30 | // * import ( "blocks" ) 31 | // - skip first comment (line 0) if build comment 32 | // 33 | func (p *GolangText) Text(raw []byte) []byte { 34 | out := bytes.Buffer{} 35 | s := scanner.Scanner{} 36 | s.Init(bytes.NewReader(raw)) 37 | s.Error = (func(s *scanner.Scanner, msg string) {}) 38 | s.Mode = scanner.ScanIdents | scanner.ScanFloats | scanner.ScanChars | scanner.ScanStrings | scanner.ScanRawStrings | scanner.ScanComments 39 | for { 40 | switch s.Scan() { 41 | case scanner.Comment: 42 | out.WriteString(s.TokenText()) 43 | out.WriteByte('\n') 44 | case scanner.EOF: 45 | return out.Bytes() 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /plaintext/html.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "bytes" 5 | "golang.org/x/net/html" 6 | ) 7 | 8 | var blockTag = map[string]struct{}{ 9 | "br": {}, 10 | "h1": {}, 11 | "h2": {}, 12 | "h3": {}, 13 | "h4": {}, 14 | "h5": {}, 15 | "pre": {}, 16 | "li": {}, 17 | "p": {}, 18 | "div": {}, 19 | "blockquote": {}, 20 | } 21 | 22 | func isBlock(tag []byte) bool { 23 | _, ok := blockTag[string(tag)] 24 | return ok 25 | } 26 | 27 | // count number of newlines in a text block 28 | func countNewlines(raw []byte) int { 29 | count := 0 30 | for idx := bytes.IndexByte(raw, '\n'); idx != -1 && idx < 
len(raw); raw = raw[idx:] { 31 | count++ 32 | idx++ 33 | } 34 | return count 35 | } 36 | 37 | // HTMLText extracts plain text from HTML markup 38 | type HTMLText struct { 39 | InspectImageAlt bool 40 | } 41 | 42 | // InspectImageAlt is a sample for options WIP 43 | func InspectImageAlt(opt *HTMLText) error { 44 | opt.InspectImageAlt = true 45 | return nil 46 | } 47 | 48 | // NewHTMLText creates a new HTMLText extractor, using options. 49 | func NewHTMLText(options ...func(*HTMLText) error) (*HTMLText, error) { 50 | extractor := HTMLText{} 51 | for _, option := range options { 52 | err := option(&extractor) 53 | if err != nil { 54 | return nil, err 55 | } 56 | } 57 | return &extractor, nil 58 | } 59 | 60 | // Text satisfies the plaintext.Extractor interface 61 | func (p *HTMLText) Text(raw []byte) []byte { 62 | isCodeTag := false 63 | isStyleTag := false 64 | isScriptTag := false 65 | 66 | out := bytes.Buffer{} 67 | 68 | z := html.NewTokenizer(bytes.NewReader(raw)) 69 | for { 70 | tt := z.Next() 71 | switch tt { 72 | case html.ErrorToken: 73 | return out.Bytes() 74 | case html.StartTagToken: 75 | tn, hasAttr := z.TagName() 76 | if bytes.Equal(tn, []byte("code")) { 77 | isCodeTag = true 78 | continue 79 | } 80 | if bytes.Equal(tn, []byte("style")) { 81 | isStyleTag = true 82 | continue 83 | } 84 | if bytes.Equal(tn, []byte("script")) { 85 | isScriptTag = true 86 | continue 87 | } 88 | if bytes.Equal(tn, []byte("img")) { 89 | var key, val []byte 90 | for hasAttr { 91 | key, val, hasAttr = z.TagAttr() 92 | if len(val) > 0 && bytes.Equal(key, []byte("alt")) { 93 | out.Write(val) 94 | out.Write([]byte(" ")) 95 | } 96 | } 97 | } 98 | case html.EndTagToken: 99 | tn, _ := z.TagName() 100 | if bytes.Equal(tn, []byte("code")) { 101 | isCodeTag = false 102 | continue 103 | } 104 | if bytes.Equal(tn, []byte("style")) { 105 | isStyleTag = false 106 | continue 107 | } 108 | if bytes.Equal(tn, []byte("script")) { 109 | isScriptTag = false 110 | continue 111 | } 112 | case 
html.TextToken: 113 | if isCodeTag || isStyleTag || isScriptTag { 114 | // we want to preserve the line count 115 | out.Write(bytes.Repeat([]byte{'\n'}, countNewlines(z.Text()))) 116 | continue 117 | } 118 | out.Write([]byte(z.Text())) 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /plaintext/html_test.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | func TestHTML(t *testing.T) { 9 | cases := []struct { 10 | text string 11 | want string 12 | }{ 13 | { 14 | `1 15 | 2 16 | 3 20 | 7`, 21 | `1 22 | 2 23 | 3 24 | 25 | 26 | 27 | 7`, 28 | }, 29 | } 30 | for pos, tt := range cases { 31 | mt, err := NewHTMLText() 32 | if err != nil { 33 | t.Fatalf("Unable to run test: %s", err) 34 | } 35 | got := string(mt.Text([]byte(tt.text))) 36 | lenGot := len(strings.Split(got, "\n")) 37 | lenWant := len(strings.Split(tt.want, "\n")) 38 | if lenGot != lenWant { 39 | t.Errorf("Test %d failed: want %d got %d lines ", pos, lenWant, lenGot) 40 | } 41 | if tt.want != got { 42 | t.Errorf("Test %d failed: want %q, got %q", pos, tt.want, got) 43 | } 44 | 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /plaintext/identity.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | // Identity provides a pass-through plain text extractor 4 | type Identity struct { 5 | } 6 | 7 | // NewIdentity creates an identity-extractor 8 | func NewIdentity() (*Identity, error) { 9 | return &Identity{}, nil 10 | } 11 | 12 | // Text satisfies the plaintext.Extractor interface 13 | func (p *Identity) Text(raw []byte) []byte { 14 | return raw 15 | } 16 | -------------------------------------------------------------------------------- /plaintext/identity_test.go: 
-------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | // TestIdentity test is input is passed back 8 | func TestIdentity(t *testing.T) { 9 | p, err := NewIdentity() 10 | if err != nil { 11 | t.Fatalf("unable to run test") 12 | } 13 | raw := []byte("whatever[]<>") 14 | orig := string(raw) 15 | got := string(p.Text(raw)) 16 | if got != orig { 17 | t.Errorf("identity failed") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /plaintext/markdown.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "bytes" 5 | "regexp" 6 | ) 7 | 8 | var allSymbols = regexp.MustCompile("^[ =*|-]*$") 9 | var linkTarget = regexp.MustCompile(`\]\([^ )]*\)?`) 10 | var blockQuote = regexp.MustCompile("^>[ >]*") 11 | var leadingHeadline = regexp.MustCompile("^ *#+ *") 12 | var trailingHeadline = regexp.MustCompile(" *#+ *$") 13 | 14 | // code fences can have leading whitespace apparently 15 | var codeFence = regexp.MustCompile("^\\s*```") 16 | 17 | // single line, single back quote code snippet 18 | // this is the most common case although markdown 19 | // apparently supports ``...`\n\n....`` style multi-line 20 | // to allow embedded back quotes 21 | var simpleCode = regexp.MustCompile("`[^`]+`") 22 | 23 | // MarkdownText extracts plain text from markdown sources 24 | type MarkdownText struct { 25 | Extractor Extractor 26 | } 27 | 28 | // NewMarkdownText creates a new extractor 29 | func NewMarkdownText(options ...func(*MarkdownText) error) (*MarkdownText, error) { 30 | processor := MarkdownText{} 31 | for _, option := range options { 32 | err := option(&processor) 33 | if err != nil { 34 | return nil, err 35 | } 36 | } 37 | 38 | if processor.Extractor == nil { 39 | e, err := NewHTMLText() 40 | if err != nil { 41 | return nil, err 42 | } 43 | processor.Extractor = e 44 | } 45 | 
46 | return &processor, nil 47 | } 48 | 49 | func cleanupLine(s []byte) []byte { 50 | 51 | // strip away various headings from back and front 52 | s = leadingHeadline.ReplaceAll(s, nil) 53 | s = trailingHeadline.ReplaceAll(s, nil) 54 | 55 | // strip away leading "> > > " from block quotes 56 | s = blockQuote.ReplaceAll(s, nil) 57 | 58 | // is all "-", "=", "*", "|" make empty 59 | // this eliminates various HR variations and 60 | // table decoration and is not a word anyways 61 | if allSymbols.Match(s) { 62 | return []byte{} 63 | } 64 | 65 | s = simpleCode.ReplaceAll(s, nil) 66 | 67 | // there is no reason to NOT replace `*` `~` or `_` with a space character 68 | // not used in words 69 | s = bytes.Replace(s, []byte{'*'}, nil, -1) 70 | s = bytes.Replace(s, []byte{'~'}, nil, -1) 71 | s = bytes.Replace(s, []byte{'_'}, nil, -1) 72 | 73 | // links. [link](/MyURI) 74 | // Stuff inside the "link" can be on different lines, but "](/URI)" 75 | // is all on one line so we can delete ](....space ) 76 | // ![ is for images 77 | s = bytes.Replace(s, []byte{'!', '['}, nil, -1) 78 | s = bytes.Replace(s, []byte{'['}, nil, -1) 79 | s = linkTarget.ReplaceAll(s, nil) 80 | return s 81 | } 82 | 83 | // Text extracts text from a markdown source 84 | func (p *MarkdownText) Text(text []byte) []byte { 85 | inCodeFence := false 86 | inCodeIndent := false 87 | 88 | buf := bytes.Buffer{} 89 | lines := bytes.Split(text, []byte{'\n'}) 90 | for pos, line := range lines { 91 | if pos > 0 { 92 | buf.WriteByte('\n') 93 | } 94 | 95 | if codeFence.Match(line) { 96 | inCodeFence = !inCodeFence 97 | continue 98 | } 99 | 100 | if bytes.HasPrefix(line, []byte{' ', ' ', ' ', ' '}) { 101 | inCodeIndent = !inCodeIndent 102 | continue 103 | } 104 | 105 | if !inCodeFence && !inCodeIndent { 106 | buf.Write(cleanupLine(line)) 107 | } 108 | } 109 | return p.Extractor.Text(buf.Bytes()) 110 | } 111 | -------------------------------------------------------------------------------- /plaintext/markdown_test.go: 
-------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestMD(t *testing.T) { 8 | cases := []struct { 9 | text string 10 | want string 11 | }{ 12 | {"\nfoo bar\n", "\nfoo bar\n"}, 13 | {"\nfoo bar\n", "\nfoo bar\n"}, 14 | {"\n\nfoo bar\n", "\n\nfoo bar\n"}, 15 | {"\nfoo\nbar\n", "\nfoo\nbar\n"}, 16 | {"\nfoo\n\nbar\n", "\nfoo\n\nbar\n"}, 17 | {"*italic*", "italic"}, 18 | {"**bold**", "bold"}, 19 | {"_emphasis_", "emphasis"}, 20 | {"**combo _text_**", "combo text"}, 21 | {"~~strike~~", "strike"}, 22 | {"# heading1\nfoo", "heading1\nfoo"}, 23 | 24 | // in-line code should be ignored 25 | {"first `middle` last", "first last"}, 26 | 27 | // auto-links really should be ignore, but they get removed in plain-text tokenizer 28 | {"first http://foobar.com/apple last ", "first http://foobar.com/apple last "}, 29 | 30 | // links 31 | {"foo\n[hello world](http://foobar.com/apple) foo ", "foo\nhello world foo "}, 32 | {"[Visit GitHub!](https://www.github.com)", "Visit GitHub!"}, 33 | 34 | // images 35 | {"![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)", "Image of Yaktocat"}, 36 | {"![GitHub Logo](/images/logo.png)", "GitHub Logo"}, 37 | 38 | // code fence 39 | {"```\ncode\n```\nnotcode", "\n\n\nnotcode"}, 40 | 41 | // indented code fence 42 | {" ```\ncode\n ```\nnotcode", "\n\n\nnotcode"}, 43 | 44 | // blockquote 45 | {"> blockquote1\n> blockquote2\n", "blockquote1\nblockquote2\n"}, 46 | 47 | // entity 48 | {"<", "<"}, 49 | } 50 | 51 | mt, err := NewMarkdownText() 52 | if err != nil { 53 | t.Fatalf("Unable to run test: %s", err) 54 | } 55 | 56 | for pos, tt := range cases { 57 | got := string(mt.Text([]byte(tt.text))) 58 | if tt.want != got { 59 | t.Errorf("Test %d failed: want %q, got %q", pos, tt.want, got) 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /plaintext/mime.go: 
-------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "errors" 5 | "strings" 6 | ) 7 | 8 | // returns the mime type of the full filename if none 9 | func getSuffix(filename string) string { 10 | idx := strings.LastIndex(filename, ".") 11 | if idx == -1 || idx+1 == len(filename) { 12 | return filename 13 | } 14 | return filename[idx+1:] 15 | } 16 | 17 | // ExtractorByFilename returns an plaintext extractor based on 18 | // filename heuristic 19 | func ExtractorByFilename(filename string) (Extractor, error) { 20 | var e Extractor 21 | var err error 22 | switch getSuffix(filename) { 23 | case "md", "markdown": 24 | e, err = NewMarkdownText() 25 | case "html": 26 | e, err = NewHTMLText() 27 | case "go", "h", "c", "java", "hxx", "cxx", "js": 28 | e, err = NewGolangText() 29 | case "py", "sh", "pl", "Makefile", "Dockerfile": 30 | e, err = NewScriptText() 31 | case "txt", "stdin": 32 | e, err = NewIdentity() 33 | default: 34 | err = errors.New("unknown file type") 35 | } 36 | if err != nil { 37 | return nil, err 38 | } 39 | return e, nil 40 | } 41 | -------------------------------------------------------------------------------- /plaintext/script.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "bytes" 5 | ) 6 | 7 | // ScriptText extract plaintext from "generic script" languages 8 | // that use the '#' character to denote a comment line 9 | // It's not so smart. 
10 | // TODO: add support for Ruby, multi-line comment 11 | // http://www.tutorialspoint.com/ruby/ruby_comments.htm 12 | type ScriptText struct { 13 | } 14 | 15 | // NewScriptText creates a new file extractor 16 | func NewScriptText() (*ScriptText, error) { 17 | return &ScriptText{}, nil 18 | } 19 | 20 | // Text extracts plaintext 21 | func (p *ScriptText) Text(text []byte) []byte { 22 | buf := bytes.Buffer{} 23 | lines := bytes.Split(text, []byte{'\n'}) 24 | for pos, line := range lines { 25 | if pos > 0 { 26 | buf.WriteByte('\n') 27 | } 28 | 29 | // BUG: if '#' is in a string 30 | if idx := bytes.IndexByte(line, '#'); idx != -1 { 31 | buf.Write(line[idx:]) 32 | } 33 | } 34 | return buf.Bytes() 35 | } 36 | -------------------------------------------------------------------------------- /plaintext/script_test.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestScript(t *testing.T) { 8 | cases := []struct { 9 | text string 10 | want string 11 | }{ 12 | {"\nfoo1\n# line2bar\nfoo3", "\n\n# line2bar\n"}, 13 | {"\nfoo1\nline2# bar\nfoo3", "\n\n# bar\n"}, 14 | } 15 | mt, err := NewScriptText() 16 | if err != nil { 17 | t.Fatalf("Unable to run test: %s", err) 18 | } 19 | for pos, tt := range cases { 20 | got := string(mt.Text([]byte(tt.text))) 21 | if tt.want != got { 22 | t.Errorf("Test %d failed: want %q, got %q", pos, tt.want, got) 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /plaintext/template.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "regexp" 5 | ) 6 | 7 | // StripTemplate is a WIP on remove golang template markup from a file 8 | func StripTemplate(raw []byte) []byte { 9 | r, err := regexp.Compile(`({{[^}]+}})`) 10 | if err != nil { 11 | panic(err) 12 | } 13 | return r.ReplaceAllLiteral(raw, []byte{0x20}) 14 | } 
15 | -------------------------------------------------------------------------------- /plaintext/template_test.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestStrip(t *testing.T) { 8 | orig := []byte("foo{{ junk }}bar") 9 | want := "foo bar" 10 | got := string(StripTemplate(orig)) 11 | 12 | if got != want { 13 | t.Errorf("Want %q, Got %q", want, got) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /plaintext/text.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | // Extractor is an interface for extracting plaintext 4 | type Extractor interface { 5 | Text([]byte) []byte 6 | } 7 | -------------------------------------------------------------------------------- /words.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | "unicode" 7 | ) 8 | 9 | // number form, may include dots, commas and dashes 10 | var numberRegexp = regexp.MustCompile("^([0-9]+[.,-]?)+$") 11 | 12 | // number form with units, e.g. 123ms, 12in 1ft 13 | var numberUnitsRegexp = regexp.MustCompile("^[0-9]+[a-zA-Z]+$") 14 | 15 | // 0x12FF or 0x1B or x12FF 16 | // does anyone use 0XFF ?? 17 | var numberHexRegexp = regexp.MustCompile("^0?[x][0-9A-Fa-f]+$") 18 | 19 | var numberBinaryRegexp = regexp.MustCompile("^0[b][01]+$") 20 | 21 | var camelCaseRegexp1 = regexp.MustCompile("[A-Z]+") 22 | 23 | var shaHashRegexp = regexp.MustCompile("^[0-9a-z]{40}$") 24 | 25 | // Splitter splits a text into words 26 | // Highly likely this implementation will change so we are encapsulating. 
27 | type Splitter struct { 28 | fn func(c rune) bool 29 | } 30 | 31 | // Split is the function to split an input into a `[]string` 32 | func (s *Splitter) Split(in string) []string { 33 | return strings.FieldsFunc(in, s.fn) 34 | } 35 | 36 | // NewSplitter creates a new splitter. The input is a string in 37 | // UTF-8 encoding. Each rune in the string will be considered to be a 38 | // valid word character. Runes that are NOT here are deemed a word 39 | // boundary Current implementation uses 40 | // https://golang.org/pkg/strings/#FieldsFunc 41 | func NewSplitter(chars string) *Splitter { 42 | s := Splitter{} 43 | s.fn = (func(c rune) bool { 44 | // break if it's not a letter, and not another special character 45 | return !unicode.IsLetter(c) && -1 == strings.IndexRune(chars, c) 46 | }) 47 | return &s 48 | } 49 | 50 | func isNumber(s string) bool { 51 | return numberRegexp.MatchString(s) 52 | } 53 | 54 | func isNumberBinary(s string) bool { 55 | return numberBinaryRegexp.MatchString(s) 56 | } 57 | 58 | // is word in the form of a "number with units", e.g. "101ms", "3ft", 59 | // "5GB" if true, return the units, if not return empty string This is 60 | // highly English based and not sure how applicable it is to other 61 | // languages. 
62 | func isNumberUnits(s string) string { 63 | // regexp.FindAllStringSubmatch is too confusing 64 | if !numberUnitsRegexp.MatchString(s) { 65 | return "" 66 | } 67 | // Starts with a number 68 | for idx, ch := range s { 69 | if ch >= '0' && ch <= '9' { 70 | continue 71 | } 72 | return s[idx:] 73 | } 74 | panic("assertion failed") 75 | } 76 | 77 | func isNumberHex(s string) bool { 78 | return numberHexRegexp.MatchString(s) 79 | } 80 | 81 | func isHash(s string) bool { 82 | return shaHashRegexp.MatchString(s) 83 | } 84 | 85 | func splitCamelCase(s string) []string { 86 | out := []string{} 87 | 88 | s = strings.Replace(s, "HTTP", "Http", -1) 89 | s = strings.Replace(s, "HTML", "Html", -1) 90 | s = strings.Replace(s, "URL", "Url", -1) 91 | s = strings.Replace(s, "URI", "Uri", -1) 92 | 93 | caps := camelCaseRegexp1.FindAllStringIndex(s, -1) 94 | 95 | // all lower case 96 | if len(caps) == 0 { 97 | return nil 98 | } 99 | 100 | // is only first character capitalized? or is the whole word capitalized 101 | if len(caps) == 1 && caps[0][0] == 0 && (caps[0][1] == 1 || caps[0][1] == len(s)) { 102 | return nil 103 | } 104 | last := 0 105 | for i := 0; i < len(caps); i++ { 106 | if last != caps[i][0] { 107 | out = append(out, s[last:caps[i][0]]) 108 | last = caps[i][0] 109 | } 110 | if caps[i][1]-caps[i][0] > 1 { 111 | out = append(out, s[caps[i][0]:caps[i][1]]) 112 | last = caps[i][1] 113 | } 114 | } 115 | if last < len(s) { 116 | out = append(out, s[last:]) 117 | } 118 | 119 | return out 120 | } 121 | -------------------------------------------------------------------------------- /words_test.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestSplitter(t *testing.T) { 9 | 10 | s := NewSplitter("012345689") 11 | 12 | cases := []struct { 13 | word string 14 | want []string 15 | }{ 16 | {"abc", []string{"abc"}}, 17 | {"abc xyz", []string{"abc", "xyz"}}, 18 | 
{"abc! xyz!", []string{"abc", "xyz"}}, 19 | {"1st 2nd x86 amd64", []string{"1st", "2nd", "x86", "amd64"}}, 20 | } 21 | 22 | for pos, tt := range cases { 23 | got := s.Split(tt.word) 24 | if !reflect.DeepEqual(tt.want, got) { 25 | t.Errorf("%d want %v got %v", pos, tt.want, got) 26 | } 27 | } 28 | } 29 | 30 | func TestIsNumber(t *testing.T) { 31 | 32 | cases := []struct { 33 | word string 34 | want bool 35 | }{ 36 | {"0", true}, 37 | {"00", true}, 38 | {"100", true}, 39 | {"1.", true}, 40 | {"1.0.", true}, 41 | {"1.0.0.", true}, 42 | {"1,0", true}, 43 | {"1-0", true}, 44 | {"1..0", false}, 45 | {"1--0", false}, 46 | {"1..0", false}, 47 | {"1-.0", false}, 48 | {"-1.0", false}, 49 | {",1", false}, 50 | } 51 | for _, tt := range cases { 52 | if isNumber(tt.word) != tt.want { 53 | t.Errorf("%q is not %v", tt.word, tt.want) 54 | } 55 | } 56 | } 57 | 58 | func TestIsNumberUnits(t *testing.T) { 59 | cases := []struct { 60 | word string 61 | want string 62 | }{ 63 | {"0", ""}, 64 | {"xxx", ""}, 65 | {"101a-b-c", ""}, 66 | {"10GB", "GB"}, 67 | {"1G", "G"}, 68 | } 69 | for _, tt := range cases { 70 | if isNumberUnits(tt.word) != tt.want { 71 | t.Errorf("%q is not %v", tt.word, tt.want) 72 | } 73 | } 74 | } 75 | 76 | func TestIsNumberHex(t *testing.T) { 77 | cases := []struct { 78 | word string 79 | want bool 80 | }{ 81 | {"0", false}, 82 | {"0x", false}, 83 | {"x", false}, 84 | {"0x0", true}, 85 | {"0xF", true}, 86 | {"0xf", true}, 87 | {"0xFF", true}, 88 | {"0x12", true}, 89 | {"x12", true}, 90 | {"x86", true}, 91 | {"xabcdef", true}, 92 | {"0xZZ", false}, 93 | } 94 | for _, tt := range cases { 95 | if isNumberHex(tt.word) != tt.want { 96 | t.Errorf("%q is not %v", tt.word, tt.want) 97 | } 98 | } 99 | } 100 | 101 | func TestSplitCamelCase(t *testing.T) { 102 | cases := []struct { 103 | word string 104 | want []string 105 | }{ 106 | {"foo", nil}, // not camel case 107 | {"Foo", nil}, // not camel case 108 | {"FOO", nil}, // not camel case 109 | {"FooBar", []string{"Foo", 
"Bar"}}, 110 | {"fooBar", []string{"foo", "Bar"}}, 111 | {"FOOword", []string{"FOO", "word"}}, 112 | {"isFOO", []string{"is", "FOO"}}, 113 | {"RemoveURL", []string{"Remove", "Url"}}, 114 | } 115 | for _, tt := range cases { 116 | got := splitCamelCase(tt.word) 117 | if !reflect.DeepEqual(tt.want, got) { 118 | t.Errorf("%q : want %v got %v", tt.word, tt.want, got) 119 | } 120 | } 121 | } 122 | --------------------------------------------------------------------------------