├── .gitignore ├── .spelling.hjson ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── aff.go ├── aff_test.go ├── case.go ├── case_test.go ├── cmd ├── gospell │ └── main.go └── sample │ └── sample.go ├── file.go ├── gospell.go ├── notwords.go ├── notwords_test.go ├── plaintext ├── LICENSE ├── Makefile ├── README.md ├── cmd │ └── plaintext │ │ └── main.go ├── golang.go ├── html.go ├── html_test.go ├── identity.go ├── identity_test.go ├── markdown.go ├── markdown_test.go ├── mime.go ├── script.go ├── script_test.go ├── template.go ├── template_test.go └── text.go ├── words.go └── words_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | 3 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 4 | *.o 5 | *.a 6 | *.so 7 | 8 | # Folders 9 | _obj 10 | _test 11 | 12 | # Architecture specific extensions/prefixes 13 | *.[568vq] 14 | [568vq].out 15 | 16 | *.cgo1.go 17 | *.cgo2.c 18 | _cgo_defun.c 19 | _cgo_gotypes.go 20 | _cgo_export.* 21 | 22 | _testmain.go 23 | 24 | *.exe 25 | *.test 26 | *.prof 27 | -------------------------------------------------------------------------------- /.spelling.hjson: -------------------------------------------------------------------------------- 1 | extra: [ 2 | words/LocalDictionary 3 | ] 4 | additions: [ 5 | AFF 6 | DIC 7 | PFX 8 | SFX 9 | hunspell 10 | UTF 11 | WIP 12 | gospell 13 | io 14 | googlesource 15 | ] 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | language: generic 4 | script: 5 | - make -e docker-ci 6 | 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Nick Galbreath 4 | 5 | Permission is hereby 
granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: install lint test 3 | 4 | install: 5 | go get ./... 6 | go install ./... 7 | 8 | lint: 9 | golint ./... 10 | go vet ./... 11 | find . -name '*.go' | xargs gofmt -w -s 12 | 13 | test: 14 | go test ./... 15 | find . -name '*.go' | xargs misspell 16 | find . -name '*.md' | xargs misspell 17 | 18 | clean: 19 | rm -f *~ cmd/gospell/*~ 20 | go clean ./... 
21 | git gc 22 | 23 | ci: install lint test 24 | 25 | docker-ci: 26 | docker run --rm \ 27 | -e COVERALLS_REPO_TOKEN=$COVERALLS_REPO_TOKEN \ 28 | -v $(PWD):/go/src/github.com/client9/gospell \ 29 | -w /go/src/github.com/client9/gospell \ 30 | nickg/golang-dev-docker \ 31 | make ci 32 | 33 | .PHONY: ci docker-ci 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gospell 2 | [![Build Status](https://travis-ci.org/client9/gospell.svg?branch=master)](https://travis-ci.org/client9/gospell) [![Go Report Card](http://goreportcard.com/badge/client9/gospell)](http://goreportcard.com/report/client9/gospell) [![GoDoc](https://godoc.org/github.com/client9/gospell?status.svg)](https://godoc.org/github.com/client9/gospell) [![Coverage](http://gocover.io/_badge/github.com/client9/gospell)](http://gocover.io/github.com/client9/gospell) [![license](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](https://raw.githubusercontent.com/client9/gospell/master/LICENSE) 3 | 4 | pure golang spelling dictionary based on hunspell dictionaries. 5 | 6 | NOTE: I'm not an expert in linguistics nor spelling. Help is very 7 | welcome! 8 | 9 | ### What is hunspell? 10 | 11 | * http://hunspell.github.io 12 | * https://github.com/hunspell 13 | 14 | NOTE: This is not affiliated with Hunspell although if they wanted to 15 | merge it in as an official project, I'd be happy to donate the code 16 | (although it's in no shape to do so right now). 17 | 18 | ### Where can I get English dictionaries? 19 | 20 | 21 | 22 | The world of spelling dictionaries is surprisingly complicated, as 23 | "lists of words" are frequently proprietary and with conflicting 24 | software licenses. 25 | 26 | 27 | ### Kevin Atkinson 28 | 29 | [Kevin Atkinson](http://www.kevina.org) 30 | maintains many open source lists via 31 | the [SCOWL](http://wordlist.aspell.net) project.
The source code and 32 | raw lists are available on 33 | [GitHub `kevina/wordlist`](https://github.com/kevina/wordlist) 34 | 35 | 36 | #### Marco A.G.Pinto 37 | 38 | Marco maintains the released dictionaries for Firefox and Apache Open 39 | Office. The word lists appear to be actively updated. 40 | 41 | https://github.com/marcoagpinto/aoo-mozilla-en-dict 42 | 43 | #### Open Office 44 | 45 | http://extensions.openoffice.org/en/project/english-dictionaries-apache-openoffice 46 | 47 | The downloaded file has a `.oxt` extension but it's a compressed `tar` 48 | file. Extract the files using: 49 | 50 | ``` 51 | mkdir dict-en 52 | cd dict-en 53 | tar -xzf ../dict-en.oxt 54 | ``` 55 | 56 | #### Chromium 57 | 58 | The Chrome/Chromium browser uses Hunspell and its source tree 59 | contains various up-to-date dictionaries, some with additional words. You can view them at 60 | [chromium.googlesource.com](https://chromium.googlesource.com/chromium/deps/hunspell_dictionaries/+/master) 61 | and you can check them out locally via 62 | 63 | ```bash 64 | git clone --depth=1 https://chromium.googlesource.com/chromium/deps/hunspell_dictionaries 65 | ``` 66 | 67 | More information can be found in the [chromium developer guide](https://www.chromium.org/developers/how-tos/editing-the-spell-checking-dictionaries) 68 | -------------------------------------------------------------------------------- /aff.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "regexp" 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | // AffixType is either an affix prefix or suffix 13 | type AffixType int 14 | 15 | // specific Affix types 16 | const ( 17 | Prefix AffixType = iota 18 | Suffix 19 | ) 20 | 21 | // Affix is a rule for affix (adding prefixes or suffixes) 22 | type Affix struct { 23 | Type AffixType // either PFX or SFX 24 | CrossProduct bool 25 | Rules []Rule 26 | } 27 | 28 | // Expand provides all
variations of a given word based on this affix rule 29 | func (a Affix) Expand(word string, out []string) []string { 30 | for _, r := range a.Rules { 31 | if r.matcher != nil && !r.matcher.MatchString(word) { 32 | continue 33 | } 34 | if a.Type == Prefix { 35 | out = append(out, r.AffixText+word) 36 | // TODO is does Strip apply to prefixes too? 37 | } else { 38 | stripWord := word 39 | if r.Strip != "" && strings.HasSuffix(word, r.Strip) { 40 | stripWord = word[:len(word)-len(r.Strip)] 41 | } 42 | out = append(out, stripWord+r.AffixText) 43 | } 44 | } 45 | return out 46 | } 47 | 48 | // Rule is a Affix rule 49 | type Rule struct { 50 | Strip string 51 | AffixText string // suffix or prefix text to add 52 | Pattern string // original matching pattern from AFF file 53 | matcher *regexp.Regexp // matcher to see if this rule applies or not 54 | } 55 | 56 | // DictConfig is a partial representation of a Hunspell AFF (Affix) file. 57 | type DictConfig struct { 58 | Flag string 59 | TryChars string 60 | WordChars string 61 | NoSuggestFlag rune 62 | IconvReplacements []string 63 | Replacements [][2]string 64 | AffixMap map[rune]Affix 65 | CamelCase int 66 | CompoundMin int 67 | CompoundOnly string 68 | CompoundRule []string 69 | compoundMap map[rune][]string 70 | } 71 | 72 | // Expand expands a word/affix using dictionary/affix rules 73 | // This also supports CompoundRule flags 74 | func (a DictConfig) Expand(wordAffix string, out []string) ([]string, error) { 75 | out = out[:0] 76 | idx := strings.Index(wordAffix, "/") 77 | 78 | // not found 79 | if idx == -1 { 80 | out = append(out, wordAffix) 81 | return out, nil 82 | } 83 | if idx == 0 || idx+1 == len(wordAffix) { 84 | return nil, fmt.Errorf("Slash char found in first or last position") 85 | } 86 | // safe 87 | word, keyString := wordAffix[:idx], wordAffix[idx+1:] 88 | 89 | // check to see if any of the flags are in the 90 | // "compound only". 
If so then nothing to add 91 | compoundOnly := false 92 | for _, key := range keyString { 93 | if strings.IndexRune(a.CompoundOnly, key) != -1 { 94 | compoundOnly = true 95 | continue 96 | } 97 | if _, ok := a.compoundMap[key]; !ok { 98 | // the isn't a compound flag 99 | continue 100 | } 101 | // is a compound flag 102 | a.compoundMap[key] = append(a.compoundMap[key], word) 103 | } 104 | 105 | if compoundOnly { 106 | return out, nil 107 | } 108 | 109 | out = append(out, word) 110 | prefixes := make([]Affix, 0, 5) 111 | suffixes := make([]Affix, 0, 5) 112 | for _, key := range keyString { 113 | // want keyString to []?something? 114 | // then iterate over that 115 | af, ok := a.AffixMap[key] 116 | if !ok { 117 | // is it compound flag? 118 | if _, ok := a.compoundMap[key]; ok { 119 | continue 120 | } 121 | // is it a NoSuggest? 122 | if key == a.NoSuggestFlag { 123 | continue 124 | } 125 | // no idea 126 | return nil, fmt.Errorf("unable to find affix key %v", key) 127 | } 128 | if !af.CrossProduct { 129 | out = af.Expand(word, out) 130 | continue 131 | } 132 | if af.Type == Prefix { 133 | prefixes = append(prefixes, af) 134 | } else { 135 | suffixes = append(suffixes, af) 136 | } 137 | } 138 | 139 | // expand all suffixes with out any prefixes 140 | for _, suf := range suffixes { 141 | out = suf.Expand(word, out) 142 | } 143 | for _, pre := range prefixes { 144 | prewords := pre.Expand(word, nil) 145 | out = append(out, prewords...) 
146 | 147 | // now do cross product 148 | for _, suf := range suffixes { 149 | for _, w := range prewords { 150 | out = suf.Expand(w, out) 151 | } 152 | } 153 | } 154 | return out, nil 155 | } 156 | 157 | func isCrossProduct(val string) (bool, error) { 158 | switch val { 159 | case "Y": 160 | return true, nil 161 | case "N": 162 | return false, nil 163 | } 164 | return false, fmt.Errorf("CrossProduct is not Y or N: got %q", val) 165 | } 166 | 167 | // NewDictConfig reads an Hunspell AFF file 168 | func NewDictConfig(file io.Reader) (*DictConfig, error) { 169 | aff := DictConfig{ 170 | Flag: "ASCII", 171 | AffixMap: make(map[rune]Affix), 172 | compoundMap: make(map[rune][]string), 173 | CompoundMin: 3, // default in Hunspell 174 | } 175 | scanner := bufio.NewScanner(file) 176 | for scanner.Scan() { 177 | line := scanner.Text() 178 | parts := strings.Fields(line) 179 | if len(parts) == 0 { 180 | continue 181 | } 182 | switch parts[0] { 183 | case "#": 184 | continue 185 | case "TRY": 186 | if len(parts) != 2 { 187 | return nil, fmt.Errorf("TRY stanza had %d fields, expected 2", len(parts)) 188 | } 189 | aff.TryChars = parts[1] 190 | case "ICONV": 191 | // if only 2 fields, then its the first stanza that just provides a count 192 | // we don't care, as we dynamically allocate 193 | if len(parts) == 2 { 194 | continue 195 | } 196 | if len(parts) != 3 { 197 | return nil, fmt.Errorf("ICONV stanza had %d fields, expected 2", len(parts)) 198 | } 199 | // we have 3 200 | aff.IconvReplacements = append(aff.IconvReplacements, parts[1], parts[2]) 201 | case "REP": 202 | // if only 2 fields, then its the first stanza that just provides a count 203 | // we don't care, as we dynamically allocate 204 | if len(parts) == 2 { 205 | continue 206 | } 207 | if len(parts) != 3 { 208 | return nil, fmt.Errorf("REP stanza had %d fields, expected 2", len(parts)) 209 | } 210 | // we have 3 211 | aff.Replacements = append(aff.Replacements, [2]string{parts[1], parts[2]}) 212 | case 
"COMPOUNDMIN": 213 | if len(parts) != 2 { 214 | return nil, fmt.Errorf("COMPOUNDMIN stanza had %d fields, expected 2", len(parts)) 215 | } 216 | val, err := strconv.ParseInt(parts[1], 10, 64) 217 | if err != nil { 218 | return nil, fmt.Errorf("COMPOUNDMIN stanza had %q expected number", parts[1]) 219 | } 220 | aff.CompoundMin = int(val) 221 | case "ONLYINCOMPOUND": 222 | if len(parts) != 2 { 223 | return nil, fmt.Errorf("ONLYINCOMPOUND stanza had %d fields, expected 2", len(parts)) 224 | } 225 | aff.CompoundOnly = parts[1] 226 | case "COMPOUNDRULE": 227 | if len(parts) != 2 { 228 | return nil, fmt.Errorf("COMPOUNDRULE stanza had %d fields, expected 2", len(parts)) 229 | } 230 | val, err := strconv.ParseInt(parts[1], 10, 64) 231 | if err == nil { 232 | aff.CompoundRule = make([]string, 0, val) 233 | } else { 234 | aff.CompoundRule = append(aff.CompoundRule, parts[1]) 235 | for _, char := range parts[1] { 236 | if _, ok := aff.compoundMap[char]; !ok { 237 | aff.compoundMap[char] = []string{} 238 | } 239 | } 240 | } 241 | case "NOSUGGEST": 242 | if len(parts) != 2 { 243 | return nil, fmt.Errorf("NOSUGGEST stanza had %d fields, expected 2", len(parts)) 244 | } 245 | // should use runes or parse correctly 246 | chars := []rune(parts[1]) 247 | if len(chars) != 1 { 248 | return nil, fmt.Errorf("NOSUGGEST stanza had more than one flag: %q", parts[1]) 249 | } 250 | aff.NoSuggestFlag = chars[0] 251 | case "WORDCHARS": 252 | if len(parts) != 2 { 253 | return nil, fmt.Errorf("WORDCHAR stanza had %d fields, expected 2", len(parts)) 254 | } 255 | aff.WordChars = parts[1] 256 | case "FLAG": 257 | if len(parts) != 2 { 258 | return nil, fmt.Errorf("FLAG stanza had %d, expected 1", len(parts)) 259 | } 260 | aff.Flag = parts[1] 261 | return nil, fmt.Errorf("FLAG stanza not yet supported") 262 | case "PFX", "SFX": 263 | atype := Prefix 264 | if parts[0] == "SFX" { 265 | atype = Suffix 266 | } 267 | 268 | switch len(parts) { 269 | case 4: 270 | cross, err := isCrossProduct(parts[2]) 
271 | if err != nil { 272 | return nil, err 273 | } 274 | // this is a new Affix! 275 | a := Affix{ 276 | Type: atype, 277 | CrossProduct: cross, 278 | } 279 | flag := rune(parts[1][0]) 280 | aff.AffixMap[flag] = a 281 | case 5: 282 | // does this need to be split out into suffix and prefix? 283 | flag := rune(parts[1][0]) 284 | a, ok := aff.AffixMap[flag] 285 | if !ok { 286 | return nil, fmt.Errorf("Got rules for flag %q but no definition", flag) 287 | } 288 | 289 | strip := "" 290 | if parts[2] != "0" { 291 | strip = parts[2] 292 | } 293 | 294 | var matcher *regexp.Regexp 295 | var err error 296 | pat := parts[4] 297 | if pat != "." { 298 | if a.Type == Prefix { 299 | pat = "^" + pat 300 | } else { 301 | pat = pat + "$" 302 | } 303 | matcher, err = regexp.Compile(pat) 304 | if err != nil { 305 | return nil, fmt.Errorf("Unable to compile %s", pat) 306 | } 307 | } 308 | 309 | a.Rules = append(a.Rules, Rule{ 310 | Strip: strip, 311 | AffixText: parts[3], 312 | Pattern: parts[4], 313 | matcher: matcher, 314 | }) 315 | aff.AffixMap[flag] = a 316 | default: 317 | return nil, fmt.Errorf("%s stanza had %d fields, expected 4 or 5", parts[0], len(parts)) 318 | } 319 | default: 320 | // nothing 321 | } 322 | } 323 | 324 | if err := scanner.Err(); err != nil { 325 | return nil, err 326 | } 327 | 328 | return &aff, nil 329 | } 330 | -------------------------------------------------------------------------------- /aff_test.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "reflect" 5 | "strings" 6 | "testing" 7 | ) 8 | 9 | // SmokeTest for AFF parser. Contains a little bit of everything. 10 | // 11 | func TestAFFSmoke(t *testing.T) { 12 | sample := ` 13 | # 14 | 15 | TRY abc 16 | WORDCHARS 123 17 | ICONV 1 18 | ICONV a b 19 | PFX A Y 1 20 | PFX A 0 re . 
21 | SFX D Y 4 22 | SFX D 0 d e 23 | SFX D y ied [^aeiou]y 24 | SFX D 0 ed [^ey] 25 | SFX D 0 ed [aeiou]y 26 | REP 1 27 | REP a ei 28 | COMPOUNDMIN 2 29 | ` 30 | aff, err := NewDictConfig(strings.NewReader(sample)) 31 | if err != nil { 32 | t.Fatalf("Unable to parse sample: %s", err) 33 | } 34 | 35 | if aff.TryChars != "abc" { 36 | t.Errorf("TRY stanza is %s", aff.TryChars) 37 | } 38 | 39 | if aff.WordChars != "123" { 40 | t.Errorf("WORDCHARS stanza is %s", aff.WordChars) 41 | } 42 | 43 | if aff.CompoundMin != 2 { 44 | t.Errorf("COMPOUNDMIN stanza not processed, want 2 got %d", aff.CompoundMin) 45 | } 46 | 47 | if len(aff.IconvReplacements) != 2 { 48 | t.Errorf("Didn't get ICONV replacement") 49 | } else { 50 | if aff.IconvReplacements[0] != "a" || aff.IconvReplacements[1] != "b" { 51 | t.Errorf("Replacement isnt a->b, got %v", aff.IconvReplacements) 52 | } 53 | } 54 | 55 | if len(aff.Replacements) != 1 { 56 | t.Errorf("Didn't get REPlacement") 57 | } else { 58 | pair := aff.Replacements[0] 59 | if pair[0] != "a" || pair[1] != "ei" { 60 | t.Errorf("Replacement isnt [a ie] got %v", pair) 61 | } 62 | } 63 | 64 | if len(aff.AffixMap) != 2 { 65 | t.Errorf("AffixMap is wrong size") 66 | } 67 | a, ok := aff.AffixMap[rune('A')] 68 | if !ok { 69 | t.Fatalf("Didn't get Affix for A") 70 | } 71 | if a.Type != Prefix { 72 | t.Fatalf("A Affix should be PFX %v, got %v", Prefix, a.Type) 73 | } 74 | if !a.CrossProduct { 75 | t.Fatalf("A Affix should be a cross product") 76 | } 77 | 78 | variations := a.Expand("define", nil) 79 | if len(variations) != 1 { 80 | t.Fatalf("Expected 1 variation got %d", len(variations)) 81 | } 82 | if variations[0] != "redefine" { 83 | t.Errorf("Expected %s got %s", "redefine", variations[0]) 84 | } 85 | 86 | a, ok = aff.AffixMap[rune('D')] 87 | if !ok { 88 | t.Fatalf("Didn't get Affix for D") 89 | } 90 | if a.Type != Suffix { 91 | t.Fatalf("Affix D is not a SFX %v", Suffix) 92 | } 93 | if len(a.Rules) != 4 { 94 | t.Fatalf("Affix should have 4 rules, 
got %d", len(a.Rules)) 95 | } 96 | variations = a.Expand("accept", nil) 97 | if len(variations) != 1 { 98 | t.Fatalf("D Affix should have %d rules, got %d", 1, len(variations)) 99 | } 100 | if variations[0] != "accepted" { 101 | t.Errorf("Expected %s got %s", "accepted", variations[0]) 102 | } 103 | } 104 | 105 | func TestExpand(t *testing.T) { 106 | sample := ` 107 | SET UTF-8 108 | TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ' 109 | 110 | REP 2 111 | REP f ph 112 | REP ph f 113 | 114 | PFX A Y 1 115 | PFX A 0 re . 116 | 117 | SFX B Y 2 118 | SFX B 0 ed [^y] 119 | SFX B y ied y 120 | ` 121 | aff, err := NewDictConfig(strings.NewReader(sample)) 122 | if err != nil { 123 | t.Fatalf("Unable to parse sample: %s", err) 124 | } 125 | 126 | cases := []struct { 127 | word string 128 | want []string 129 | }{ 130 | {"hello", []string{"hello"}}, 131 | {"try/B", []string{"try", "tried"}}, 132 | {"work/AB", []string{"work", "worked", "rework", "reworked"}}, 133 | } 134 | for pos, tt := range cases { 135 | got, err := aff.Expand(tt.word, nil) 136 | if err != nil { 137 | t.Errorf("%d: affix expansions error: %s", pos, err) 138 | } 139 | if !reflect.DeepEqual(tt.want, got) { 140 | t.Errorf("%d: affix expansion want %v got %v", pos, tt.want, got) 141 | } 142 | } 143 | } 144 | 145 | func TestCompound(t *testing.T) { 146 | sampleAff := ` 147 | SET UTF-8 148 | COMPOUNDMIN 1 149 | ONLYINCOMPOUND c 150 | COMPOUNDRULE 2 151 | COMPOUNDRULE n*1t 152 | COMPOUNDRULE n*mp 153 | WORDCHARS 0123456789 154 | ` 155 | sampleDic := `23 156 | 0/nm 157 | 0th/pt 158 | 1/n1 159 | 1st/p 160 | 1th/tc 161 | 2/nm 162 | 2nd/p 163 | 2th/tc 164 | 3/nm 165 | 3rd/p 166 | 3th/tc 167 | 4/nm 168 | 4th/pt 169 | 5/nm 170 | 5th/pt 171 | 6/nm 172 | 6th/pt 173 | 7/nm 174 | 7th/pt 175 | 8/nm 176 | 8th/pt 177 | 9/nm 178 | 9th/pt 179 | ` 180 | aff := strings.NewReader(sampleAff) 181 | dic := strings.NewReader(sampleDic) 182 | gs, err := NewGoSpellReader(aff, dic) 183 | if err != nil { 184 | t.Fatalf("Unable to 
create GoSpell: %s", err) 185 | } 186 | 187 | cases := []struct { 188 | word string 189 | spell bool 190 | }{ 191 | {"0", true}, 192 | {"1", true}, 193 | {"2", true}, 194 | {"3", true}, 195 | {"4", true}, 196 | {"5", true}, 197 | {"6", true}, 198 | {"7", true}, 199 | {"8", true}, 200 | {"9", true}, 201 | {"10", true}, 202 | {"21", true}, 203 | {"32", true}, 204 | {"43", true}, 205 | {"54", true}, 206 | {"65", true}, 207 | {"76", true}, 208 | {"87", true}, 209 | {"98", true}, 210 | {"99", true}, 211 | {"1st", true}, 212 | {"21st", true}, 213 | {"11th", true}, 214 | {"1th", false}, 215 | {"12th", true}, 216 | {"2th", false}, 217 | {"13th", true}, 218 | {"3th", false}, 219 | {"3rd", true}, 220 | {"33rd", true}, 221 | {"4th", true}, 222 | {"5th", true}, 223 | {"6th", true}, 224 | {"7th", true}, 225 | {"8th", true}, 226 | {"9th", true}, 227 | {"14th", true}, 228 | {"15th", true}, 229 | {"16th", true}, 230 | {"17th", true}, 231 | {"18th", true}, 232 | {"19th", true}, 233 | {"111", true}, 234 | {"111st", false}, 235 | {"111th", true}, 236 | } 237 | for pos, tt := range cases { 238 | if gs.Spell(tt.word) != tt.spell { 239 | t.Errorf("%d %q was not %v", pos, tt.word, tt.spell) 240 | } 241 | } 242 | } 243 | 244 | func TestSpell(t *testing.T) { 245 | sampleAff := ` 246 | SET UTF-8 247 | WORDCHARS 0123456789 248 | 249 | PFX A Y 1 250 | PFX A 0 re . 
251 | 252 | SFX B Y 2 253 | SFX B 0 ed [^y] 254 | SFX B y ied y 255 | ` 256 | 257 | sampleDic := `4 258 | hello 259 | try/B 260 | work/AB 261 | GB 262 | ` 263 | aff := strings.NewReader(sampleAff) 264 | dic := strings.NewReader(sampleDic) 265 | gs, err := NewGoSpellReader(aff, dic) 266 | if err != nil { 267 | t.Fatalf("Unable to create GoSpell: %s", err) 268 | } 269 | 270 | cases := []struct { 271 | word string 272 | spell bool 273 | }{ 274 | {"hello", true}, 275 | {"try", true}, 276 | {"tried", true}, 277 | {"work", true}, 278 | {"worked", true}, 279 | {"rework", true}, 280 | {"reworked", true}, 281 | {"junk", false}, 282 | {"100", true}, 283 | {"1", true}, 284 | {"100GB", true}, 285 | {"100mi", false}, 286 | {"0xFF", true}, 287 | {"0x12ff", true}, 288 | } 289 | for pos, tt := range cases { 290 | if gs.Spell(tt.word) != tt.spell { 291 | t.Errorf("%d %q was not %v", pos, tt.word, tt.spell) 292 | } 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /case.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "strings" 5 | "unicode" 6 | ) 7 | 8 | // WordCase is an enum of various word casing styles 9 | type WordCase int 10 | 11 | // Various WordCase types.. 
likely to be not correct 12 | const ( 13 | AllLower WordCase = iota 14 | AllUpper 15 | Title 16 | Mixed 17 | Camel 18 | ) 19 | 20 | // CaseStyle returns what case style a word is in 21 | func CaseStyle(word string) WordCase { 22 | hasTitle := false 23 | upperCount := 0 24 | lowerCount := 0 25 | runeCount := 0 26 | 27 | // this iterates over RUNES not BYTES 28 | for _, r := range word { 29 | // ASCII apostrophe doesn't count 30 | // want words like "don't" to have 31 | // upper case forms when adding to dictionary 32 | if r == 0x0027 { 33 | continue 34 | } 35 | runeCount++ 36 | if unicode.IsLower(r) { 37 | lowerCount++ 38 | continue 39 | } 40 | if unicode.IsUpper(r) { 41 | if runeCount == 1 { 42 | hasTitle = true 43 | } 44 | upperCount++ 45 | continue 46 | } 47 | 48 | //??? 49 | } 50 | 51 | switch { 52 | case runeCount == lowerCount: 53 | return AllLower 54 | case runeCount == upperCount: 55 | return AllUpper 56 | case hasTitle && runeCount-1 == lowerCount: 57 | return Title 58 | default: 59 | return Mixed 60 | } 61 | } 62 | 63 | // CaseVariations returns 64 | // If AllUpper or First-Letter-Only is upcased: add the all upper case version 65 | // If AllLower, add the original, the title and upcase forms 66 | // If Mixed, return the original, and the all upcase form 67 | // 68 | func CaseVariations(word string, style WordCase) []string { 69 | switch style { 70 | case AllLower: 71 | return []string{word, strings.ToUpper(word[0:1]) + word[1:], strings.ToUpper(word)} 72 | case AllUpper: 73 | return []string{strings.ToUpper(word)} 74 | default: 75 | return []string{word, strings.ToUpper(word)} 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /case_test.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestCaseStyle(t *testing.T) { 9 | cases := []struct { 10 | word string 11 | want WordCase 12 | }{ 13 | 
{"lower", AllLower}, 14 | {"what's", AllLower}, 15 | {"UPPER", AllUpper}, 16 | {"Title", Title}, 17 | {"CamelCase", Mixed}, 18 | {"camelCase", Mixed}, 19 | } 20 | 21 | for pos, tt := range cases { 22 | got := CaseStyle(tt.word) 23 | if tt.want != got { 24 | t.Errorf("Case %d %q: want %v got %v", pos, tt.word, tt.want, got) 25 | } 26 | } 27 | } 28 | 29 | func TestCaseVariations(t *testing.T) { 30 | cases := []struct { 31 | word string 32 | want []string 33 | }{ 34 | {"that's", []string{"that's", "That's", "THAT'S"}}, 35 | } 36 | for pos, tt := range cases { 37 | got := CaseVariations(tt.word, CaseStyle(tt.word)) 38 | if !reflect.DeepEqual(tt.want, got) { 39 | t.Errorf("Case %d %q: want %v got %v", pos, tt.word, tt.want, got) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /cmd/gospell/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "flag" 6 | "io/ioutil" 7 | "log" 8 | "os" 9 | "path/filepath" 10 | "text/template" 11 | "time" 12 | 13 | "github.com/client9/gospell" 14 | "github.com/client9/gospell/plaintext" 15 | ) 16 | 17 | var ( 18 | stdout *log.Logger // see below in init() 19 | defaultLog *template.Template 20 | defaultWord *template.Template 21 | defaultLine *template.Template 22 | ) 23 | 24 | const ( 25 | defaultLogTmpl = `{{ .Path }}:{{ .LineNum }}:{{ js .Original }}` 26 | defaultWordTmpl = `{{ .Original }}` 27 | defaultLineTmpl = `{{ .Line }}` 28 | ) 29 | 30 | func init() { 31 | // we see it so it doesn't use a prefix or include a time stamp. 
32 | stdout = log.New(os.Stdout, "", 0) 33 | defaultLog = template.Must(template.New("defaultLog").Parse(defaultLogTmpl)) 34 | defaultWord = template.Must(template.New("defaultWord").Parse(defaultWordTmpl)) 35 | defaultLine = template.Must(template.New("defaultLine").Parse(defaultLineTmpl)) 36 | } 37 | 38 | func main() { 39 | format := flag.String("f", "", "use Golang template for log message") 40 | listOnly := flag.Bool("l", false, "only print unknown word") 41 | lineOnly := flag.Bool("L", false, "print line with unknown word") 42 | 43 | // TODO based on OS (Windows vs. Linux) 44 | dictPath := flag.String("path", ".:/usr/local/share/hunspell:/usr/share/hunspell", "Search path for dictionaries") 45 | 46 | // TODO based on environment variable settings 47 | dicts := flag.String("d", "en_US", "dictionaries to load") 48 | 49 | personalDict := flag.String("p", "", "personal wordlist file") 50 | 51 | flag.Parse() 52 | args := flag.Args() 53 | 54 | if *listOnly { 55 | defaultLog = defaultWord 56 | } 57 | 58 | if *lineOnly { 59 | defaultLog = defaultLine 60 | } 61 | 62 | if len(*format) > 0 { 63 | t, err := template.New("custom").Parse(*format) 64 | if err != nil { 65 | log.Fatalf("Unable to compile log format: %s", err) 66 | } 67 | defaultLog = t 68 | } 69 | 70 | affFile := "" 71 | dicFile := "" 72 | for _, base := range filepath.SplitList(*dictPath) { 73 | affFile = filepath.Join(base, *dicts+".aff") 74 | dicFile = filepath.Join(base, *dicts+".dic") 75 | //log.Printf("Trying %s", affFile) 76 | _, err1 := os.Stat(affFile) 77 | _, err2 := os.Stat(dicFile) 78 | if err1 == nil && err2 == nil { 79 | break 80 | } 81 | affFile = "" 82 | dicFile = "" 83 | } 84 | 85 | if affFile == "" { 86 | log.Fatalf("Unable to load %s", *dicts) 87 | } 88 | 89 | log.Printf("Loading %s %s", affFile, dicFile) 90 | timeStart := time.Now() 91 | h, err := gospell.NewGoSpell(affFile, dicFile) 92 | timeEnd := time.Now() 93 | 94 | // note: 10x too slow 95 | log.Printf("Loaded in %v", 
timeEnd.Sub(timeStart)) 96 | if err != nil { 97 | log.Fatalf("%s", err) 98 | } 99 | 100 | if *personalDict != "" { 101 | raw, err := ioutil.ReadFile(*personalDict) 102 | if err != nil { 103 | log.Fatalf("Unable to load personal dictionary %s: %s", *personalDict, err) 104 | } 105 | duplicates, err := h.AddWordList(bytes.NewReader(raw)) 106 | if err != nil { 107 | log.Fatalf("Unable to process personal dictionary %s: %s", *personalDict, err) 108 | } 109 | if len(duplicates) > 0 { 110 | for _, word := range duplicates { 111 | log.Printf("Word %q in personal dictionary already exists in main dictionary", word) 112 | } 113 | } 114 | } 115 | 116 | // stdin support 117 | if len(args) == 0 { 118 | raw, err := ioutil.ReadAll(os.Stdin) 119 | if err != nil { 120 | log.Fatalf("Unable to read Stdin: %s", err) 121 | } 122 | pt, _ := plaintext.NewIdentity() 123 | out := gospell.SpellFile(h, pt, raw) 124 | for _, diff := range out { 125 | diff.Filename = "stdin" 126 | diff.Path = "" 127 | buf := bytes.Buffer{} 128 | defaultLog.Execute(&buf, diff) 129 | // goroutine-safe print to os.Stdout 130 | stdout.Println(buf.String()) 131 | } 132 | } 133 | for _, arg := range args { 134 | // ignore directories 135 | if f, err := os.Stat(arg); err != nil || f.IsDir() { 136 | continue 137 | } 138 | 139 | raw, err := ioutil.ReadFile(arg) 140 | if err != nil { 141 | log.Fatalf("Unable to read %q: %s", arg, err) 142 | } 143 | pt, err := plaintext.ExtractorByFilename(arg) 144 | if err != nil { 145 | continue 146 | } 147 | out := gospell.SpellFile(h, pt, raw) 148 | for _, diff := range out { 149 | diff.Filename = filepath.Base(arg) 150 | diff.Path = arg 151 | buf := bytes.Buffer{} 152 | defaultLog.Execute(&buf, diff) 153 | // goroutine-safe print to os.Stdout 154 | stdout.Println(buf.String()) 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /cmd/sample/sample.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/naoina/toml" 5 | glob "github.com/ryanuber/go-glob" 6 | "io/ioutil" 7 | "log" 8 | "os" 9 | "path/filepath" 10 | "strings" 11 | 12 | "github.com/client9/gospell" 13 | "github.com/client9/gospell/plaintext" 14 | ) 15 | 16 | // Dictionary is the configuration structure 17 | type Dictionary struct { 18 | Language string `json:"language"` // core dictionary 19 | Extra []string // extra word packs 20 | //Wordlist []string // personal word list files 21 | Additions []string // inline word additions 22 | Removals []string 23 | 24 | FileSet []DictionaryFileSet `json:"fileset"` 25 | } 26 | 27 | // FileSet represents options to select or exclude a group of files 28 | type FileSet struct { 29 | Path string 30 | Include []string 31 | Exclude []string 32 | 33 | Matches []string 34 | } 35 | 36 | // DictionaryFileSet extends FileSet to include other information ont 37 | // the file type 38 | type DictionaryFileSet struct { 39 | FileSet 40 | Charset string 41 | Source string 42 | TemplateType string 43 | } 44 | 45 | func (fs *FileSet) visit(path string, info os.FileInfo, err error) error { 46 | if err != nil { 47 | log.Printf("visitor failed on %q: %s", path, err) 48 | return nil 49 | } 50 | included := false 51 | for _, inc := range fs.Include { 52 | if glob.Glob(inc, path) { 53 | included = true 54 | break 55 | } 56 | } 57 | excluded := false 58 | for _, exc := range fs.Exclude { 59 | if strings.Index(path, exc) != -1 { 60 | excluded = true 61 | break 62 | } 63 | } 64 | if included && !excluded && !info.IsDir() { 65 | fs.Matches = append(fs.Matches, path) 66 | //log.Printf("path allowed: %q", path) 67 | return nil 68 | } 69 | if !included && !excluded { 70 | //log.Printf("path ignored: %q", path) 71 | return nil 72 | } 73 | if excluded && info.IsDir() { 74 | //log.Printf("path ignoring directory %q", path) 75 | return filepath.SkipDir 76 | } 77 | 
//log.Printf("Included then excluded: %q", path) 78 | return nil 79 | } 80 | 81 | func main() { 82 | config, err := ioutil.ReadFile(".spelling.toml") 83 | if err != nil { 84 | log.Fatalf("Unable to reading config: %s", err) 85 | } 86 | //log.Printf("JSON: %s", cson.ToJSON([]byte(config))) 87 | s := Dictionary{} 88 | err = toml.Unmarshal([]byte(config), &s) 89 | if err != nil { 90 | log.Printf("out : %+v", s) 91 | log.Fatalf("err = %v", err) 92 | } 93 | if s.Language == "" { 94 | s.Language = "en_US" 95 | } 96 | if s.Language != "en_US" { 97 | log.Fatalf("Only support en_US: got %q", s.Language) 98 | } 99 | gs, err := gospell.NewGoSpell("/usr/local/share/hunspell/en_US.aff", "/usr/local/share/hunspell/en_US.dic") 100 | if err != nil { 101 | log.Fatalf("Unable to load dictionary: %s", err) 102 | } 103 | 104 | for _, wordfile := range s.Extra { 105 | _, err := gs.AddWordListFile(wordfile) 106 | if err != nil { 107 | log.Printf("Unable to read word list %s: %s", wordfile, err) 108 | } 109 | } 110 | 111 | for _, word := range s.Additions { 112 | log.Printf("Adding %q", word) 113 | gs.AddWordRaw(word) 114 | } 115 | 116 | if len(s.FileSet) == 0 { 117 | s.FileSet = append(s.FileSet, DictionaryFileSet{ 118 | FileSet: FileSet{ 119 | Path: ".", 120 | Include: []string{"*"}, 121 | Exclude: []string{".git"}, 122 | }, 123 | }) 124 | } 125 | finalExit := 0 126 | for _, fs := range s.FileSet { 127 | if fs.Path == "" { 128 | fs.Path = "." 
129 | } 130 | filepath.Walk(fs.Path, fs.visit) 131 | for _, filename := range fs.Matches { 132 | raw, err := ioutil.ReadFile(filename) 133 | if err != nil { 134 | log.Printf("Unable to read %q: %s", filename, err) 135 | finalExit = finalExit | 2 136 | continue 137 | } 138 | pt, err := plaintext.ExtractorByFilename(filename) 139 | if err != nil { 140 | continue 141 | } 142 | out := gospell.SpellFile(gs, pt, raw) 143 | for _, diff := range out { 144 | diff.Filename = filepath.Base(filename) 145 | diff.Path = filename 146 | finalExit = finalExit | 1 147 | log.Printf("Got a %s:%d %s", diff.Path, diff.LineNum, diff.Original) 148 | } 149 | } 150 | } 151 | os.Exit(finalExit) 152 | } 153 | -------------------------------------------------------------------------------- /file.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "github.com/client9/gospell/plaintext" 5 | 6 | "strings" 7 | ) 8 | 9 | // Diff represent a unknown word in a file 10 | type Diff struct { 11 | Filename string 12 | Path string 13 | Original string 14 | Line string 15 | LineNum int 16 | } 17 | 18 | // SpellFile is attempts to spell-check a file. This interface is not 19 | // very good so expect changes. 
20 | func SpellFile(gs *GoSpell, ext plaintext.Extractor, raw []byte) []Diff { 21 | out := []Diff{} 22 | 23 | // remove any golang templates 24 | raw = plaintext.StripTemplate(raw) 25 | 26 | // extract plain text 27 | raw = ext.Text(raw) 28 | 29 | // do character conversion "smart quotes" to quotes, etc 30 | // as specified in the Affix file 31 | rawstring := gs.InputConversion(raw) 32 | 33 | // zap URLS 34 | s := RemoveURL(rawstring) 35 | // zap file paths 36 | s = RemovePath(s) 37 | 38 | for linenum, line := range strings.Split(s, "\n") { 39 | // now get words 40 | words := gs.Split(line) 41 | for _, word := range words { 42 | // HACK 43 | word = strings.Trim(word, "'") 44 | if known := gs.Spell(word); !known { 45 | out = append(out, Diff{ 46 | Line: line, 47 | LineNum: linenum + 1, 48 | Original: word, 49 | }) 50 | } 51 | } 52 | } 53 | return out 54 | } 55 | -------------------------------------------------------------------------------- /gospell.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "log" 8 | "os" 9 | "regexp" 10 | "strconv" 11 | "strings" 12 | ) 13 | 14 | // GoSpell is main struct 15 | type GoSpell struct { 16 | Config DictConfig 17 | Dict map[string]struct{} // likely will contain some value later 18 | 19 | ireplacer *strings.Replacer // input conversion 20 | compounds []*regexp.Regexp 21 | splitter *Splitter 22 | } 23 | 24 | // InputConversion does any character substitution before checking 25 | // This is based on the ICONV stanza 26 | func (s *GoSpell) InputConversion(raw []byte) string { 27 | sraw := string(raw) 28 | if s.ireplacer == nil { 29 | return sraw 30 | } 31 | return s.ireplacer.Replace(sraw) 32 | } 33 | 34 | // Split a text into Words 35 | func (s *GoSpell) Split(text string) []string { 36 | return s.splitter.Split(text) 37 | } 38 | 39 | // AddWordRaw adds a single word to the internal dictionary without modifications 40 | // 
returns true if added 41 | // return false is already exists 42 | func (s *GoSpell) AddWordRaw(word string) bool { 43 | _, ok := s.Dict[word] 44 | if ok { 45 | // already exists 46 | return false 47 | } 48 | s.Dict[word] = struct{}{} 49 | return true 50 | } 51 | 52 | // AddWordListFile reads in a word list file 53 | func (s *GoSpell) AddWordListFile(name string) ([]string, error) { 54 | fd, err := os.Open(name) 55 | if err != nil { 56 | return nil, err 57 | } 58 | defer fd.Close() 59 | return s.AddWordList(fd) 60 | } 61 | 62 | // AddWordList adds basic word lists, just one word per line 63 | // Assumed to be in UTF-8 64 | // TODO: hunspell compatible with "*" prefix for forbidden words 65 | // and affix support 66 | // returns list of duplicated words and/or error 67 | func (s *GoSpell) AddWordList(r io.Reader) ([]string, error) { 68 | var duplicates []string 69 | scanner := bufio.NewScanner(r) 70 | for scanner.Scan() { 71 | line := strings.TrimSpace(scanner.Text()) 72 | if len(line) == 0 || line == "#" { 73 | continue 74 | } 75 | for _, word := range CaseVariations(line, CaseStyle(line)) { 76 | if !s.AddWordRaw(word) { 77 | duplicates = append(duplicates, word) 78 | } 79 | } 80 | } 81 | if err := scanner.Err(); err != nil { 82 | return duplicates, err 83 | } 84 | return duplicates, nil 85 | } 86 | 87 | // Spell checks to see if a given word is in the internal dictionaries 88 | // TODO: add multiple dictionaries 89 | func (s *GoSpell) Spell(word string) bool { 90 | //log.Printf("Checking %s", word) 91 | _, ok := s.Dict[word] 92 | if ok { 93 | return true 94 | } 95 | if isNumber(word) { 96 | return true 97 | } 98 | if isNumberHex(word) { 99 | return true 100 | } 101 | 102 | if isNumberBinary(word) { 103 | return true 104 | } 105 | 106 | if isHash(word) { 107 | return true 108 | } 109 | 110 | // check compounds 111 | for _, pat := range s.compounds { 112 | if pat.MatchString(word) { 113 | return true 114 | } 115 | } 116 | 117 | // Maybe a word with units? e.g. 
100GB 118 | units := isNumberUnits(word) 119 | if units != "" { 120 | // dictionary appears to have list of units 121 | if _, ok = s.Dict[units]; ok { 122 | return true 123 | } 124 | } 125 | 126 | // if camelCase and each word e.g. "camel" "Case" is know 127 | // then the word is considered known 128 | if chunks := splitCamelCase(word); len(chunks) > 0 { 129 | if false { 130 | for _, chunk := range chunks { 131 | if _, ok = s.Dict[chunk]; !ok { 132 | return false 133 | } 134 | } 135 | } 136 | return true 137 | } 138 | 139 | return false 140 | } 141 | 142 | // NewGoSpellReader creates a speller from io.Readers for 143 | // Hunspell files 144 | func NewGoSpellReader(aff, dic io.Reader) (*GoSpell, error) { 145 | affix, err := NewDictConfig(aff) 146 | if err != nil { 147 | return nil, err 148 | } 149 | 150 | scanner := bufio.NewScanner(dic) 151 | // get first line 152 | if !scanner.Scan() { 153 | return nil, scanner.Err() 154 | } 155 | line := scanner.Text() 156 | i, err := strconv.ParseInt(line, 10, 64) 157 | if err != nil { 158 | return nil, err 159 | } 160 | 161 | gs := GoSpell{ 162 | Dict: make(map[string]struct{}, i*5), 163 | compounds: make([]*regexp.Regexp, 0, len(affix.CompoundRule)), 164 | splitter: NewSplitter(affix.WordChars), 165 | } 166 | 167 | words := []string{} 168 | for scanner.Scan() { 169 | line := scanner.Text() 170 | words, err = affix.Expand(line, words) 171 | if err != nil { 172 | return nil, fmt.Errorf("Unable to process %q: %s", line, err) 173 | } 174 | 175 | if len(words) == 0 { 176 | //log.Printf("No words for %s", line) 177 | continue 178 | } 179 | 180 | style := CaseStyle(words[0]) 181 | for _, word := range words { 182 | for _, wordform := range CaseVariations(word, style) { 183 | gs.Dict[wordform] = struct{}{} 184 | } 185 | } 186 | } 187 | 188 | if err := scanner.Err(); err != nil { 189 | return nil, err 190 | } 191 | 192 | for _, compoundRule := range affix.CompoundRule { 193 | pattern := "^" 194 | for _, key := range compoundRule { 195 
| switch key { 196 | case '(', ')', '+', '?', '*': 197 | pattern = pattern + string(key) 198 | default: 199 | groups := affix.compoundMap[key] 200 | pattern = pattern + "(" + strings.Join(groups, "|") + ")" 201 | } 202 | } 203 | pattern = pattern + "$" 204 | pat, err := regexp.Compile(pattern) 205 | if err != nil { 206 | log.Printf("REGEXP FAIL= %q %s", pattern, err) 207 | } else { 208 | gs.compounds = append(gs.compounds, pat) 209 | } 210 | 211 | } 212 | 213 | if len(affix.IconvReplacements) > 0 { 214 | gs.ireplacer = strings.NewReplacer(affix.IconvReplacements...) 215 | } 216 | return &gs, nil 217 | } 218 | 219 | // NewGoSpell from AFF and DIC Hunspell filenames 220 | func NewGoSpell(affFile, dicFile string) (*GoSpell, error) { 221 | aff, err := os.Open(affFile) 222 | if err != nil { 223 | return nil, fmt.Errorf("Unable to open aff: %s", err) 224 | } 225 | defer aff.Close() 226 | dic, err := os.Open(dicFile) 227 | if err != nil { 228 | return nil, fmt.Errorf("Unable to open dic: %s", err) 229 | } 230 | defer dic.Close() 231 | h, err := NewGoSpellReader(aff, dic) 232 | return h, err 233 | } 234 | -------------------------------------------------------------------------------- /notwords.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "bytes" 5 | "strings" 6 | ) 7 | 8 | // Functions to remove non-words such as URLs, file paths, etc. 9 | 10 | // This needs auditing as I believe it is wrong 11 | func enURLChar(c rune) bool { 12 | return (c >= 'a' && c <= 'z') || 13 | (c >= 'A' && c <= 'Z') || 14 | (c >= '0' && c <= '9') || 15 | c == '-' || 16 | c == '_' || 17 | c == '\\' || 18 | c == '.' || 19 | c == ':' || 20 | c == ';' || 21 | c == '/' || 22 | c == '~' || 23 | c == '%' || 24 | c == '*' || 25 | c == '$' || 26 | c == '[' || 27 | c == ']' || 28 | c == '?' || 29 | c == '#' || 30 | c == '!' 
31 | } 32 | func enNotURLChar(c rune) bool { 33 | return !enURLChar(c) 34 | } 35 | 36 | // RemoveURL attempts to strip away obvious URLs 37 | // 38 | func RemoveURL(s string) string { 39 | var idx int 40 | 41 | for { 42 | if idx = strings.Index(s, "http"); idx == -1 { 43 | return s 44 | } 45 | 46 | news := s[:idx] 47 | endx := strings.IndexFunc(s[idx:], enNotURLChar) 48 | if endx != -1 { 49 | news = news + " " + s[idx+endx:] 50 | } 51 | s = news 52 | } 53 | } 54 | 55 | // RemovePath attempts to strip away embedded file system paths, e.g. 56 | // /foo/bar or /static/myimg.png 57 | // 58 | // TODO: windows style 59 | // 60 | func RemovePath(s string) string { 61 | out := bytes.Buffer{} 62 | var idx int 63 | for len(s) > 0 { 64 | if idx = strings.IndexByte(s, '/'); idx == -1 { 65 | out.WriteString(s) 66 | break 67 | } 68 | 69 | if idx > 0 { 70 | idx-- 71 | } 72 | 73 | var chclass string 74 | switch s[idx] { 75 | case '/', ' ', '\n', '\t', '\r': 76 | chclass = " \n\r\t" 77 | case '[': 78 | chclass = "]\n" 79 | case '(': 80 | chclass = ")\n" 81 | default: 82 | out.WriteString(s[:idx+2]) 83 | s = s[idx+2:] 84 | continue 85 | } 86 | 87 | endx := strings.IndexAny(s[idx+1:], chclass) 88 | if endx != -1 { 89 | out.WriteString(s[:idx+1]) 90 | out.Write(bytes.Repeat([]byte{' '}, endx)) 91 | s = s[idx+endx+1:] 92 | } else { 93 | out.WriteString(s) 94 | break 95 | } 96 | } 97 | return out.String() 98 | } 99 | -------------------------------------------------------------------------------- /notwords_test.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestRemovePath(t *testing.T) { 8 | cases := []struct { 9 | word string 10 | want string 11 | }{ 12 | {" /foo/bar abc", " abc"}, 13 | {"X/foo/bar abc", "X/foo/bar abc"}, 14 | {"[/foo/bar] abc", "[ ] abc"}, 15 | {"/", "/"}, 16 | } 17 | for pos, tt := range cases { 18 | got := RemovePath(tt.word) 19 | if got != tt.want { 20 | 
t.Errorf("%d want %q got %q", pos, tt.want, got) 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /plaintext/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Nick Galbreath 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /plaintext/Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: install lint test 3 | 4 | install: 5 | go get ./... 6 | go install ./... 7 | 8 | lint: 9 | golint ./... 10 | go vet ./... 11 | find . -name '*.go' | xargs gofmt -w -s 12 | 13 | test: 14 | go test . 15 | misspell *.md *.go 16 | 17 | clean: 18 | rm -f *~ 19 | go clean ./... 
20 | git gc 21 | 22 | ci: install lint test 23 | 24 | docker-ci: 25 | docker run --rm \ 26 | -e COVERALLS_REPO_TOKEN=$COVERALLS_REPO_TOKEN \ 27 | -v $(PWD):/go/src/github.com/client9/plaintext \ 28 | -w /go/src/github.com/client9/plaintext \ 29 | nickg/golang-dev-docker \ 30 | make ci 31 | 32 | .PHONY: ci docker-ci 33 | -------------------------------------------------------------------------------- /plaintext/README.md: -------------------------------------------------------------------------------- 1 | # plaintext 2 | [![Build Status](https://travis-ci.org/client9/plaintext.svg?branch=master)](https://travis-ci.org/client9/plaintext) [![Go Report Card](http://goreportcard.com/badge/client9/plaintext)](http://goreportcard.com/report/client9/plaintext) [![GoDoc](https://godoc.org/github.com/client9/plaintext?status.svg)](https://godoc.org/github.com/client9/plaintext) [![Coverage](http://gocover.io/_badge/github.com/client9/plaintext)](http://gocover.io/github.com/client9/plaintext) [![license](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](https://raw.githubusercontent.com/client9/plaintext/master/LICENSE) 3 | 4 | Extract human languages in plain UTF-8 text from computer code and markup 5 | 6 | The output is (or should be) *line-preserving*, meaning, no new lines are added or subtracted. 7 | 8 | ```html 9 |

10 | foo 11 |

12 | ``` 13 | 14 | becomes 15 | 16 | ```html 17 | 18 | foo 19 | 20 | ``` 21 | 22 | -------------------------------------------------------------------------------- /plaintext/cmd/plaintext/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "io/ioutil" 6 | "log" 7 | "os" 8 | 9 | "github.com/client9/plaintext" 10 | ) 11 | 12 | func main() { 13 | extension := flag.String("s", "", "over-ride file suffix to determine parser") 14 | flag.Parse() 15 | ext := *extension 16 | if ext != "" && ext[0] != '.' { 17 | ext = "." + ext 18 | } 19 | args := flag.Args() 20 | 21 | // stdin support 22 | if len(args) == 0 { 23 | raw, err := ioutil.ReadAll(os.Stdin) 24 | if err != nil { 25 | log.Fatalf("Unable to read Stdin: %s", err) 26 | } 27 | md, err := plaintext.ExtractorByFilename("stdin" + *extension) 28 | if err != nil { 29 | log.Fatalf("Unable to create parser: %s", err) 30 | } 31 | 32 | raw = plaintext.StripTemplate(raw) 33 | os.Stdout.Write(md.Text(raw)) 34 | } 35 | 36 | for _, arg := range args { 37 | raw, err := ioutil.ReadFile(arg) 38 | if err != nil { 39 | log.Fatalf("Unable to read %q: %s", arg, err) 40 | } 41 | md, err := plaintext.ExtractorByFilename(arg + *extension) 42 | if err != nil { 43 | log.Fatalf("Unable to create parser: %s", err) 44 | } 45 | 46 | raw = plaintext.StripTemplate(raw) 47 | os.Stdout.Write(md.Text(raw)) 48 | os.Stdout.Write([]byte{'\n'}) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /plaintext/golang.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "bytes" 5 | "text/scanner" 6 | ) 7 | 8 | // GolangText extracts plaintext from Golang and other similar C or Java like files 9 | // 10 | // Need to study. 
https://godoc.org/github.com/fluhus/godoc-tricks 11 | // Does not process embedded code blocks 12 | // 13 | type GolangText struct { 14 | } 15 | 16 | // NewGolangText creates a new extractor 17 | func NewGolangText() (*GolangText, error) { 18 | return &GolangText{}, nil 19 | } 20 | 21 | // Text satisfies the Extractor interface 22 | // 23 | //ReplaceGo is a specialized routine for correcting Golang source 24 | // files. Currently only checks comments, not identifiers for 25 | // spelling. 26 | // 27 | // Other items: 28 | // - check strings, but need to ignore 29 | // * import "statements" blocks 30 | // * import ( "blocks" ) 31 | // - skip first comment (line 0) if build comment 32 | // 33 | func (p *GolangText) Text(raw []byte) []byte { 34 | out := bytes.Buffer{} 35 | s := scanner.Scanner{} 36 | s.Init(bytes.NewReader(raw)) 37 | s.Error = (func(s *scanner.Scanner, msg string) {}) 38 | s.Mode = scanner.ScanIdents | scanner.ScanFloats | scanner.ScanChars | scanner.ScanStrings | scanner.ScanRawStrings | scanner.ScanComments 39 | for { 40 | switch s.Scan() { 41 | case scanner.Comment: 42 | out.WriteString(s.TokenText()) 43 | out.WriteByte('\n') 44 | case scanner.EOF: 45 | return out.Bytes() 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /plaintext/html.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "bytes" 5 | "golang.org/x/net/html" 6 | ) 7 | 8 | var blockTag = map[string]struct{}{ 9 | "br": {}, 10 | "h1": {}, 11 | "h2": {}, 12 | "h3": {}, 13 | "h4": {}, 14 | "h5": {}, 15 | "pre": {}, 16 | "li": {}, 17 | "p": {}, 18 | "div": {}, 19 | "blockquote": {}, 20 | } 21 | 22 | func isBlock(tag []byte) bool { 23 | _, ok := blockTag[string(tag)] 24 | return ok 25 | } 26 | 27 | // count number of newlines in a text block 28 | func countNewlines(raw []byte) int { 29 | count := 0 30 | for idx := bytes.IndexByte(raw, '\n'); idx != -1 && idx < 
len(raw); raw = raw[idx:] { 31 | count++ 32 | idx++ 33 | } 34 | return count 35 | } 36 | 37 | // HTMLText extracts plain text from HTML markup 38 | type HTMLText struct { 39 | InspectImageAlt bool 40 | } 41 | 42 | // InspectImageAlt is a sample for options WIP 43 | func InspectImageAlt(opt *HTMLText) error { 44 | opt.InspectImageAlt = true 45 | return nil 46 | } 47 | 48 | // NewHTMLText creates a new HTMLText extractor, using options. 49 | func NewHTMLText(options ...func(*HTMLText) error) (*HTMLText, error) { 50 | extractor := HTMLText{} 51 | for _, option := range options { 52 | err := option(&extractor) 53 | if err != nil { 54 | return nil, err 55 | } 56 | } 57 | return &extractor, nil 58 | } 59 | 60 | // Text satisfies the plaintext.Extractor interface 61 | func (p *HTMLText) Text(raw []byte) []byte { 62 | isCodeTag := false 63 | isStyleTag := false 64 | isScriptTag := false 65 | 66 | out := bytes.Buffer{} 67 | 68 | z := html.NewTokenizer(bytes.NewReader(raw)) 69 | for { 70 | tt := z.Next() 71 | switch tt { 72 | case html.ErrorToken: 73 | return out.Bytes() 74 | case html.StartTagToken: 75 | tn, hasAttr := z.TagName() 76 | if bytes.Equal(tn, []byte("code")) { 77 | isCodeTag = true 78 | continue 79 | } 80 | if bytes.Equal(tn, []byte("style")) { 81 | isStyleTag = true 82 | continue 83 | } 84 | if bytes.Equal(tn, []byte("script")) { 85 | isScriptTag = true 86 | continue 87 | } 88 | if bytes.Equal(tn, []byte("img")) { 89 | var key, val []byte 90 | for hasAttr { 91 | key, val, hasAttr = z.TagAttr() 92 | if len(val) > 0 && bytes.Equal(key, []byte("alt")) { 93 | out.Write(val) 94 | out.Write([]byte(" ")) 95 | } 96 | } 97 | } 98 | case html.EndTagToken: 99 | tn, _ := z.TagName() 100 | if bytes.Equal(tn, []byte("code")) { 101 | isCodeTag = false 102 | continue 103 | } 104 | if bytes.Equal(tn, []byte("style")) { 105 | isStyleTag = false 106 | continue 107 | } 108 | if bytes.Equal(tn, []byte("script")) { 109 | isScriptTag = false 110 | continue 111 | } 112 | case 
html.TextToken: 113 | if isCodeTag || isStyleTag || isScriptTag { 114 | // we want to preserve the line count 115 | out.Write(bytes.Repeat([]byte{'\n'}, countNewlines(z.Text()))) 116 | continue 117 | } 118 | out.Write([]byte(z.Text())) 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /plaintext/html_test.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | func TestHTML(t *testing.T) { 9 | cases := []struct { 10 | text string 11 | want string 12 | }{ 13 | { 14 | `1 15 | 2 16 | 3 20 | 7`, 21 | `1 22 | 2 23 | 3 24 | 25 | 26 | 27 | 7`, 28 | }, 29 | } 30 | for pos, tt := range cases { 31 | mt, err := NewHTMLText() 32 | if err != nil { 33 | t.Fatalf("Unable to run test: %s", err) 34 | } 35 | got := string(mt.Text([]byte(tt.text))) 36 | lenGot := len(strings.Split(got, "\n")) 37 | lenWant := len(strings.Split(tt.want, "\n")) 38 | if lenGot != lenWant { 39 | t.Errorf("Test %d failed: want %d got %d lines ", pos, lenWant, lenGot) 40 | } 41 | if tt.want != got { 42 | t.Errorf("Test %d failed: want %q, got %q", pos, tt.want, got) 43 | } 44 | 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /plaintext/identity.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | // Identity provides a pass-through plain text extractor 4 | type Identity struct { 5 | } 6 | 7 | // NewIdentity creates an identity-extractor 8 | func NewIdentity() (*Identity, error) { 9 | return &Identity{}, nil 10 | } 11 | 12 | // Text satisfies the plaintext.Extractor interface 13 | func (p *Identity) Text(raw []byte) []byte { 14 | return raw 15 | } 16 | -------------------------------------------------------------------------------- /plaintext/identity_test.go: 
-------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | // TestIdentity test is input is passed back 8 | func TestIdentity(t *testing.T) { 9 | p, err := NewIdentity() 10 | if err != nil { 11 | t.Fatalf("unable to run test") 12 | } 13 | raw := []byte("whatever[]<>") 14 | orig := string(raw) 15 | got := string(p.Text(raw)) 16 | if got != orig { 17 | t.Errorf("identity failed") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /plaintext/markdown.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "bytes" 5 | "regexp" 6 | ) 7 | 8 | var allSymbols = regexp.MustCompile("^[ =*|-]*$") 9 | var linkTarget = regexp.MustCompile(`\]\([^ )]*\)?`) 10 | var blockQuote = regexp.MustCompile("^>[ >]*") 11 | var leadingHeadline = regexp.MustCompile("^ *#+ *") 12 | var trailingHeadline = regexp.MustCompile(" *#+ *$") 13 | 14 | // code fences can have leading whitespace apparently 15 | var codeFence = regexp.MustCompile("^\\s*```") 16 | 17 | // single line, single back quote code snippet 18 | // this is the most common case although markdown 19 | // apparently supports ``...`\n\n....`` style multi-line 20 | // to allow embedded back quotes 21 | var simpleCode = regexp.MustCompile("`[^`]+`") 22 | 23 | // MarkdownText extracts plain text from markdown sources 24 | type MarkdownText struct { 25 | Extractor Extractor 26 | } 27 | 28 | // NewMarkdownText creates a new extractor 29 | func NewMarkdownText(options ...func(*MarkdownText) error) (*MarkdownText, error) { 30 | processor := MarkdownText{} 31 | for _, option := range options { 32 | err := option(&processor) 33 | if err != nil { 34 | return nil, err 35 | } 36 | } 37 | 38 | if processor.Extractor == nil { 39 | e, err := NewHTMLText() 40 | if err != nil { 41 | return nil, err 42 | } 43 | processor.Extractor = e 44 | } 45 | 
46 | return &processor, nil 47 | } 48 | 49 | func cleanupLine(s []byte) []byte { 50 | 51 | // strip away various headings from back and front 52 | s = leadingHeadline.ReplaceAll(s, nil) 53 | s = trailingHeadline.ReplaceAll(s, nil) 54 | 55 | // strip away leading "> > > " from block quotes 56 | s = blockQuote.ReplaceAll(s, nil) 57 | 58 | // is all "-", "=", "*", "|" make empty 59 | // this eliminates various HR variations and 60 | // table decoration and is not a word anyways 61 | if allSymbols.Match(s) { 62 | return []byte{} 63 | } 64 | 65 | s = simpleCode.ReplaceAll(s, nil) 66 | 67 | // there is no reason to NOT replace `*` `~` or `_` with a space character 68 | // not used in words 69 | s = bytes.Replace(s, []byte{'*'}, nil, -1) 70 | s = bytes.Replace(s, []byte{'~'}, nil, -1) 71 | s = bytes.Replace(s, []byte{'_'}, nil, -1) 72 | 73 | // links. [link](/MyURI) 74 | // Stuff inside the "link" can be on different lines, but "](/URI)" 75 | // is all on one line so we can delete ](....space ) 76 | // ![ is for images 77 | s = bytes.Replace(s, []byte{'!', '['}, nil, -1) 78 | s = bytes.Replace(s, []byte{'['}, nil, -1) 79 | s = linkTarget.ReplaceAll(s, nil) 80 | return s 81 | } 82 | 83 | // Text extracts text from a markdown source 84 | func (p *MarkdownText) Text(text []byte) []byte { 85 | inCodeFence := false 86 | inCodeIndent := false 87 | 88 | buf := bytes.Buffer{} 89 | lines := bytes.Split(text, []byte{'\n'}) 90 | for pos, line := range lines { 91 | if pos > 0 { 92 | buf.WriteByte('\n') 93 | } 94 | 95 | if codeFence.Match(line) { 96 | inCodeFence = !inCodeFence 97 | continue 98 | } 99 | 100 | if bytes.HasPrefix(line, []byte{' ', ' ', ' ', ' '}) { 101 | inCodeIndent = !inCodeIndent 102 | continue 103 | } 104 | 105 | if !inCodeFence && !inCodeIndent { 106 | buf.Write(cleanupLine(line)) 107 | } 108 | } 109 | return p.Extractor.Text(buf.Bytes()) 110 | } 111 | -------------------------------------------------------------------------------- /plaintext/markdown_test.go: 
-------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestMD(t *testing.T) { 8 | cases := []struct { 9 | text string 10 | want string 11 | }{ 12 | {"\nfoo bar\n", "\nfoo bar\n"}, 13 | {"\nfoo bar\n", "\nfoo bar\n"}, 14 | {"\n\nfoo bar\n", "\n\nfoo bar\n"}, 15 | {"\nfoo\nbar\n", "\nfoo\nbar\n"}, 16 | {"\nfoo\n\nbar\n", "\nfoo\n\nbar\n"}, 17 | {"*italic*", "italic"}, 18 | {"**bold**", "bold"}, 19 | {"_emphasis_", "emphasis"}, 20 | {"**combo _text_**", "combo text"}, 21 | {"~~strike~~", "strike"}, 22 | {"# heading1\nfoo", "heading1\nfoo"}, 23 | 24 | // in-line code should be ignored 25 | {"first `middle` last", "first last"}, 26 | 27 | // auto-links really should be ignore, but they get removed in plain-text tokenizer 28 | {"first http://foobar.com/apple last ", "first http://foobar.com/apple last "}, 29 | 30 | // links 31 | {"foo\n[hello world](http://foobar.com/apple) foo ", "foo\nhello world foo "}, 32 | {"[Visit GitHub!](https://www.github.com)", "Visit GitHub!"}, 33 | 34 | // images 35 | {"![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)", "Image of Yaktocat"}, 36 | {"![GitHub Logo](/images/logo.png)", "GitHub Logo"}, 37 | 38 | // code fence 39 | {"```\ncode\n```\nnotcode", "\n\n\nnotcode"}, 40 | 41 | // indented code fence 42 | {" ```\ncode\n ```\nnotcode", "\n\n\nnotcode"}, 43 | 44 | // blockquote 45 | {"> blockquote1\n> blockquote2\n", "blockquote1\nblockquote2\n"}, 46 | 47 | // entity 48 | {"<", "<"}, 49 | } 50 | 51 | mt, err := NewMarkdownText() 52 | if err != nil { 53 | t.Fatalf("Unable to run test: %s", err) 54 | } 55 | 56 | for pos, tt := range cases { 57 | got := string(mt.Text([]byte(tt.text))) 58 | if tt.want != got { 59 | t.Errorf("Test %d failed: want %q, got %q", pos, tt.want, got) 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /plaintext/mime.go: 
-------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "errors" 5 | "strings" 6 | ) 7 | 8 | // returns the mime type of the full filename if none 9 | func getSuffix(filename string) string { 10 | idx := strings.LastIndex(filename, ".") 11 | if idx == -1 || idx+1 == len(filename) { 12 | return filename 13 | } 14 | return filename[idx+1:] 15 | } 16 | 17 | // ExtractorByFilename returns an plaintext extractor based on 18 | // filename heuristic 19 | func ExtractorByFilename(filename string) (Extractor, error) { 20 | var e Extractor 21 | var err error 22 | switch getSuffix(filename) { 23 | case "md", "markdown": 24 | e, err = NewMarkdownText() 25 | case "html": 26 | e, err = NewHTMLText() 27 | case "go", "h", "c", "java", "hxx", "cxx", "js": 28 | e, err = NewGolangText() 29 | case "py", "sh", "pl", "Makefile", "Dockerfile": 30 | e, err = NewScriptText() 31 | case "txt", "stdin": 32 | e, err = NewIdentity() 33 | default: 34 | err = errors.New("unknown file type") 35 | } 36 | if err != nil { 37 | return nil, err 38 | } 39 | return e, nil 40 | } 41 | -------------------------------------------------------------------------------- /plaintext/script.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "bytes" 5 | ) 6 | 7 | // ScriptText extract plaintext from "generic script" languages 8 | // that use the '#' character to denote a comment line 9 | // It's not so smart. 
10 | // TODO: add support for Ruby, multi-line comment 11 | // http://www.tutorialspoint.com/ruby/ruby_comments.htm 12 | type ScriptText struct { 13 | } 14 | 15 | // NewScriptText creates a new file extractor 16 | func NewScriptText() (*ScriptText, error) { 17 | return &ScriptText{}, nil 18 | } 19 | 20 | // Text extracts plaintext 21 | func (p *ScriptText) Text(text []byte) []byte { 22 | buf := bytes.Buffer{} 23 | lines := bytes.Split(text, []byte{'\n'}) 24 | for pos, line := range lines { 25 | if pos > 0 { 26 | buf.WriteByte('\n') 27 | } 28 | 29 | // BUG: if '#' is in a string 30 | if idx := bytes.IndexByte(line, '#'); idx != -1 { 31 | buf.Write(line[idx:]) 32 | } 33 | } 34 | return buf.Bytes() 35 | } 36 | -------------------------------------------------------------------------------- /plaintext/script_test.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestScript(t *testing.T) { 8 | cases := []struct { 9 | text string 10 | want string 11 | }{ 12 | {"\nfoo1\n# line2bar\nfoo3", "\n\n# line2bar\n"}, 13 | {"\nfoo1\nline2# bar\nfoo3", "\n\n# bar\n"}, 14 | } 15 | mt, err := NewScriptText() 16 | if err != nil { 17 | t.Fatalf("Unable to run test: %s", err) 18 | } 19 | for pos, tt := range cases { 20 | got := string(mt.Text([]byte(tt.text))) 21 | if tt.want != got { 22 | t.Errorf("Test %d failed: want %q, got %q", pos, tt.want, got) 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /plaintext/template.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "regexp" 5 | ) 6 | 7 | // StripTemplate is a WIP on remove golang template markup from a file 8 | func StripTemplate(raw []byte) []byte { 9 | r, err := regexp.Compile(`({{[^}]+}})`) 10 | if err != nil { 11 | panic(err) 12 | } 13 | return r.ReplaceAllLiteral(raw, []byte{0x20}) 14 | } 
15 | -------------------------------------------------------------------------------- /plaintext/template_test.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestStrip(t *testing.T) { 8 | orig := []byte("foo{{ junk }}bar") 9 | want := "foo bar" 10 | got := string(StripTemplate(orig)) 11 | 12 | if got != want { 13 | t.Errorf("Want %q, Got %q", want, got) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /plaintext/text.go: -------------------------------------------------------------------------------- 1 | package plaintext 2 | 3 | // Extractor is an interface for extracting plaintext 4 | type Extractor interface { 5 | Text([]byte) []byte 6 | } 7 | -------------------------------------------------------------------------------- /words.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | "unicode" 7 | ) 8 | 9 | // number form, may include dots, commas and dashes 10 | var numberRegexp = regexp.MustCompile("^([0-9]+[.,-]?)+$") 11 | 12 | // number form with units, e.g. 123ms, 12in 1ft 13 | var numberUnitsRegexp = regexp.MustCompile("^[0-9]+[a-zA-Z]+$") 14 | 15 | // 0x12FF or 0x1B or x12FF 16 | // does anyone use 0XFF ?? 17 | var numberHexRegexp = regexp.MustCompile("^0?[x][0-9A-Fa-f]+$") 18 | 19 | var numberBinaryRegexp = regexp.MustCompile("^0[b][01]+$") 20 | 21 | var camelCaseRegexp1 = regexp.MustCompile("[A-Z]+") 22 | 23 | var shaHashRegexp = regexp.MustCompile("^[0-9a-z]{40}$") 24 | 25 | // Splitter splits a text into words 26 | // Highly likely this implementation will change so we are encapsulating. 
27 | type Splitter struct { 28 | fn func(c rune) bool 29 | } 30 | 31 | // Split is the function to split an input into a `[]string` 32 | func (s *Splitter) Split(in string) []string { 33 | return strings.FieldsFunc(in, s.fn) 34 | } 35 | 36 | // NewSplitter creates a new splitter. The input is a string in 37 | // UTF-8 encoding. Each rune in the string will be considered to be a 38 | // valid word character. Runes that are NOT here are deemed a word 39 | // boundary Current implementation uses 40 | // https://golang.org/pkg/strings/#FieldsFunc 41 | func NewSplitter(chars string) *Splitter { 42 | s := Splitter{} 43 | s.fn = (func(c rune) bool { 44 | // break if it's not a letter, and not another special character 45 | return !unicode.IsLetter(c) && -1 == strings.IndexRune(chars, c) 46 | }) 47 | return &s 48 | } 49 | 50 | func isNumber(s string) bool { 51 | return numberRegexp.MatchString(s) 52 | } 53 | 54 | func isNumberBinary(s string) bool { 55 | return numberBinaryRegexp.MatchString(s) 56 | } 57 | 58 | // is word in the form of a "number with units", e.g. "101ms", "3ft", 59 | // "5GB" if true, return the units, if not return empty string This is 60 | // highly English based and not sure how applicable it is to other 61 | // languages. 
62 | func isNumberUnits(s string) string { 63 | // regexp.FindAllStringSubmatch is too confusing 64 | if !numberUnitsRegexp.MatchString(s) { 65 | return "" 66 | } 67 | // Starts with a number 68 | for idx, ch := range s { 69 | if ch >= '0' && ch <= '9' { 70 | continue 71 | } 72 | return s[idx:] 73 | } 74 | panic("assertion failed") 75 | } 76 | 77 | func isNumberHex(s string) bool { 78 | return numberHexRegexp.MatchString(s) 79 | } 80 | 81 | func isHash(s string) bool { 82 | return shaHashRegexp.MatchString(s) 83 | } 84 | 85 | func splitCamelCase(s string) []string { 86 | out := []string{} 87 | 88 | s = strings.Replace(s, "HTTP", "Http", -1) 89 | s = strings.Replace(s, "HTML", "Html", -1) 90 | s = strings.Replace(s, "URL", "Url", -1) 91 | s = strings.Replace(s, "URI", "Uri", -1) 92 | 93 | caps := camelCaseRegexp1.FindAllStringIndex(s, -1) 94 | 95 | // all lower case 96 | if len(caps) == 0 { 97 | return nil 98 | } 99 | 100 | // is only first character capitalized? or is the whole word capitalized 101 | if len(caps) == 1 && caps[0][0] == 0 && (caps[0][1] == 1 || caps[0][1] == len(s)) { 102 | return nil 103 | } 104 | last := 0 105 | for i := 0; i < len(caps); i++ { 106 | if last != caps[i][0] { 107 | out = append(out, s[last:caps[i][0]]) 108 | last = caps[i][0] 109 | } 110 | if caps[i][1]-caps[i][0] > 1 { 111 | out = append(out, s[caps[i][0]:caps[i][1]]) 112 | last = caps[i][1] 113 | } 114 | } 115 | if last < len(s) { 116 | out = append(out, s[last:]) 117 | } 118 | 119 | return out 120 | } 121 | -------------------------------------------------------------------------------- /words_test.go: -------------------------------------------------------------------------------- 1 | package gospell 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestSplitter(t *testing.T) { 9 | 10 | s := NewSplitter("012345689") 11 | 12 | cases := []struct { 13 | word string 14 | want []string 15 | }{ 16 | {"abc", []string{"abc"}}, 17 | {"abc xyz", []string{"abc", "xyz"}}, 18 | 
{"abc! xyz!", []string{"abc", "xyz"}}, 19 | {"1st 2nd x86 amd64", []string{"1st", "2nd", "x86", "amd64"}}, 20 | } 21 | 22 | for pos, tt := range cases { 23 | got := s.Split(tt.word) 24 | if !reflect.DeepEqual(tt.want, got) { 25 | t.Errorf("%d want %v got %v", pos, tt.want, got) 26 | } 27 | } 28 | } 29 | 30 | func TestIsNumber(t *testing.T) { 31 | 32 | cases := []struct { 33 | word string 34 | want bool 35 | }{ 36 | {"0", true}, 37 | {"00", true}, 38 | {"100", true}, 39 | {"1.", true}, 40 | {"1.0.", true}, 41 | {"1.0.0.", true}, 42 | {"1,0", true}, 43 | {"1-0", true}, 44 | {"1..0", false}, 45 | {"1--0", false}, 46 | {"1..0", false}, 47 | {"1-.0", false}, 48 | {"-1.0", false}, 49 | {",1", false}, 50 | } 51 | for _, tt := range cases { 52 | if isNumber(tt.word) != tt.want { 53 | t.Errorf("%q is not %v", tt.word, tt.want) 54 | } 55 | } 56 | } 57 | 58 | func TestIsNumberUnits(t *testing.T) { 59 | cases := []struct { 60 | word string 61 | want string 62 | }{ 63 | {"0", ""}, 64 | {"xxx", ""}, 65 | {"101a-b-c", ""}, 66 | {"10GB", "GB"}, 67 | {"1G", "G"}, 68 | } 69 | for _, tt := range cases { 70 | if isNumberUnits(tt.word) != tt.want { 71 | t.Errorf("%q is not %v", tt.word, tt.want) 72 | } 73 | } 74 | } 75 | 76 | func TestIsNumberHex(t *testing.T) { 77 | cases := []struct { 78 | word string 79 | want bool 80 | }{ 81 | {"0", false}, 82 | {"0x", false}, 83 | {"x", false}, 84 | {"0x0", true}, 85 | {"0xF", true}, 86 | {"0xf", true}, 87 | {"0xFF", true}, 88 | {"0x12", true}, 89 | {"x12", true}, 90 | {"x86", true}, 91 | {"xabcdef", true}, 92 | {"0xZZ", false}, 93 | } 94 | for _, tt := range cases { 95 | if isNumberHex(tt.word) != tt.want { 96 | t.Errorf("%q is not %v", tt.word, tt.want) 97 | } 98 | } 99 | } 100 | 101 | func TestSplitCamelCase(t *testing.T) { 102 | cases := []struct { 103 | word string 104 | want []string 105 | }{ 106 | {"foo", nil}, // not camel case 107 | {"Foo", nil}, // not camel case 108 | {"FOO", nil}, // not camel case 109 | {"FooBar", []string{"Foo", 
"Bar"}}, 110 | {"fooBar", []string{"foo", "Bar"}}, 111 | {"FOOword", []string{"FOO", "word"}}, 112 | {"isFOO", []string{"is", "FOO"}}, 113 | {"RemoveURL", []string{"Remove", "Url"}}, 114 | } 115 | for _, tt := range cases { 116 | got := splitCamelCase(tt.word) 117 | if !reflect.DeepEqual(tt.want, got) { 118 | t.Errorf("%q : want %v got %v", tt.word, tt.want, got) 119 | } 120 | } 121 | } 122 | --------------------------------------------------------------------------------