├── statefn.go
├── token.go
├── README.md
├── lexer.go
└── matcher.go

/statefn.go:
--------------------------------------------------------------------------------
package easylex

type StateFn func(*Lexer) StateFn
--------------------------------------------------------------------------------
/token.go:
--------------------------------------------------------------------------------
package easylex

import (
	"fmt"
)

type TokenType int

const (
	TokenEOF   TokenType = -1
	TokenError TokenType = -2
)

type Token struct {
	Typ TokenType
	Val string
}

func (t Token) String() string {
	switch t.Typ {
	case TokenError:
		return t.Val
	case TokenEOF:
		return "EOF"
	}
	if len(t.Val) > 23 {
		return fmt.Sprintf("%.10q...%.10q", t.Val, t.Val[len(t.Val)-10:])
	}
	return fmt.Sprintf("%q", t.Val)
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# easylex

Easylex is a Go library designed to simplify the process of writing lexers in Go.
It was developed while working on a lexer and parser for the N3 file format, [gon3](http://github.com/rychipman/gon3).
Easylex aims to be simple, performant, and easily extensible.

## Design

Easylex borrows [Rob Pike's lexer design](https://cuddle.googlecode.com/hg/talk/lex.html), originally used for Go's native templates.
That design, however, becomes cumbersome for more complicated grammars; lexing a language richer than Go's relatively simple templates quickly leads to repetitive, hard-to-read code.
Easylex keeps the state-function model but adds composable `Matcher` values (see `matcher.go`) that encapsulate common matching rules, so state functions stay shorter and easier to read.

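## Usage

A minimal sketch of what a lexer built with easylex might look like. The import path is assumed from the gon3 link above and may differ; the token type, matchers, and state function below are illustrative, not part of the library.

```go
package main

import (
	"fmt"

	"github.com/rychipman/easylex" // assumed import path
)

const tokenWord easylex.TokenType = iota

var (
	whitespace = easylex.NewMatcher().AcceptRunes(" \t\n")
	letters    = easylex.NewMatcher().
			AcceptUnicodeRange('a', 'z').
			AcceptUnicodeRange('A', 'Z')
)

// lexWords skips whitespace and emits one token per run of letters.
func lexWords(l *easylex.Lexer) easylex.StateFn {
	whitespace.MatchRun(l)
	l.Ignore() // discard the whitespace
	if l.Peek() == easylex.EOF {
		l.Emit(easylex.TokenEOF)
		return nil
	}
	if !letters.MatchRun(l) {
		return l.Errorf("unexpected character %q", l.Peek())
	}
	l.Emit(tokenWord)
	return lexWords
}

func main() {
	l := easylex.Lex("hello easylex", lexWords)
	for {
		tok := l.NextToken()
		fmt.Println(tok)
		if tok.Typ == easylex.TokenEOF || tok.Typ == easylex.TokenError {
			break
		}
	}
}
```

Running this prints one quoted token per word, followed by `EOF`.
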
--------------------------------------------------------------------------------
/lexer.go:
--------------------------------------------------------------------------------
package easylex

import (
	"fmt"
	"unicode/utf8"
)

const (
	EOF = rune(-1)
)

// Lexer is a struct that holds all private state
// necessary for lexing.
type Lexer struct {
	input  string
	state  StateFn
	start  int
	pos    int
	width  int
	tokens chan Token
}

// Lex returns a new Lexer instance that will lex the provided
// input starting from the provided state function.
func Lex(input string, state StateFn) *Lexer {
	return &Lexer{
		input:  input,
		state:  state,
		tokens: make(chan Token, 3), // TODO: troubleshoot buffer issues
	}
}

// NextToken returns the next token in the input
// currently being lexed.
func (l *Lexer) NextToken() Token {
	for {
		select {
		case tok := <-l.tokens:
			return tok
		default:
			if l.state == nil {
				// Lexing has finished and no buffered tokens remain.
				return Token{TokenEOF, ""}
			}
			l.state = l.state(l)
		}
	}
}

// Emit queues a token of the given type for retrieval by
// NextToken(). The token value is equal to all the runes
// processed since the last call to Emit() or Ignore().
func (l *Lexer) Emit(t TokenType) {
	l.tokens <- Token{
		t,
		l.input[l.start:l.pos],
	}
	l.start = l.pos
}

// Errorf emits an error token (a token of type TokenError)
// with a value equal to the formatted string.
func (l *Lexer) Errorf(format string, args ...interface{}) StateFn {
	l.tokens <- Token{
		TokenError,
		fmt.Sprintf(format, args...),
	}
	return nil
}

// Next returns one rune and increments l.pos by the
// width of that rune. The width of the last rune
// processed is stored in l.width.
func (l *Lexer) Next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return EOF
	}
	var r rune
	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += l.width
	return r
}

// Backup decrements l.pos by the width of the last rune
// processed. Backup can only be called once per call to
// Next().
func (l *Lexer) Backup() {
	l.pos -= l.width
}

// Ignore resets l.start to the current value of l.pos.
// This discards all the runes processed since the last
// call to Ignore() or Emit().
func (l *Lexer) Ignore() {
	l.start = l.pos
}

// Peek returns the next rune in the input without
// advancing l.pos.
func (l *Lexer) Peek() rune {
	r := l.Next()
	l.Backup()
	return r
}
--------------------------------------------------------------------------------
/matcher.go:
--------------------------------------------------------------------------------
package easylex

import (
	"regexp"
	"strings"
)

type Matcher struct {
	*unionMatcher
}

// NewMatcher creates a new instance of Matcher with default
// behavior (in other words, it will not match anything).
func NewMatcher() *Matcher {
	return &Matcher{
		&unionMatcher{
			[]textMatcher{},
		},
	}
}

// AcceptRunes modifies a Matcher to accept any runes that
// are contained within the provided string.
// The modified Matcher is returned to the caller.
func (m *Matcher) AcceptRunes(valid string) *Matcher {
	// TODO: check up on the implementation details of the rune vs byte slice thing
	r := &runeMatcher{valid}
	m.add(r)
	return m
}

// RejectRunes modifies a Matcher to accept any runes
// that are not contained within the provided string.
// The modified Matcher is returned to the caller.
func (m *Matcher) RejectRunes(invalid string) *Matcher {
	r := &runeFilter{invalid}
	m.add(r)
	return m
}

// AcceptUnicodeRange modifies a Matcher to accept any runes
// that fall between the two provided runes (inclusive).
// The modified Matcher is returned to the caller.
func (m *Matcher) AcceptUnicodeRange(first, last rune) *Matcher {
	u := &unicodeRangeMatcher{first, last}
	m.add(u)
	return m
}

// AcceptString modifies a Matcher to accept a string of
// characters in the input that exactly matches the provided
// string. The modified Matcher is returned to the caller.
func (m *Matcher) AcceptString(exact string) *Matcher {
	p := &prefixMatcher{exact}
	m.add(p)
	return m
}

// AcceptRegex modifies a Matcher to accept input that matches
// the provided regular expression at the current position.
// The modified Matcher is returned to the caller.
func (m *Matcher) AcceptRegex(re *regexp.Regexp) *Matcher {
	r := &regexMatcher{re}
	m.add(r)
	return m
}

// Union modifies a Matcher to accept the set of characters
// equal to the union of the current Matcher's set of
// accepted characters and another Matcher's set of
// accepted characters.
// The modified Matcher is returned to the caller.
func (m *Matcher) Union(other *Matcher) *Matcher {
	m.add(other)
	return m
}
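
// As an illustrative sketch, the builder methods above compose into a
// single Matcher. For example, a matcher for identifiers made of ASCII
// letters, digits, and underscores (the name "ident" is only for
// illustration) might be built like this:
//
//	ident := NewMatcher().
//		AcceptUnicodeRange('a', 'z').
//		AcceptUnicodeRange('A', 'Z').
//		AcceptUnicodeRange('0', '9').
//		AcceptRunes("_")
//
// Each Accept* call widens the set of input the Matcher accepts, and
// Union merges in everything another Matcher accepts.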

// MatchOne accepts the next input rune if that rune conforms
// to the rules currently represented by this Matcher.
// If the next character was accepted, MatchOne returns true.
// If the next character was not accepted, MatchOne returns
// false and the state of the Lexer is left unmodified.
func (m *Matcher) MatchOne(l *Lexer) bool {
	return m.match(l)
}

// MatchRun accepts as many consecutive input characters as
// fit the rules currently represented by this Matcher.
// If at least one character was accepted, MatchRun returns true.
// If no characters were accepted, MatchRun returns false and
// the state of the Lexer is left unmodified.
func (m *Matcher) MatchRun(l *Lexer) bool {
	success := false
	for m.match(l) {
		success = true
	}
	return success
}

// MatchLookAhead accepts the next input sequence if it conforms to
// this Matcher and is immediately followed by input that conforms to
// the lookahead Matcher. Only the input matched by this Matcher is
// consumed; the lookahead input is not. If either match fails, the
// position of the Lexer is left unmodified and false is returned.
func (m *Matcher) MatchLookAhead(l *Lexer, lookahead *Matcher) bool {
	pos := l.pos
	if !m.match(l) {
		return false
	}
	newPos := l.pos
	if !lookahead.match(l) {
		l.pos = pos
		return false
	}
	l.pos = newPos
	return true
}

// MatchLookAheadRun applies MatchLookAhead repeatedly, accepting as
// many consecutive matches as possible. It returns true if at least
// one match was accepted.
func (m *Matcher) MatchLookAheadRun(l *Lexer, lookahead *Matcher) bool {
	success := false
	for m.MatchLookAhead(l, lookahead) {
		success = true
	}
	return success
}

// Peek returns true if the next input sequence conforms
// to the rules currently represented by this Matcher.
// Peek will always leave the position of the Lexer unchanged.
func (m *Matcher) Peek(l *Lexer) bool {
	pos := l.pos
	matched := m.match(l)
	l.pos = pos
	return matched
}

// AssertOne works identically to MatchOne, except it
// will emit an error token if the match fails instead
// of returning a boolean.
func (m *Matcher) AssertOne(l *Lexer, err string, args ...interface{}) {
	success := m.MatchOne(l)
	if !success {
		l.Errorf(err, args...)
	}
}

// AssertRun works identically to MatchRun, except it
// will emit an error token if the match fails instead
// of returning a boolean.
func (m *Matcher) AssertRun(l *Lexer, err string, args ...interface{}) {
	success := m.MatchRun(l)
	if !success {
		l.Errorf(err, args...)
	}
}
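
// A rough sketch of how these matching methods might drive a state
// function (illustrative only; lexNumber and tokenNumber are
// hypothetical names, not part of this package):
//
//	func lexNumber(l *Lexer) StateFn {
//		digits := NewMatcher().AcceptUnicodeRange('0', '9')
//		digits.AssertRun(l, "expected at least one digit")
//		if NewMatcher().AcceptString(".").MatchOne(l) {
//			digits.MatchRun(l)
//		}
//		l.Emit(tokenNumber)
//		return nil
//	}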

// TODO: make textMatcher an exported interface and allow the
// addition of custom matcher modules to a Matcher
type textMatcher interface {
	match(*Lexer) bool
}

type runeMatcher struct {
	valid string
}

func (r *runeMatcher) match(l *Lexer) bool {
	if strings.IndexRune(r.valid, l.Next()) >= 0 {
		return true
	}
	l.Backup()
	return false
}

type runeFilter struct {
	invalid string
}

func (r *runeFilter) match(l *Lexer) bool {
	next := l.Next()
	// EOF is never accepted; without this check a RejectRunes matcher
	// would keep matching indefinitely at the end of the input.
	if next == EOF || strings.IndexRune(r.invalid, next) >= 0 {
		l.Backup()
		return false
	}
	return true
}

type unicodeRangeMatcher struct {
	first rune
	last  rune
}

func (u *unicodeRangeMatcher) match(l *Lexer) bool {
	next := l.Next()
	if next >= u.first && next <= u.last {
		return true
	}
	l.Backup()
	return false
}

type prefixMatcher struct {
	prefix string
}

func (p *prefixMatcher) match(l *Lexer) bool {
	if strings.HasPrefix(l.input[l.pos:], p.prefix) {
		l.pos += len(p.prefix)
		return true
	}
	return false
}

type regexMatcher struct {
	regex *regexp.Regexp
}

func (r *regexMatcher) match(l *Lexer) bool {
	loc := r.regex.FindStringIndex(l.input[l.pos:])
	if loc == nil || loc[0] != 0 {
		return false
	}
	l.pos += loc[1]
	return true
}

type unionMatcher struct {
	matchers []textMatcher
}

func (u *unionMatcher) match(l *Lexer) bool {
	for _, m := range u.matchers {
		if m.match(l) {
			return true
		}
	}
	return false
}

func (u *unionMatcher) add(t textMatcher) {
	u.matchers = append(u.matchers, t)
}
--------------------------------------------------------------------------------