├── statefn.go
├── token.go
├── README.md
├── lexer.go
└── matcher.go

/statefn.go:
--------------------------------------------------------------------------------
package easylex

type StateFn func(*Lexer) StateFn
--------------------------------------------------------------------------------
/token.go:
--------------------------------------------------------------------------------
package easylex

import (
	"fmt"
)

type TokenType int

const (
	TokenEOF   TokenType = -1
	TokenError TokenType = -2
)

type Token struct {
	Typ TokenType
	Val string
}

func (t Token) String() string {
	switch t.Typ {
	case TokenError:
		return t.Val
	case TokenEOF:
		return "EOF"
	}
	if len(t.Val) > 23 {
		return fmt.Sprintf("%.10q...%.10q", t.Val, t.Val[len(t.Val)-10:])
	}
	return fmt.Sprintf("%q", t.Val)
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# easylex

Easylex is a Go library designed to simplify the process of writing lexers in Go.
It was developed while working on a lexer and parser for the N3 file format, [gon3](http://github.com/rychipman/gon3).
Easylex aims to be simple, performant, and easily extensible.

## Design

Easylex borrows [Rob Pike's lexer design](https://cuddle.googlecode.com/hg/talk/lex.html), originally used for Go's native templates.
That design, however, becomes cumbersome for more complicated grammars; lexing a language richer than Go's relatively simple templates quickly leads to repetitive, hard-to-read code.
Easylex keeps the state-function model but adds composable `Matcher` values (see `matcher.go`) that encapsulate common matching rules, so state functions stay shorter and easier to read.

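## Usage

A minimal sketch of what a lexer built with easylex might look like. The import path is assumed from the gon3 link above and may differ; the token type, matchers, and state function below are illustrative, not part of the library.

```go
package main

import (
	"fmt"

	"github.com/rychipman/easylex" // assumed import path
)

const tokenWord easylex.TokenType = iota

var (
	whitespace = easylex.NewMatcher().AcceptRunes(" \t\n")
	letters    = easylex.NewMatcher().
			AcceptUnicodeRange('a', 'z').
			AcceptUnicodeRange('A', 'Z')
)

// lexWords skips whitespace and emits one token per run of letters.
func lexWords(l *easylex.Lexer) easylex.StateFn {
	whitespace.MatchRun(l)
	l.Ignore() // discard the whitespace
	if l.Peek() == easylex.EOF {
		l.Emit(easylex.TokenEOF)
		return nil
	}
	if !letters.MatchRun(l) {
		return l.Errorf("unexpected character %q", l.Peek())
	}
	l.Emit(tokenWord)
	return lexWords
}

func main() {
	l := easylex.Lex("hello easylex", lexWords)
	for {
		tok := l.NextToken()
		fmt.Println(tok)
		if tok.Typ == easylex.TokenEOF || tok.Typ == easylex.TokenError {
			break
		}
	}
}
```

Running this prints one quoted token per word, followed by `EOF`.
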
--------------------------------------------------------------------------------
/lexer.go:
--------------------------------------------------------------------------------
package easylex

import (
	"fmt"
	"unicode/utf8"
)

const (
	EOF = rune(-1)
)

// Lexer is a struct that holds all private state
// necessary for lexing.
type Lexer struct {
	input  string
	state  StateFn
	start  int
	pos    int
	width  int
	tokens chan Token
}

// Lex returns a new Lexer instance that will lex the provided
// input starting from the provided state function.
func Lex(input string, state StateFn) *Lexer {
	return &Lexer{
		input:  input,
		state:  state,
		tokens: make(chan Token, 3), // TODO: troubleshoot buffer issues
	}
}

// NextToken returns the next token in the input
// currently being lexed.
func (l *Lexer) NextToken() Token {
	for {
		select {
		case tok := <-l.tokens:
			return tok
		default:
			if l.state == nil {
				// Lexing has finished and no buffered tokens remain.
				return Token{TokenEOF, ""}
			}
			l.state = l.state(l)
		}
	}
}

// Emit queues a token of the given type for retrieval by
// NextToken(). The token value is equal to all the runes
// processed since the last call to Emit() or Ignore().
func (l *Lexer) Emit(t TokenType) {
	l.tokens <- Token{
		t,
		l.input[l.start:l.pos],
	}
	l.start = l.pos
}

// Errorf emits an error token (a token of type TokenError)
// with a value equal to the formatted string.
func (l *Lexer) Errorf(format string, args ...interface{}) StateFn {
	l.tokens <- Token{
		TokenError,
		fmt.Sprintf(format, args...),
	}
	return nil
}

// Next returns one rune and increments l.pos by the
// width of that rune. The width of the last rune
// processed is stored in l.width.
func (l *Lexer) Next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return EOF
	}
	var r rune
	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += l.width
	return r
}

// Backup decrements l.pos by the width of the last rune
// processed. Backup can only be called once per call to
// Next().
func (l *Lexer) Backup() {
	l.pos -= l.width
}

// Ignore resets l.start to the current value of l.pos.
// This discards all the runes processed since the last
// call to Ignore() or Emit().
func (l *Lexer) Ignore() {
	l.start = l.pos
}

// Peek returns the next rune in the input without
// advancing l.pos.
func (l *Lexer) Peek() rune {
	r := l.Next()
	l.Backup()
	return r
}
--------------------------------------------------------------------------------
/matcher.go:
--------------------------------------------------------------------------------
package easylex

import (
	"regexp"
	"strings"
)

type Matcher struct {
	*unionMatcher
}

// NewMatcher creates a new instance of Matcher with default
// behavior (in other words, it will not match anything).
func NewMatcher() *Matcher {
	return &Matcher{
		&unionMatcher{
			[]textMatcher{},
		},
	}
}

// AcceptRunes modifies a Matcher to accept any runes that
// are contained within the provided string.
// The modified Matcher is returned to the caller.
func (m *Matcher) AcceptRunes(valid string) *Matcher {
	// TODO: check up on the implementation details of the rune vs byte slice thing
	r := &runeMatcher{valid}
	m.add(r)
	return m
}

// RejectRunes modifies a Matcher to accept any runes
// that are not contained within the provided string.
// The modified Matcher is returned to the caller.
func (m *Matcher) RejectRunes(invalid string) *Matcher {
	r := &runeFilter{invalid}
	m.add(r)
	return m
}

// AcceptUnicodeRange modifies a Matcher to accept any runes
// that fall between the two provided runes (inclusive).
// The modified Matcher is returned to the caller.
func (m *Matcher) AcceptUnicodeRange(first, last rune) *Matcher {
	u := &unicodeRangeMatcher{first, last}
	m.add(u)
	return m
}

// AcceptString modifies a Matcher to accept a string of
// characters in the input that exactly matches the provided
// string. The modified Matcher is returned to the caller.
func (m *Matcher) AcceptString(exact string) *Matcher {
	p := &prefixMatcher{exact}
	m.add(p)
	return m
}

// AcceptRegex modifies a Matcher to accept input that matches
// the provided regular expression at the current position.
// The modified Matcher is returned to the caller.
func (m *Matcher) AcceptRegex(re *regexp.Regexp) *Matcher {
	r := &regexMatcher{re}
	m.add(r)
	return m
}

// Union modifies a Matcher to accept the set of characters
// equal to the union of the current Matcher's set of
// accepted characters and another Matcher's set of
// accepted characters.
// The modified Matcher is returned to the caller.
func (m *Matcher) Union(other *Matcher) *Matcher {
	m.add(other)
	return m
}
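
// As an illustrative sketch, the builder methods above compose into a
// single Matcher. For example, a matcher for identifiers made of ASCII
// letters, digits, and underscores (the name "ident" is only for
// illustration) might be built like this:
//
//	ident := NewMatcher().
//		AcceptUnicodeRange('a', 'z').
//		AcceptUnicodeRange('A', 'Z').
//		AcceptUnicodeRange('0', '9').
//		AcceptRunes("_")
//
// Each Accept* call widens the set of input the Matcher accepts, and
// Union merges in everything another Matcher accepts.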

// MatchOne accepts the next input rune if that rune conforms
// to the rules currently represented by this Matcher.
// If the next character was accepted, MatchOne returns true.
// If the next character was not accepted, MatchOne returns
// false and the state of the Lexer is left unmodified.
func (m *Matcher) MatchOne(l *Lexer) bool {
	return m.match(l)
}

// MatchRun accepts as many consecutive input characters as
// fit the rules currently represented by this Matcher.
// If at least one character was accepted, MatchRun returns true.
// If no characters were accepted, MatchRun returns false and
// the state of the Lexer is left unmodified.
func (m *Matcher) MatchRun(l *Lexer) bool {
	success := false
	for m.match(l) {
		success = true
	}
	return success
}

// MatchLookAhead accepts the next input sequence if it conforms to
// this Matcher and is immediately followed by input that conforms to
// the lookahead Matcher. Only the input matched by this Matcher is
// consumed; the lookahead input is not. If either match fails, the
// position of the Lexer is left unmodified and false is returned.
func (m *Matcher) MatchLookAhead(l *Lexer, lookahead *Matcher) bool {
	pos := l.pos
	if !m.match(l) {
		return false
	}
	newPos := l.pos
	if !lookahead.match(l) {
		l.pos = pos
		return false
	}
	l.pos = newPos
	return true
}

// MatchLookAheadRun applies MatchLookAhead repeatedly, accepting as
// many consecutive matches as possible. It returns true if at least
// one match was accepted.
func (m *Matcher) MatchLookAheadRun(l *Lexer, lookahead *Matcher) bool {
	success := false
	for m.MatchLookAhead(l, lookahead) {
		success = true
	}
	return success
}

// Peek returns true if the next input sequence conforms
// to the rules currently represented by this Matcher.
// Peek will always leave the position of the Lexer unchanged.
func (m *Matcher) Peek(l *Lexer) bool {
	pos := l.pos
	matched := m.match(l)
	l.pos = pos
	return matched
}

// AssertOne works identically to MatchOne, except it
// will emit an error token if the match fails instead
// of returning a boolean.
func (m *Matcher) AssertOne(l *Lexer, err string, args ...interface{}) {
	success := m.MatchOne(l)
	if !success {
		l.Errorf(err, args...)
	}
}

// AssertRun works identically to MatchRun, except it
// will emit an error token if the match fails instead
// of returning a boolean.
func (m *Matcher) AssertRun(l *Lexer, err string, args ...interface{}) {
	success := m.MatchRun(l)
	if !success {
		l.Errorf(err, args...)
	}
}
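
// A rough sketch of how these matching methods might drive a state
// function (illustrative only; lexNumber and tokenNumber are
// hypothetical names, not part of this package):
//
//	func lexNumber(l *Lexer) StateFn {
//		digits := NewMatcher().AcceptUnicodeRange('0', '9')
//		digits.AssertRun(l, "expected at least one digit")
//		if NewMatcher().AcceptString(".").MatchOne(l) {
//			digits.MatchRun(l)
//		}
//		l.Emit(tokenNumber)
//		return nil
//	}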

// TODO: make textMatcher an exported interface and allow the
// addition of custom matcher modules to a Matcher
type textMatcher interface {
	match(*Lexer) bool
}

type runeMatcher struct {
	valid string
}

func (r *runeMatcher) match(l *Lexer) bool {
	if strings.IndexRune(r.valid, l.Next()) >= 0 {
		return true
	}
	l.Backup()
	return false
}

type runeFilter struct {
	invalid string
}

func (r *runeFilter) match(l *Lexer) bool {
	next := l.Next()
	// EOF is never accepted; without this check a RejectRunes matcher
	// would keep matching indefinitely at the end of the input.
	if next == EOF || strings.IndexRune(r.invalid, next) >= 0 {
		l.Backup()
		return false
	}
	return true
}

type unicodeRangeMatcher struct {
	first rune
	last  rune
}

func (u *unicodeRangeMatcher) match(l *Lexer) bool {
	next := l.Next()
	if next >= u.first && next <= u.last {
		return true
	}
	l.Backup()
	return false
}

type prefixMatcher struct {
	prefix string
}

func (p *prefixMatcher) match(l *Lexer) bool {
	if strings.HasPrefix(l.input[l.pos:], p.prefix) {
		l.pos += len(p.prefix)
		return true
	}
	return false
}

type regexMatcher struct {
	regex *regexp.Regexp
}

func (r *regexMatcher) match(l *Lexer) bool {
	loc := r.regex.FindStringIndex(l.input[l.pos:])
	if loc == nil || loc[0] != 0 {
		return false
	}
	l.pos += loc[1]
	return true
}

type unionMatcher struct {
	matchers []textMatcher
}

func (u *unionMatcher) match(l *Lexer) bool {
	for _, m := range u.matchers {
		if m.match(l) {
			return true
		}
	}
	return false
}

func (u *unionMatcher) add(t textMatcher) {
	u.matchers = append(u.matchers, t)
}
--------------------------------------------------------------------------------