├── Lexer.py
├── Parser.py
└── README.md

/Lexer.py:
--------------------------------------------------------------------------------
import re
#The lexer takes an ordered list of token templates and uses it to generate tokens.
#Each kind of token has a name, a regular expression, and an optional lambda function
#(applied to the matched string) that produces token.value.
#To ignore a token, pass in a process function that returns None.


#Class for tokens
class Token(object):
    #name = name of the token
    #value = the data stored in this token, based on the matched string
    #start/end = where the token started/ended in the string
    def __init__(self, name, value, start, end, line, col):
        self.name = name
        self.value = value
        self.start = start
        self.end = end
        self.line = line
        self.col = col

    #Tokens compare equal to their name string
    def __eq__(self, other):
        return self.name == other

    #__hash__ must be consistent with __eq__
    def __hash__(self):
        return hash(self.name)

    #Prints the name and value
    def __repr__(self):
        return '(' + self.name + ', "' + str(self.value) + '")'
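
#Because __eq__ compares against the name, a Token can be tested directly against the
#terminal name strings used in grammar rules (Parser.py relies on this when shifting), e.g.:
## Token('int', 42, 0, 2, 1, 1) == 'int'   #-> True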

#Class for templates of tokens
class Token_template(object):
    #name = name of the produced token, lowercase
    #regexp = regular expression recognizing the token
    #process = lambda function for processing the matched string into the token value
    def __init__(self, name, regexp, process=None):
        self.name = name
        self.regexp = re.compile(regexp)
        self.process = process

    #Returns the first token matched in string, starting at position start
    def match(self, string, start, line, col):
        #Create an re.match object against the string
        matched = self.regexp.match(string, start)
        #Return False if nothing matches
        if not matched:
            return False
        #Keep track of where the token ends so it can be used as the next start position
        end = matched.end()
        #If the template has a process function, use it to build the value.
        #Otherwise keep the matched string.
        if self.process:
            value = self.process(matched.group())
        else:
            value = matched.group()
        #Look for newline characters inside the matched string and update line/col as well
        newline = False
        for c in matched.group():
            if c == '\n':
                line += 1
                col = 1
                newline = True
            elif newline:
                #Count columns for characters that follow the last newline
                col += 1
        #Make a new token with the extracted arguments
        return Token(self.name, value, start, end, line, col)

#Initializes a token template
def temp(name, regexp, process=None):
    return Token_template(name, regexp, process)

#Actual lexer: takes the input string and a list of templates
def lex(string, lexer):
    start = 0
    tokens = []
    line = 1
    col = 1
    #If the empty string is passed, return an empty list
    if string == '':
        return []
    #Keep looping until no more of the string is left
    while True:
        valid = False

        for tp in lexer:
            #Search through every token template looking for a match
            token = tp.match(string, start, line, col)
            #Go to the next template if there is no match
            if not token:
                continue
            #If the matched token isn't ignored, add it to the list
            if token.value is not None:
                tokens.append(token)
            #Update the column, line, and start values of the lexer
            start = token.end
            valid = True
            if token.line != line:
                #The token spanned a newline, so continue from the column it ended on
                col = token.col
            else:
                col += token.end - token.start
            line = token.line
            break

        #If the last pass yielded no token, the string is not viable
        if not valid:
            raise Exception("Token error at position " + str(col) + " on line " + str(line) + '.')
        #If we are at the end of the string, return the finished list of tokens
        if start == len(string):
            return tokens
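
#A minimal usage sketch: a three-template lexer for integers, '+', and spaces, where
#whitespace is recognized but dropped by returning None from its process function.
#(The template names and regexes here are only illustrative.)
if __name__ == '__main__':
    example_lexer = [temp('int', '[0-9]+', lambda s: int(s)),
                     temp('add', r'\+'),
                     temp('space', ' +', lambda s: None)]
    print(lex('1 + 23', example_lexer))
    #-> [(int, "1"), (add, "+"), (int, "23")]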


--------------------------------------------------------------------------------
/Parser.py:
--------------------------------------------------------------------------------
from collections import deque
from Lexer import *

#########################################################################################################################
#Grammar: a dictionary of left-side symbols mapped to lists of rules, for faster access
#
#Grammar rule structure: [ equivalent token, [ token1, token2, ... ], [tree name], tree process, antilookahead ]
#   Terminals are lowercase strings, non-terminals are strings starting with an uppercase letter
#   S -> A b   becomes   ['S', ['A', 'b'], ['start'], lambda a: a[0] + a[1], []]
#   Make rules with the rule() procedure
#   Anti-lookaheads signify the tokens before which the rule DOESN'T apply. Used to set precedence.
#Parse tree: -A parse state keeps a list of the tokens it has collected as it shifts. [0] is the name of the tree.
#            -A lambda function, declared in the rule, takes the token list and processes it into a finished tree.
#             This is done when the state goes through reduction and when the start state is output.
#            -lambda p: (p[0], p[2]) on the rule P -> a b c  yields  (name, b.value)
#Parse states: [ left symbol, deque[seen tokens], deque[unseen tokens], origin position, token list, process, antilookahead ]
#   S -> A . b  from 3   is   ['S', deque(['A']), deque(['b']), 3, [name, A], lambda function, ['c']]



#Creates a grammar rule using a string for the rewrite rule, plus parameters for making the parse tree
def rule(rstring, tname, tprocess, antilookahead=[]):
    rule = []
    #Split the string into terminals/non-terminals.
    #The first symbol is the left side of the rewrite rule, the rest are the right side.
    rstring = rstring.split()
    rule.append(rstring.pop(0))
    rule.append(rstring)
    #Add the tree name and process for future tree-making
    rule.append([tname])
    rule.append(tprocess)
    rule.append(antilookahead)
    return rule
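
##Illustrative sketch of how anti-lookaheads can express precedence (the token names
##'add' and 'mul' here are hypothetical): the first rule below refuses to reduce when
##the next token is 'mul', so a pending multiplication is built before the addition.
##
##  rule('EXP EXP add EXP', 'add', lambda p: (p[0], p[1], p[3]), antilookahead=['mul'])
##  rule('EXP EXP mul EXP', 'mul', lambda p: (p[0], p[1], p[3]))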

def parse(grammar, chart, tokens, startrule):

    #Turn a tree (list of tokens) into a finished parse tree segment via process (a lambda function)
    def make_tree(tree, process):
        return process(tree)

    #Append val to the chart entry at curpos without repeats
    def addto(curpos, val):
        ref = val[:4]
        if ref not in reference[curpos]:
            chart[curpos].append(val)
            reference[curpos].append(ref)

    #Add all grammar rules whose left side matches the pending non-terminal to the current chart position
    def closure(grammar, chart, token, curpos):
        for rule in grammar[token]:
            #Create an initialized parsing state for each rule whose left side equals the pending non-terminal.
            #Mutable components are copied; the tree starts as a copy of the rule's tree name.
            state = [rule[0], deque([]), deque(rule[1]), curpos, list(rule[2]), rule[3], rule[4]]
            addto(curpos, state)

    #Generates the next state from the current state and the next element to add to the tree
    #Used in shifting and reduction
    def nextstate(state, element):
        #Create the next parsing state, copying the mutable lists
        nextstate = [state[0], deque(state[1]), deque(state[2]), state[3], list(state[4]), state[5], state[6]]
        #Move the first unseen token to the end of the seen tokens
        shifted = nextstate[2].popleft()
        nextstate[1].append(shifted)

        #Add the element (a token value or a finished subtree) to the tree
##        if type(element) == Token:
##            nextstate[4].append(element.value)
##        else:
        nextstate[4].append(element)
        return nextstate

    #Match terminal tokens and advance the parsing state into the next chart position
    def shift(tokens, chart, state, curpos):
        #If the current token matches the next unseen token of the parsing state
        if tokens[curpos] == state[2][0]:
            #Generate the next state from the current state, adding the current token's value to the tree
            addto(curpos + 1, nextstate(state, tokens[curpos].value))

    #Complete the non-terminal of a finished parsing state and add the advanced states to the current position
    def reduction(origin, chart, equal, curpos, tree):
        #Go back to the origin chart position to look for the origin states
        for state in chart[origin]:
            #If the state isn't finished and its pending token is the desired non-terminal
            if state[2] and state[2][0] == equal:
                #Generate the next state from the origin state and add it to the chart
                addto(curpos, nextstate(state, tree))

    #Create an alternate version of the chart used by addto() as a reference
    reference = {}
    #End marker to prevent shifting outside of the token list at the end
    endline, endpos = tokens[-1].line, tokens[-1].col
    tokens.append(Token("endmarker", 'eof', -1, -1, endline, endpos))
    #Initialize the chart positions as lists, add the starting rule to chart[0]
    for n in range(len(tokens) + 1):
        chart[n] = []
        reference[n] = []
    chart[0].append([startrule[0], [], deque(startrule[1]), 0, startrule[2], startrule[3], startrule[4]])

    for curpos in range(len(tokens) + 1):
        #If the current position is empty, no state has shifted successfully and the string is invalid
        if chart[curpos] == []:
            curtoken = tokens[curpos - 1]
            raise Exception('Unexpected ' + str(curtoken.value) + ' at line ' + str(curtoken.line) + ' position ' + str(curtoken.col) + '.')

        #For each state in the current chart position. The loop will include new states added by closure.
        for state in chart[curpos]:
            #Variables for the components of the current parsing state
            equal = state[0]
            seen = state[1]
            unseen = state[2]
            origin = state[3]
            tree = state[4]
            process = state[5]
            antilookahead = state[6]

            #If we are at the end of the tokens and the state we started with is finished,
            #the string is valid and its tree is returned
            if curpos == len(tokens) - 1 and equal == startrule[0] and unseen == deque([]) and origin == 0:
                return make_tree(tree, process)

            #If the state is finished and the next token isn't an anti-lookahead,
            #finish its tree and run reduction with the finished tree
            if not unseen:
                if tokens[curpos] not in antilookahead:
                    tree = make_tree(tree, process)
                    reduction(origin, chart, equal, curpos, tree)
                else:
                    continue
            #If the state's pending token is a non-terminal (first letter capitalized), run closure
            elif 'A' <= unseen[0][0] <= 'Z':
                closure(grammar, chart, unseen[0], curpos)
            #If the state's pending token is a terminal, run shifting
            else:
                shift(tokens, chart, state, curpos)

##############################################################################
#Test grammars and lexers
##
##
##grammar = {'S':[rule('S S STM',None,lambda p: [p[2]]+p[1]),
##                rule('S ',None,lambda p: [])],
##           'CALL':[rule('CALL ID OPTARG','call',lambda p: (p[0],p[1],p[2]))],
##           'ID':[rule('ID word',None,lambda p: p[1])],
##           'OPTARG':[rule('OPTARG pl ARGS pr',None,lambda p: p[2]),
##                     rule('OPTARG pl pr',None,lambda p: [])],
##           'ARGS':[rule('ARGS EXP comma ARGS',None,lambda p: [p[1]]+p[3]),
##                   rule('ARGS EXP',None,lambda p: [p[1]])],
##           'EXP':[rule('EXP int','int',lambda p: (p[0],p[1])),
##                  rule('EXP EXP add EXP','add',lambda p: (p[0],p[1],p[3])),
##                  rule('EXP EXP minus EXP','minus',lambda p: (p[0],p[1],p[3])),
##                  rule('EXP CALL',None,lambda p: p[1])],
##           'STM':[rule('STM EXP scolon','statement',lambda p: (p[0],p[1]))]}
##chart = {}
##
##
##lexer = [temp('word','[A-Za-z]+'),
##         temp('pl',r'\('),
##         temp('pr',r'\)'),
##         temp('int','[1-9][0-9]*',lambda a: int(a)),
##         temp('comma',','),
##         temp('space',' +',lambda a: None),
##         temp('newline','\n',lambda a: None),
##         temp('add',r'\+'),
##         temp('scolon',';'),
##         temp('minus',r'\-')]
##
##string = '''proc (1+2-3,4);
##            proc (1+2-3,4);'''
##
##print(parse(grammar, {}, lex(string, lexer), grammar['S'][0]))

##############################################################################


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Lexer-and-Parser
A complete parser generator which tokenizes the input string before creating an abstract syntax tree by processing the tokens with a context-free grammar. Tokens are defined with regular expressions (Python's re module), and the parser itself is an implementation of Earley's parsing algorithm. Only the Parser.py file needs to be imported, though functions from both files are used to build a complete parser. The lex function reads a string and turns it into a list of Token objects, and the parse function takes that list of tokens and returns a custom abstract syntax tree, so the two functions must be used in that order.
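
A minimal usage sketch (the token names and grammar here are only illustrative):

```python
from Parser import *

#Token templates: name, regular expression, and an optional value-processing function
lexer = [temp('int', '[0-9]+', lambda s: int(s)),
         temp('add', r'\+'),
         temp('space', ' +', lambda s: None)]   #recognized but dropped

#Grammar: EXP -> EXP add EXP | int, with lambdas that build the tree nodes
grammar = {'EXP': [rule('EXP EXP add EXP', 'add', lambda p: (p[0], p[1], p[3])),
                   rule('EXP int', 'int', lambda p: (p[0], p[1]))]}

tokens = lex('1 + 2', lexer)
tree = parse(grammar, {}, tokens, grammar['EXP'][0])
print(tree)   #-> ('add', ('int', 1), ('int', 2))
```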

DISCLAIMER: If a specific token needs to be recognized by the lexer (from Lexer.py) but omitted from its output, pass a function as the "process" parameter of the template definition whose return value is None.

--------------------------------------------------------------------------------