├── Lexer.py
├── Parser.py
└── README.md

/Lexer.py:
--------------------------------------------------------------------------------
import re
#The lexer takes an ordered list of token templates and uses it to generate tokens.
#Each kind of token has a name, a regular expression, and an optional lambda function
#(applied to the matched string) that produces token.value.
#To ignore a token, pass in a process function that returns None.


#Class for tokens
class Token(object):
    #name = name of the token
    #value = the data stored in this token, based on the matched string
    #start/end = where the token started/ended in the string
    def __init__(self, name, value, start, end, line, col):
        self.name = name
        self.value = value
        self.start = start
        self.end = end
        self.line = line
        self.col = col

    #Tokens compare equal to their name string
    def __eq__(self, other):
        return self.name == other

    #__hash__ must be consistent with __eq__
    def __hash__(self):
        return hash(self.name)

    #Prints the name and value
    def __repr__(self):
        return '(' + self.name + ', "' + str(self.value) + '")'
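
#Because __eq__ compares against the name, a Token can be tested directly against the
#terminal name strings used in grammar rules (Parser.py relies on this when shifting), e.g.:
## Token('int', 42, 0, 2, 1, 1) == 'int'   #-> True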

#Class for templates of tokens
class Token_template(object):
    #name = name of the produced token, lowercase
    #regexp = regular expression recognizing the token
    #process = lambda function for processing the matched string into the token value
    def __init__(self, name, regexp, process=None):
        self.name = name
        self.regexp = re.compile(regexp)
        self.process = process

    #Returns the first token matched in string, starting at position start
    def match(self, string, start, line, col):
        #Create an re.match object against the string
        matched = self.regexp.match(string, start)
        #Return False if nothing matches
        if not matched:
            return False
        #Keep track of where the token ends so it can be used as the next start position
        end = matched.end()
        #If the template has a process function, use it to build the value.
        #Otherwise keep the matched string.
        if self.process:
            value = self.process(matched.group())
        else:
            value = matched.group()
        #Look for newline characters inside the matched string and update line/col as well
        newline = False
        for c in matched.group():
            if c == '\n':
                line += 1
                col = 1
                newline = True
            elif newline:
                #Count columns for characters that follow the last newline
                col += 1
        #Make a new token with the extracted arguments
        return Token(self.name, value, start, end, line, col)

#Initializes a token template
def temp(name, regexp, process=None):
    return Token_template(name, regexp, process)

#Actual lexer: takes the input string and a list of templates
def lex(string, lexer):
    start = 0
    tokens = []
    line = 1
    col = 1
    #If the empty string is passed, return an empty list
    if string == '':
        return []
    #Keep looping until no more of the string is left
    while True:
        valid = False

        for tp in lexer:
            #Search through every token template looking for a match
            token = tp.match(string, start, line, col)
            #Go to the next template if there is no match
            if not token:
                continue
            #If the matched token isn't ignored, add it to the list
            if token.value is not None:
                tokens.append(token)
            #Update the column, line, and start values of the lexer
            start = token.end
            valid = True
            if token.line != line:
                #The token spanned a newline, so continue from the column it ended on
                col = token.col
            else:
                col += token.end - token.start
            line = token.line
            break

        #If the last pass yielded no token, the string is not viable
        if not valid:
            raise Exception("Token error at position " + str(col) + " on line " + str(line) + '.')
        #If we are at the end of the string, return the finished list of tokens
        if start == len(string):
            return tokens
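
#A minimal usage sketch: a three-template lexer for integers, '+', and spaces, where
#whitespace is recognized but dropped by returning None from its process function.
#(The template names and regexes here are only illustrative.)
if __name__ == '__main__':
    example_lexer = [temp('int', '[0-9]+', lambda s: int(s)),
                     temp('add', r'\+'),
                     temp('space', ' +', lambda s: None)]
    print(lex('1 + 23', example_lexer))
    #-> [(int, "1"), (add, "+"), (int, "23")]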


--------------------------------------------------------------------------------
/Parser.py:
--------------------------------------------------------------------------------
from collections import deque
from Lexer import *

#########################################################################################################################
#Grammar: a dictionary of left-side symbols mapped to lists of rules, for faster access
#
#Grammar rule structure: [ equivalent token, [ token1, token2, ... ], [tree name], tree process, antilookahead ]
#   Terminals are lowercase strings, non-terminals are strings starting with an uppercase letter
#   S -> A b   becomes   ['S', ['A', 'b'], ['start'], lambda a: a[0] + a[1], []]
#   Make rules with the rule() procedure
#   Anti-lookaheads signify the tokens before which the rule DOESN'T apply. Used to set precedence.
#Parse tree: -A parse state keeps a list of the tokens it has collected as it shifts. [0] is the name of the tree.
#            -A lambda function, declared in the rule, takes the token list and processes it into a finished tree.
#             This is done when the state goes through reduction and when the start state is output.
#            -lambda p: (p[0], p[2]) on the rule P -> a b c  yields  (name, b.value)
#Parse states: [ left symbol, deque[seen tokens], deque[unseen tokens], origin position, token list, process, antilookahead ]
#   S -> A . b  from 3   is   ['S', deque(['A']), deque(['b']), 3, [name, A], lambda function, ['c']]



#Creates a grammar rule using a string for the rewrite rule, plus parameters for making the parse tree
def rule(rstring, tname, tprocess, antilookahead=[]):
    rule = []
    #Split the string into terminals/non-terminals.
    #The first symbol is the left side of the rewrite rule, the rest are the right side.
    rstring = rstring.split()
    rule.append(rstring.pop(0))
    rule.append(rstring)
    #Add the tree name and process for future tree-making
    rule.append([tname])
    rule.append(tprocess)
    rule.append(antilookahead)
    return rule
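
##Illustrative sketch of how anti-lookaheads can express precedence (the token names
##'add' and 'mul' here are hypothetical): the first rule below refuses to reduce when
##the next token is 'mul', so a pending multiplication is built before the addition.
##
##  rule('EXP EXP add EXP', 'add', lambda p: (p[0], p[1], p[3]), antilookahead=['mul'])
##  rule('EXP EXP mul EXP', 'mul', lambda p: (p[0], p[1], p[3]))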

def parse(grammar, chart, tokens, startrule):

    #Turn a tree (list of tokens) into a finished parse tree segment via process (a lambda function)
    def make_tree(tree, process):
        return process(tree)

    #Append val to the chart entry at curpos without repeats
    def addto(curpos, val):
        ref = val[:4]
        if ref not in reference[curpos]:
            chart[curpos].append(val)
            reference[curpos].append(ref)

    #Add all grammar rules whose left side matches the pending non-terminal to the current chart position
    def closure(grammar, chart, token, curpos):
        for rule in grammar[token]:
            #Create an initialized parsing state for each rule whose left side equals the pending non-terminal.
            #Mutable components are copied; the tree starts as a copy of the rule's tree name.
            state = [rule[0], deque([]), deque(rule[1]), curpos, list(rule[2]), rule[3], rule[4]]
            addto(curpos, state)

    #Generates the next state from the current state and the next element to add to the tree
    #Used in shifting and reduction
    def nextstate(state, element):
        #Create the next parsing state, copying the mutable lists
        nextstate = [state[0], deque(state[1]), deque(state[2]), state[3], list(state[4]), state[5], state[6]]
        #Move the first unseen token to the end of the seen tokens
        shifted = nextstate[2].popleft()
        nextstate[1].append(shifted)

        #Add the element (a token value or a finished subtree) to the tree
##        if type(element) == Token:
##            nextstate[4].append(element.value)
##        else:
        nextstate[4].append(element)
        return nextstate

    #Match terminal tokens and advance the parsing state into the next chart position
    def shift(tokens, chart, state, curpos):
        #If the current token matches the next unseen token of the parsing state
        if tokens[curpos] == state[2][0]:
            #Generate the next state from the current state, adding the current token's value to the tree
            addto(curpos + 1, nextstate(state, tokens[curpos].value))

    #Complete the non-terminal of a finished parsing state and add the advanced states to the current position
    def reduction(origin, chart, equal, curpos, tree):
        #Go back to the origin chart position to look for the origin states
        for state in chart[origin]:
            #If the state isn't finished and its pending token is the desired non-terminal
            if state[2] and state[2][0] == equal:
                #Generate the next state from the origin state and add it to the chart
                addto(curpos, nextstate(state, tree))

    #Create an alternate version of the chart used by addto() as a reference
    reference = {}
    #End marker to prevent shifting outside of the token list at the end
    endline, endpos = tokens[-1].line, tokens[-1].col
    tokens.append(Token("endmarker", 'eof', -1, -1, endline, endpos))
    #Initialize the chart positions as lists, add the starting rule to chart[0]
    for n in range(len(tokens) + 1):
        chart[n] = []
        reference[n] = []
    chart[0].append([startrule[0], [], deque(startrule[1]), 0, startrule[2], startrule[3], startrule[4]])

    for curpos in range(len(tokens) + 1):
        #If the current position is empty, no state has shifted successfully and the string is invalid
        if chart[curpos] == []:
            curtoken = tokens[curpos - 1]
            raise Exception('Unexpected ' + str(curtoken.value) + ' at line ' + str(curtoken.line) + ' position ' + str(curtoken.col) + '.')

        #For each state in the current chart position. The loop will include new states added by closure.
        for state in chart[curpos]:
            #Variables for the components of the current parsing state
            equal = state[0]
            seen = state[1]
            unseen = state[2]
            origin = state[3]
            tree = state[4]
            process = state[5]
            antilookahead = state[6]

            #If we are at the end of the tokens and the state we started with is finished,
            #the string is valid and its tree is returned
            if curpos == len(tokens) - 1 and equal == startrule[0] and unseen == deque([]) and origin == 0:
                return make_tree(tree, process)

            #If the state is finished and the next token isn't an anti-lookahead,
            #finish its tree and run reduction with the finished tree
            if not unseen:
                if tokens[curpos] not in antilookahead:
                    tree = make_tree(tree, process)
                    reduction(origin, chart, equal, curpos, tree)
                else:
                    continue
            #If the state's pending token is a non-terminal (first letter capitalized), run closure
            elif 'A' <= unseen[0][0] <= 'Z':
                closure(grammar, chart, unseen[0], curpos)
            #If the state's pending token is a terminal, run shifting
            else:
                shift(tokens, chart, state, curpos)

##############################################################################
#Test grammars and lexers
##
##
##grammar = {'S':[rule('S S STM',None,lambda p: [p[2]]+p[1]),
##                rule('S ',None,lambda p: [])],
##           'CALL':[rule('CALL ID OPTARG','call',lambda p: (p[0],p[1],p[2]))],
##           'ID':[rule('ID word',None,lambda p: p[1])],
##           'OPTARG':[rule('OPTARG pl ARGS pr',None,lambda p: p[2]),
##                     rule('OPTARG pl pr',None,lambda p: [])],
##           'ARGS':[rule('ARGS EXP comma ARGS',None,lambda p: [p[1]]+p[3]),
##                   rule('ARGS EXP',None,lambda p: [p[1]])],
##           'EXP':[rule('EXP int','int',lambda p: (p[0],p[1])),
##                  rule('EXP EXP add EXP','add',lambda p: (p[0],p[1],p[3])),
##                  rule('EXP EXP minus EXP','minus',lambda p: (p[0],p[1],p[3])),
##                  rule('EXP CALL',None,lambda p: p[1])],
##           'STM':[rule('STM EXP scolon','statement',lambda p: (p[0],p[1]))]}
##chart = {}
##
##
##lexer = [temp('word','[A-Za-z]+'),
##         temp('pl',r'\('),
##         temp('pr',r'\)'),
##         temp('int','[1-9][0-9]*',lambda a: int(a)),
##         temp('comma',','),
##         temp('space',' +',lambda a: None),
##         temp('newline','\n',lambda a: None),
##         temp('add',r'\+'),
##         temp('scolon',';'),
##         temp('minus',r'\-')]
##
##string = '''proc (1+2-3,4);
##            proc (1+2-3,4);'''
##
##print(parse(grammar, {}, lex(string, lexer), grammar['S'][0]))

##############################################################################


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Lexer-and-Parser
A complete parser generator which tokenizes the input string before creating an abstract syntax tree by processing the tokens with a context-free grammar. Tokens are defined with regular expressions (Python's re module), and the parser itself is an implementation of Earley's parsing algorithm. Only the Parser.py file needs to be imported, though functions from both files are used to build a complete parser. The lex function reads a string and turns it into a list of Token objects, and the parse function takes that list of tokens and returns a custom abstract syntax tree, so the two functions must be used in that order.
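
A minimal usage sketch (the token names and grammar here are only illustrative):

```python
from Parser import *

#Token templates: name, regular expression, and an optional value-processing function
lexer = [temp('int', '[0-9]+', lambda s: int(s)),
         temp('add', r'\+'),
         temp('space', ' +', lambda s: None)]   #recognized but dropped

#Grammar: EXP -> EXP add EXP | int, with lambdas that build the tree nodes
grammar = {'EXP': [rule('EXP EXP add EXP', 'add', lambda p: (p[0], p[1], p[3])),
                   rule('EXP int', 'int', lambda p: (p[0], p[1]))]}

tokens = lex('1 + 2', lexer)
tree = parse(grammar, {}, tokens, grammar['EXP'][0])
print(tree)   #-> ('add', ('int', 1), ('int', 2))
```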

DISCLAIMER: If a specific token needs to be recognized by the lexer (from Lexer.py) but omitted from its output, pass a function as the "process" parameter of the template definition whose return value is None.

--------------------------------------------------------------------------------