├── README
└── cssparser.py

/README:
--------------------------------------------------------------------------------
Some samples of functional parsers written using funcparserlib
(http://code.google.com/p/funcparserlib/).
--------------------------------------------------------------------------------
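For readers new to funcparserlib, the samples share a two-stage shape: a regex-based tokenizer built with make_tokenizer, and a grammar assembled from parser combinators (some, a, many, skip, >>). The toy parser below is only a sketch of that style and is not part of the repository; the grammar (a comma-separated list of integers) and names such as num_list are purely illustrative.

# A minimal sketch of the funcparserlib style used by these samples
# (illustrative only, not part of the repository).
from funcparserlib.lexer import make_tokenizer, Token
from funcparserlib.parser import some, a, many, skip, finished

def tokenize(s):
    specs = [
        ('SPACE', (r'[ \t]+',)),
        ('NUMBER', (r'[0-9]+',)),
        ('COMMA', (r',',)),
    ]
    t = make_tokenizer(specs)
    return [tok for tok in t(s) if tok.type != 'SPACE']

def parse(tokens):
    # A NUMBER token mapped to an int, and a skipped comma separator
    number = some(lambda tok: tok.type == 'NUMBER') >> (lambda tok: int(tok.value))
    comma = skip(a(Token('COMMA', ',')))
    # number (',' number)*  ->  a plain Python list of ints
    num_list = number + many(comma + number) >> (lambda pair: [pair[0]] + pair[1])
    return (num_list + skip(finished)).parse(tokens)

print parse(tokenize('1, 2, 3'))   # [1, 2, 3]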
/cssparser.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
A CSS parser using funcparserlib.
"""
import sys
from pprint import pformat

from funcparserlib.lexer import make_tokenizer, Token, LexerError

from funcparserlib.parser import (some, a, maybe, many, finished, skip,
                                  forward_decl, NoParseError)

ENCODING = 'utf-8'

def tokenize(s):
    'str -> Sequence(Token)'
    # Regex fragments adapted from the CSS 2.1 grammar.  Entries may refer to
    # each other through %(name)s placeholders, which are expanded below.
    alias = {
        'h': r'[0-9a-f]',
        'nonascii': r'[\200-\377]',
        'unicode': r'\\(%(h)s){1,6}(\r\n|[ \t\r\n\f])?',
        'escape': r'(%(unicode)s)|\\[^\r\n\f0-9a-f]',
        'nmstart': r'[_a-z]|(%(nonascii)s)|(%(escape)s)',
        'nmchar': r'[_a-z0-9-]|(%(nonascii)s)|(%(escape)s)',
        'string1': r'\"([^\n\r\f\\"]|\\(%(nl)s)|(%(escape)s))*\"',
        'string2': r"\'([^\n\r\f\\']|\\(%(nl)s)|(%(escape)s))*\'",
        'invalid1': r'\"([^\n\r\f\\"]|\\(%(nl)s)|(%(escape)s))*',
        'invalid2': r"\'([^\n\r\f\\']|\\(%(nl)s)|(%(escape)s))*",

        'comment': r'\/\*[^*]*\*+([^/*][^*]*\*+)*\/',
        'ident': r'-?(%(nmstart)s)(%(nmchar)s)*',
        'name': r'(%(nmchar)s)+',
        'hash': r'#(%(name)s)+',
        'num': r'([0-9]*\.[0-9]+|[0-9]+)',
        'string': r'(%(string1)s)|(%(string2)s)',
        'invalid': r'(%(invalid1)s)|(%(invalid2)s)',
        'url': '([!#$%(unprintable)s&*-~]|(%(nonascii)s)|(%(escape)s))*',
        's': r'[ \t\r\n\f]+',
        'w': r'(%(s)s)?',
        'nl': r'\n|\r\n|\r|\f',
        'unprintable': '%(unprintable)s',

        'important': r'!((%(w)s)|(%(comment)s))*(IMPORTANT|important)',
        'em': r'(%(num)s)em',
        'ex': r'(%(num)s)ex',
        'length': r'(%(num)s)(px|cm|mm|in|pt|pc)',
        'angle': r'(%(num)s)(deg|rad|grad)',
        'time': r'(%(num)s)(ms|s)',
        'freq': r'(%(num)s)(hz|khz)',
        'dimension': r'(%(num)s)(%(ident)s)',
        'percentage': r'(%(num)s)%(unprintable)s',

        'uri': r'url\((%(w)s)((%(string)s)|(%(url)s))(%(w)s)\)',

        'function': r'(%(ident)s)\(',
    }

    def applyTillDead(name, alias):
        'Expand %(name)s placeholders until the pattern stops changing.'
        s, sold = alias[name], ''
        while s != sold:
            sold = s
            s = s % alias
        return s

    for k in alias.keys():
        alias[k] = applyTillDead(k, alias)

    # 'unprintable' expands to itself above; only here does it become a
    # literal '%' (needed by 'url' and 'percentage').
    for k in alias.keys():
        alias[k] = alias[k] % {'unprintable': '%'}

    # make_tokenizer() expects each spec as a (regexp, [flags]) tuple.
    for k in alias.keys():
        alias[k] = (alias[k],)

    specs = [
        # spaces
        ('S', alias['s']),

        # comments
        ('COMMENT', alias['comment']),

        # html comments
        ('CDO', ('<!--',)),
        ('CDC', ('-->',)),

        ('INCLUDES', ('~=',)),
        ('DISMATCH', ('!=',)),

        # strings
        ('STRING', alias['string']),
        ('INVALID', alias['invalid']),

        ('HASH', alias['hash']),

        ('IMPORT_SYM', ("@import|@IMPORT",)),
        ('PAGE_SYM', ("@page|@PAGE",)),
        ('MEDIA_SYM', ("@media|@MEDIA",)),
        ('CHARSET_SYM', ("@charset ",)),

        ('IMPORTANT_SYM', alias['important']),

        ('EMS', alias['em']),
        ('EXS', alias['ex']),
        ('LENGTH', alias['length']),
        ('ANGLE', alias['angle']),
        ('TIME', alias['time']),
        ('FREQ', alias['freq']),
        ('DIMENSION', alias['dimension']),

        ('PERCENTAGE', alias['percentage']),

        ('URI', alias['uri']),

        ('NUMBER', alias['num']),

        ('FUNCTION', alias['function']),

        ('IDENT', alias['ident']),

        ('CHAR', ('.',)),
    ]
    useless = ['COMMENT']
    t = make_tokenizer(specs)
    return [x for x in t(s) if x.type not in useless]

def parse(seq):
    'Sequence(Token) -> object'
    # The CSS grammar is not implemented yet; only a single NUMBER token is
    # parsed as a placeholder.
    const = lambda x: lambda _: x
    tokval = lambda x: x.value
    toktype = lambda t: some(lambda x: x.type == t) >> tokval
    op = lambda s: a(Token('Op', s)) >> tokval
    #css_text =
    #css = skip(finished)
    number = some(lambda tok: tok.type == 'NUMBER') >> tokval >> int

    return type(number.parse(seq))

def loads(s):
    'str -> object'
    return parse(tokenize(s))

def main():
    try:
        input = sys.stdin.read().decode(ENCODING)
        tree = loads(input)
        print pformat(tree)
    except (NoParseError, LexerError), e:
        msg = (u'syntax error: %s' % e).encode(ENCODING)
        print >> sys.stderr, msg
        sys.exit(1)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
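As a quick, illustrative smoke test (not part of the repository), the tokenizer can be exercised on a small stylesheet. The snippet assumes cssparser.py is importable and funcparserlib is installed; since parse() is still a placeholder, only tokenization is shown.

# Illustrative usage only (not part of the repository); assumes cssparser.py
# is importable and funcparserlib is installed.
import cssparser

css = u'h1 { color: #fff; margin: 10px; }'
for tok in cssparser.tokenize(css):
    print tok.type, repr(tok.value)
# This should emit IDENT, CHAR, HASH, LENGTH and S (whitespace) tokens.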