├── test.php ├── Makefile ├── simple.lex ├── COPYING ├── README ├── jlex.php └── c.lex /test.php: -------------------------------------------------------------------------------- 1 | nextToken()) { 7 | print_r($t); 8 | } 9 | 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: JLexPHP.jar simple.lex.php c.lex.php 3 | 4 | simple.lex.php: simple.lex JLexPHP.jar 5 | java -cp JLexPHP.jar JLexPHP.Main simple.lex 6 | test -s simple.lex.php || rm simple.lex.php 7 | 8 | c.lex.php: c.lex JLexPHP.jar 9 | java -cp JLexPHP.jar JLexPHP.Main c.lex 10 | test -s c.lex.php || rm c.lex.php 11 | 12 | JLexPHP.jar: JLexPHP/Main.java 13 | javac JLexPHP/Main.java 14 | jar cvf JLexPHP.jar JLexPHP/*.class 15 | 16 | clean: 17 | rm JLexPHP/*.class *.jar 18 | 19 | -------------------------------------------------------------------------------- /simple.lex: -------------------------------------------------------------------------------- 1 | L? \" (\\.|[^\\\"])* \" { $this->createToken(CParser::TK_STRING_LITERAL); } 8 | /* blah */ 9 | %} 10 | 11 | %function nextToken 12 | %line 13 | %char 14 | %state COMMENTS 15 | 16 | ALPHA=[A-Za-z_] 17 | DIGIT=[0-9] 18 | ALPHA_NUMERIC={ALPHA}|{DIGIT} 19 | IDENT={ALPHA}({ALPHA_NUMERIC})* 20 | NUMBER=({DIGIT})+ 21 | WHITE_SPACE=([\ \n\r\t\f])+ 22 | 23 | %% 24 | 25 | {NUMBER} { 26 | return $this->createToken(); 27 | } 28 | {WHITE_SPACE} { } 29 | 30 | "+" { 31 | return $this->createToken(); 32 | } 33 | "-" { 34 | return $this->createToken(); 35 | } 36 | "*" { 37 | return $this->createToken(); 38 | } 39 | "/" { 40 | return $this->createToken(); 41 | } 42 | ";" { 43 | return $this->createToken(); 44 | } 45 | "//" { 46 | $this->yybegin(self::COMMENTS); 47 | } 48 | [^\n] { 49 | } 50 | [\n] { 51 | $this->yybegin(self::YYINITIAL); 52 | } 53 | . 
{ 54 | throw new Exception("bah!"); 55 | } 56 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright 2006 Wez Furlong, OmniTI Computer Consulting, Inc. 2 | Based on JLex which is: 3 | 4 | JLEX COPYRIGHT NOTICE, LICENSE, AND DISCLAIMER 5 | Copyright 1996-2000 by Elliot Joel Berk and C. Scott Ananian 6 | 7 | Permission to use, copy, modify, and distribute this software and its 8 | documentation for any purpose and without fee is hereby granted, 9 | provided that the above copyright notice appear in all copies and that 10 | both the copyright notice and this permission notice and warranty 11 | disclaimer appear in supporting documentation, and that the name of 12 | the authors or their employers not be used in advertising or publicity 13 | pertaining to distribution of the software without specific, written 14 | prior permission. 15 | 16 | The authors and their employers disclaim all warranties with regard to 17 | this software, including all implied warranties of merchantability and 18 | fitness. In no event shall the authors or their employers be liable 19 | for any special, indirect or consequential damages or any damages 20 | whatsoever resulting from loss of use, data or profits, whether in an 21 | action of contract, negligence or other tortious action, arising out 22 | of or in connection with the use or performance of this software. 23 | **************************************************************/ 24 | 25 | 26 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | JLexPHP: a Lexical Analyzer Generator for PHP, based on JLex. 2 | For copyright and licensing information, see the COPYING file. 3 | 4 | This is an adaptation of some Java code that generates lexers from lex style 5 | input files. 
6 | 7 | The porting effort was pretty trivial, with the hardest part being the buffer management. 8 | 9 | Usage is fairly typical of lexers; you'll want to create a lexer file like this: 10 | 11 | ----8<------ 12 | yytext(), "\n"; } 22 | . { echo "Something else ", $this->yytext(), "\n"; } 23 | 24 | ----8<------ 25 | 26 | Then process this file: 27 | 28 | java -cp JLexPHP.jar JLexPHP.Main your.lex 29 | 30 | (the supplied makefile will create the jar file for you, or you 31 | can build it with: 32 | 33 | javac JLexPHP/Main.java 34 | jar cvf JLexPHP.jar JLexPHP/*.class 35 | ) 36 | 37 | JLexPHP will output your.lex.php. It will contain a class that will recognize 38 | the input stream described in your .lex file. 39 | 40 | Usage of that class is along the lines of: 41 | 42 | $scanner = new Yylex(fopen("file", "r")); 43 | while ($scanner->yylex()) 44 | ; 45 | 46 | A more complicated scanner will use the createToken() method to create a token 47 | object that can then be fed into a parser, such as a lemon based parser. You 48 | can see an example of that in the c.lex source file. It is designed to work in 49 | conjunction with its corresponding c.y file in my lemon port for php. 50 | 51 | 52 | You can find more information on the lexer syntax in the JLex manual: 53 | http://www.cs.princeton.edu/~appel/modern/java/JLex/current/manual.html 54 | 55 | 56 | Enjoy! 57 | 58 | --Wez. 
/**
 * Runtime support for scanners: buffered byte-wise reading from a stream,
 * start/end marking of the current match, line/column/character bookkeeping
 * and token creation.
 *
 * Several members ($YY_BOL, $YY_EOF, $yy_lexical_state, $yy_count_lines,
 * $yy_count_chars) are read here but never given a useful value by this
 * class; presumably the generated scanner subclass assigns them -- confirm
 * against the generator output.
 */
class JLexBase {
    /* Scanner constants. Only their values are visible here; their use is
     * presumably in the generated transition tables -- TODO confirm. */
    const YY_F = -1;
    const YY_NO_STATE = -1;
    const YY_NOT_ACCEPT = 0;
    const YY_START = 1;
    const YY_END = 2;
    const YY_NO_ANCHOR = 4;
    const YYEOF = -1;

    /* Not assigned in this class. */
    protected $YY_BOL;
    /* Value returned by yy_advance() when no more input is available. */
    protected $YY_EOF;

    /* Stream resource passed to the constructor. */
    protected $yy_reader;
    /* Input text currently held in memory. */
    protected $yy_buffer;
    /* Number of bytes accounted for in $yy_buffer. */
    protected $yy_buffer_read;
    /* Offset of the next byte yy_advance() will return. */
    protected $yy_buffer_index;
    /* Offset where the current match starts. */
    protected $yy_buffer_start;
    /* Offset just past the marked end of the current match. */
    protected $yy_buffer_end;
    protected $yychar = 0;
    protected $yycol = 0;
    protected $yyline = 0;
    protected $yy_at_bol;
    protected $yy_lexical_state;
    /* True when the previous byte seen by yy_mark_start() was "\r". */
    protected $yy_last_was_cr = false;
    protected $yy_count_lines = false;
    protected $yy_count_chars = false;
    protected $yyfilename = null;

    /**
     * @param resource $stream Open stream to read the input from.
     */
    function __construct($stream) {
        $this->yy_reader = $stream;

        /* Use the stream's URI as the file name when it reports one. */
        $meta = stream_get_meta_data($stream);
        if (!isset($meta['uri'])) {
            $this->yyfilename = '<>';
        } else {
            $this->yyfilename = $meta['uri'];
        }

        $this->yy_buffer = "";
        $this->yy_buffer_read = 0;
        $this->yy_buffer_index = 0;
        $this->yy_buffer_start = 0;
        $this->yy_buffer_end = 0;
        $this->yychar = 0;
        $this->yyline = 1;
        $this->yy_at_bol = true;
    }

    /** Switches the scanner to the given lexical state. */
    protected function yybegin($state) {
        $this->yy_lexical_state = $state;
    }

    /**
     * Returns the byte value (via ord()) of the next input byte and moves
     * the read position past it, refilling the buffer from the stream in
     * 8192-byte reads when needed.
     *
     * @return mixed The byte value, or $this->YY_EOF when the stream yields
     *               no more data.
     */
    protected function yy_advance() {
        if ($this->yy_buffer_index < $this->yy_buffer_read) {
            if (!isset($this->yy_buffer[$this->yy_buffer_index])) {
                return $this->YY_EOF;
            }
            return ord($this->yy_buffer[$this->yy_buffer_index++]);
        }

        if ($this->yy_buffer_start != 0) {
            /* Shunt: drop everything before the current match so the buffer
             * does not grow without bound. The cast guards against substr()
             * returning false for an empty remainder on older PHP versions. */
            $keep = $this->yy_buffer_read - $this->yy_buffer_start;
            $this->yy_buffer = (string) substr($this->yy_buffer, $this->yy_buffer_start, $keep);
            $this->yy_buffer_end -= $this->yy_buffer_start;
            $this->yy_buffer_start = 0;
            $this->yy_buffer_read = $keep;
            $this->yy_buffer_index = $keep;
        }

        /* After a shunt the index equals the read count, so this loop also
         * performs the refill that follows the shunt. */
        while ($this->yy_buffer_index >= $this->yy_buffer_read) {
            $data = fread($this->yy_reader, 8192);
            if ($data === false || !strlen($data)) {
                return $this->YY_EOF;
            }
            $this->yy_buffer .= $data;
            $this->yy_buffer_read += strlen($data);
        }

        return ord($this->yy_buffer[$this->yy_buffer_index++]);
    }

    /**
     * Pulls the marked end back over one trailing "\n" and then one
     * trailing "\r", never moving it before the start of the match.
     */
    protected function yy_move_end() {
        if ($this->yy_buffer_end > $this->yy_buffer_start &&
            $this->yy_buffer[$this->yy_buffer_end - 1] == "\n") {
            $this->yy_buffer_end--;
        }
        if ($this->yy_buffer_end > $this->yy_buffer_start &&
            $this->yy_buffer[$this->yy_buffer_end - 1] == "\r") {
            $this->yy_buffer_end--;
        }
    }

    /**
     * Updates the line/column/character counters for the bytes between the
     * current start and the read position, then makes the read position the
     * new start.
     *
     * NOTE(review): when both counters are enabled, yycol is reset to 0 at a
     * line break and afterwards advanced by the length of the whole span, so
     * it is only exact for spans without embedded line breaks.
     */
    protected function yy_mark_start() {
        if ($this->yy_count_lines) {
            for ($i = $this->yy_buffer_start; $i < $this->yy_buffer_index; ++$i) {
                $ch = $this->yy_buffer[$i];

                /* A "\n" directly after "\r" belongs to the same CRLF line
                 * break and must not be counted twice. */
                if ("\n" == $ch && !$this->yy_last_was_cr) {
                    ++$this->yyline;
                    $this->yycol = 0;
                }

                if ("\r" == $ch) {
                    /* Fixed: this used to increment an undefined local
                     * variable, so CR / CRLF line breaks were never counted. */
                    ++$this->yyline;
                    $this->yycol = 0;
                    $this->yy_last_was_cr = true;
                } else {
                    $this->yy_last_was_cr = false;
                }
            }
        }

        if ($this->yy_count_chars) {
            $consumed = $this->yy_buffer_index - $this->yy_buffer_start;
            $this->yychar += $consumed;
            $this->yycol += $consumed;
        }

        $this->yy_buffer_start = $this->yy_buffer_index;
    }

    /** Records the current read position as the end of the match. */
    protected function yy_mark_end() {
        $this->yy_buffer_end = $this->yy_buffer_index;
    }

    /**
     * Rewinds the read position to the marked end and recomputes whether
     * the scanner is at the beginning of a line.
     */
    protected function yy_to_mark() {
        $this->yy_buffer_index = $this->yy_buffer_end;

        /* The previous implementation additionally compared the last byte
         * with the integers 2028 and 2029 (annotated as Unicode LS / PS).
         * A one-byte string can never compare equal to those numbers, so
         * only CR and LF are tested. */
        $at_bol = false;
        if ($this->yy_buffer_end > $this->yy_buffer_start) {
            $last = $this->yy_buffer[$this->yy_buffer_end - 1];
            $at_bol = ("\r" == $last || "\n" == $last);
        }
        $this->yy_at_bol = $at_bol;
    }

    /** Returns the text between the start and the marked end. */
    protected function yytext() {
        return substr($this->yy_buffer, $this->yy_buffer_start,
            $this->yy_buffer_end - $this->yy_buffer_start);
    }

    /** Returns the number of bytes between the start and the marked end. */
    protected function yylength() {
        return $this->yy_buffer_end - $this->yy_buffer_start;
    }

    /* Messages printed by yy_error(), keyed by error code. */
    static $yy_error_string = array(
        'INTERNAL' => "Error: internal error.\n",
        'MATCH' => "Error: Unmatched input.\n"
    );

    /**
     * Prints the message registered for $code and, when $fatal is true,
     * throws.
     *
     * @param string $code  Key into self::$yy_error_string.
     * @param bool   $fatal Whether to throw after printing.
     * @throws Exception when $fatal is true.
     */
    protected function yy_error($code, $fatal) {
        print self::$yy_error_string[$code];
        flush();
        if ($fatal) {
            throw new Exception("JLex fatal error " . self::$yy_error_string[$code]);
        }
    }

    /**
     * Creates an annotated token.
     *
     * @param mixed $type Token type; when null the matched text is used.
     * @return JLexToken
     */
    function createToken($type = null) {
        if ($type === null) {
            $type = $this->yytext();
        }
        $tok = new JLexToken($type);
        $this->annotateToken($tok);
        return $tok;
    }

    /**
     * Annotates a token with the matched text and the current source
     * position (column, line, file name).
     */
    function annotateToken(JLexToken $tok) {
        $tok->value = $this->yytext();
        $tok->col = $this->yycol;
        $tok->line = $this->yyline;
        $tok->filename = $this->yyfilename;
    }
}
$this->createToken(CParser::TK_FLOAT); } 56 | "for" { return $this->createToken(CParser::TK_FOR); } 57 | "goto" { return $this->createToken(CParser::TK_GOTO); } 58 | "if" { return $this->createToken(CParser::TK_IF); } 59 | "int" { return $this->createToken(CParser::TK_INT); } 60 | "long" { return $this->createToken(CParser::TK_LONG); } 61 | "register" { return $this->createToken(CParser::TK_REGISTER); } 62 | "return" { return $this->createToken(CParser::TK_RETURN); } 63 | "short" { return $this->createToken(CParser::TK_SHORT); } 64 | "signed" { return $this->createToken(CParser::TK_SIGNED); } 65 | "sizeof" { return $this->createToken(CParser::TK_SIZEOF); } 66 | "static" { return $this->createToken(CParser::TK_STATIC); } 67 | "struct" { return $this->createToken(CParser::TK_STRUCT); } 68 | "switch" { return $this->createToken(CParser::TK_SWITCH); } 69 | "typedef" { return $this->createToken(CParser::TK_TYPEDEF); } 70 | "union" { return $this->createToken(CParser::TK_UNION); } 71 | "unsigned" { return $this->createToken(CParser::TK_UNSIGNED); } 72 | "void" { return $this->createToken(CParser::TK_VOID); } 73 | "volatile" { return $this->createToken(CParser::TK_VOLATILE); } 74 | "while" { return $this->createToken(CParser::TK_WHILE); } 75 | 76 | {L}({L}|{D})* { return $this->createToken(CParser::TK_IDENTIFIER); } 77 | 78 | 0[xX]{H}+{IS}? { return $this->createToken(CParser::TK_CONSTANT); } 79 | 0{D}+{IS}? { return $this->createToken(CParser::TK_CONSTANT); } 80 | {D}+{IS}? { return $this->createToken(CParser::TK_CONSTANT); } 81 | L?\'(\\.|[^\\\'])+\' { return $this->createToken(CParser::TK_CONSTANT); } 82 | 83 | {D}+{E}{FS}? { return $this->createToken(CParser::TK_CONSTANT); } 84 | {D}*"."{D}+({E})?{FS}? { return $this->createToken(CParser::TK_CONSTANT); } 85 | {D}+"."{D}*({E})?{FS}? { return $this->createToken(CParser::TK_CONSTANT); } 86 | 87 | L?\"(\\.|[^\\\"])*\" { return $this->createToken(CParser::TK_STRING_LITERAL); } 88 | 89 | "..." 
{ return $this->createToken(CParser::TK_ELLIPSIS); } 90 | ">>=" { return $this->createToken(CParser::TK_RIGHT_ASSIGN); } 91 | "<<=" { return $this->createToken(CParser::TK_LEFT_ASSIGN); } 92 | "+=" { return $this->createToken(CParser::TK_ADD_ASSIGN); } 93 | "-=" { return $this->createToken(CParser::TK_SUB_ASSIGN); } 94 | "*=" { return $this->createToken(CParser::TK_MUL_ASSIGN); } 95 | "/=" { return $this->createToken(CParser::TK_DIV_ASSIGN); } 96 | "%=" { return $this->createToken(CParser::TK_MOD_ASSIGN); } 97 | "&=" { return $this->createToken(CParser::TK_AND_ASSIGN); } 98 | "^=" { return $this->createToken(CParser::TK_XOR_ASSIGN); } 99 | "|=" { return $this->createToken(CParser::TK_OR_ASSIGN); } 100 | ">>" { return $this->createToken(CParser::TK_RIGHT_OP); } 101 | "<<" { return $this->createToken(CParser::TK_LEFT_OP); } 102 | "++" { return $this->createToken(CParser::TK_INC_OP); } 103 | "--" { return $this->createToken(CParser::TK_DEC_OP); } 104 | "->" { return $this->createToken(CParser::TK_PTR_OP); } 105 | "&&" { return $this->createToken(CParser::TK_AND_OP); } 106 | "||" { return $this->createToken(CParser::TK_OR_OP); } 107 | "<=" { return $this->createToken(CParser::TK_LE_OP); } 108 | ">=" { return $this->createToken(CParser::TK_GE_OP); } 109 | "==" { return $this->createToken(CParser::TK_EQ_OP); } 110 | "!=" { return $this->createToken(CParser::TK_NE_OP); } 111 | ";" { return $this->createToken(CParser::TK_SEMIC); } 112 | ("{"|"<%") { return $this->createToken(CParser::TK_LCURLY); } 113 | ("}"|"%>") { return $this->createToken(CParser::TK_RCURLY); } 114 | "," { return $this->createToken(CParser::TK_COMMA); } 115 | ":" { return $this->createToken(CParser::TK_COLON); } 116 | "=" { return $this->createToken(CParser::TK_EQUALS); } 117 | "(" { return $this->createToken(CParser::TK_LPAREN); } 118 | ")" { return $this->createToken(CParser::TK_RPAREN); } 119 | ("["|"<:") { return $this->createToken(CParser::TK_LSQUARE); } 120 | ("]"|":>") { return 
$this->createToken(CParser::TK_RSQUARE); } 121 | "." { return $this->createToken(CParser::TK_PERIOD); } 122 | "&" { return $this->createToken(CParser::TK_AMP); } 123 | "!" { return $this->createToken(CParser::TK_EXCLAM); } 124 | "~" { return $this->createToken(CParser::TK_TILDE); } 125 | "-" { return $this->createToken(CParser::TK_MINUS); } 126 | "+" { return $this->createToken(CParser::TK_PLUS); } 127 | "*" { return $this->createToken(CParser::TK_STAR); } 128 | "/" { return $this->createToken(CParser::TK_SLASH); } 129 | "%" { return $this->createToken(CParser::TK_PERCENT); } 130 | "<" { return $this->createToken(CParser::TK_LANGLE); } 131 | ">" { return $this->createToken(CParser::TK_RANGLE); } 132 | "^" { return $this->createToken(CParser::TK_CARET); } 133 | "|" { return $this->createToken(CParser::TK_PIPE); } 134 | "?" { return $this->createToken(CParser::TK_QUESTION); } 135 | 136 | [ \t\v\n\f] { } 137 | . { /* ignore bad characters */ } 138 | 139 | --------------------------------------------------------------------------------