├── cpp-sql-parser
    ├── head.h
    ├── makefile
    ├── treeNode.h
    ├── treeNode.cpp
    ├── sql.l
    └── sql.y
├── py-sql-parser
    ├── node.py
    └── yacc.py
└── README.md


/cpp-sql-parser/head.h:
--------------------------------------------------------------------------------
// Shared prologue for the generated lexer (sql.l) and parser (sql.y).
// Both must agree that a semantic value is a TreeNode*.
#ifndef HEAD_H
#define HEAD_H

#include "treeNode.h"
// NOTE(review): the two header names below were lost when this file was
// extracted; <iostream> and <string> are assumed because the lexer and parser
// use cout and string. Confirm against the original repository.
#include <iostream>
#include <string>
using namespace std;
// Semantic value type used by yacc ($$, $1, ...) and by the lexer (yylval).
#define YYSTYPE TreeNode*
#endif

--------------------------------------------------------------------------------
/cpp-sql-parser/makefile:
--------------------------------------------------------------------------------
LEX=flex
YACC=yacc
CC=g++

sql_parser: sql_y.o sql_l.o treeNode.o
	$(CC) sql_y.o sql_l.o treeNode.o -o $@ -lfl

# yacc -d also writes y.tab.h, which the lexer includes.
y.tab.c: sql.y
	$(YACC) -vdt sql.y

lex.yy.c: sql.l
	$(LEX) sql.l

sql_y.o: y.tab.c head.h treeNode.h
	$(CC) -c $< -o $@

# lex.yy.c includes y.tab.h, so the parser must be generated before the lexer
# is compiled; depending on y.tab.c makes that ordering explicit (safe with -j).
sql_l.o: lex.yy.c y.tab.c head.h treeNode.h
	$(CC) -c $< -o $@

treeNode.o: treeNode.cpp treeNode.h
	$(CC) -std=c++11 -c $< -o $@

.PHONY: clean
clean:
	rm -f *.c *.o sql_parser y.output y.tab.h

--------------------------------------------------------------------------------
/py-sql-parser/node.py:
--------------------------------------------------------------------------------
class node:
    """A labelled tree node: one piece of data plus an ordered list of children."""

    def __init__(self, data):
        self._data = data
        self._children = []

    def getdata(self):
        """Return the label stored in this node."""
        return self._data

    def getchildren(self):
        """Return the list of child nodes (the live list, not a copy)."""
        return self._children

    def add(self, node):
        """Append `node` as the last child."""
        self._children.append(node)

    def print_node(self, prefix):
        """Print this subtree, one node per line, indenting one space per level.

        `prefix` is the depth of this node; children are printed at prefix+1.
        """
        # A single pre-formatted string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print('%s + %s' % (' ' * prefix, self._data))
        for child in self._children:
            child.print_node(prefix + 1)

--------------------------------------------------------------------------------
/cpp-sql-parser/treeNode.h:
--------------------------------------------------------------------------------
#ifndef _TREENODE_H_
#define _TREENODE_H_

// NOTE(review): the three header names were lost in extraction. <string> and
// <vector> are required by the declarations in this header; <iostream> is an
// assumption for the third one.
#include <iostream>
#include <string>
#include <vector>
using std::string;
using std::vector;

// A labelled n-ary tree node used to hold the parse result.
// Children are stored as raw pointers and are never freed by this class.
class TreeNode {
public:
    TreeNode(const string &s) : data(s) {}
    // Returns a copy of this node's label.
    string getData() const;
    // Returns a copy of the child pointer list.
    vector<TreeNode*> getChildren() const;
    // Appends node as the last child; a null pointer is ignored.
    void add(TreeNode *node);
    // Returns the first direct child whose label equals data, or nullptr.
    TreeNode* go(const string &data) const;
    // Prints this subtree to stdout, one node per line; num is the depth.
    void print_node(int num) const;

private:
    string data;
    vector<TreeNode*> children;
};

#endif // _TREENODE_H_

--------------------------------------------------------------------------------
/cpp-sql-parser/treeNode.cpp:
--------------------------------------------------------------------------------
#include "treeNode.h"
#include <iostream>
using std::cout;
using std::endl;

string TreeNode::getData() const {
    return data;
}

vector<TreeNode*> TreeNode::getChildren() const {
    return children;
}

void TreeNode::add(TreeNode *node) {
    // print_node() and go() dereference every child, so never store null.
    if (node != nullptr) {
        children.push_back(node);
    }
}

TreeNode* TreeNode::go(const string &data) const {
    for (auto child : children) {
        if (child->getData() == data) {
            return child;
        }
    }
    return nullptr;
}

// NOTE(review): the text between "i<" and "->print_node" was lost in
// extraction. The indent loop bound and the "+ label" line are reconstructed
// to mirror node.print_node in py-sql-parser; confirm the exact strings.
void TreeNode::print_node(int num) const {
    for (int i = 0; i < num; i++) {
        cout << " ";
    }
    cout << "+ " << data << endl;
    for (auto child : children) {
        child->print_node(num + 1);
    }
}

--------------------------------------------------------------------------------
/cpp-sql-parser/sql.l:
--------------------------------------------------------------------------------
%{
#include "head.h"
#include "y.tab.h"
/* NOTE(review): two header names were lost in extraction; <iostream> and
   <string> are assumed because this file uses cout and string. */
#include <iostream>
#include <string>
using namespace std;
extern YYSTYPE yylval;

%}

/* whitespace must match at least one character; newlines are included so they
   are skipped instead of being echoed by flex's default rule. */
/* number: {digit} must be a brace reference; "(digit)" would match the
   literal text "digit". */
digit [0-9]
letter [A-Za-z]
word {letter}({letter}|{digit})*
whitespace [ \t\r\n]+
compare [<>=]|(!=)|(>=)|(<=)|(like)
number {digit}+\.?{digit}*
%%

[\(] {return LEFTPARENTHESIS;}
[\)] {return RIGHTPARENTHESIS;}
select {return SELECT;}
from {return FROM;}
where {return WHERE;}
{whitespace} {}
and {yylval = new TreeNode("AND"); return LOGIC;}
or {yylval = new TreeNode("OR"); return LOGIC;}
, {return COMMA;}
; {return SEMICOLON;}
{compare} {yylval = new TreeNode(yytext); return COMPARE;}
{number} {yylval = new TreeNode(yytext); return NUMBER;}
{word} {yylval = new TreeNode(yytext); return WORD;}
. {}
%%
/* Tell the scanner there is no further input after EOF. */
int yywrap(void) {
    return 1;
}

/* Called by the parser on a syntax error; prints the message to stdout. */
int yyerror(string s)
{
    cout << s << endl;
    return 1;
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
sql-parser
==========
> A simple SQL parser written in Python and C++; the parse result is stored in a tree.
> For now, only SELECT queries are implemented.

### Python
**Prerequisites:**
* PLY (Python Lex-Yacc) [HELP](http://www.dabeaz.com/ply/ply.html)

**Usage**
```
python yacc.py

-> SELECT a, b FROM c
 + QUERY
  + [SELECT]
  + [FIELDS]
   + [FIELD]
    + a
   + [FIELD]
    + b
  + [FROM]
  + [TABLE]
   + c

-> SELECT a . b , c . d FROM aaa AS a , ccc AS c
 + QUERY
  + [SELECT]
  + [FIELDS]
   + [FIELD]
    + a.b
   + [FIELD]
    + c.d
  + [FROM]
  + [TABLES]
   + [TABLE]
    + aaa
    + AS
    + a
   + [TABLE]
    + ccc
    + AS
    + c

-> SELECT a FROM ( SELECT b FROM c WHERE d > 1 ) ORDER BY e
 + QUERY
  + [SELECT]
  + [FIELD]
   + a
  + [FROM]
  + [TABLE]
   + QUERY
    + [SELECT]
    + [FIELD]
     + b
    + [FROM]
    + [TABLE]
     + c
    + [WHERE]
    + [CONDITION]
     + [TERM]
      + d
      + >
      + 1
  + [ORDER BY]
  + [FIELD]
   + e

-> SELECT COUNT ( * ) FROM a WHERE b < 1 AND c > 2 ORDER BY d
 + QUERY
  + [SELECT]
  + [FIELD]
   + COUNT(*)
  + [FROM]
  + [TABLE]
   + a
  + [WHERE]
  + [CONDITIONS]
   + [TERM]
    + b
    + <
    + 1
   + [AND]
   + [TERM]
    + c
    + >
    + 2
  + [ORDER BY]
  + [FIELD]
   + d
```

### C++
**Prerequisites:**
* Lex
* Yacc

**Usage**
```
make
./sql_parser
//enter your select queries (lower-case keywords, terminated by ';')
```
--------------------------------------------------------------------------------
/cpp-sql-parser/sql.y:
--------------------------------------------------------------------------------
%{
#include "head.h"
/* NOTE(review): two header names were lost in extraction; <iostream> and
   <string> are assumed. */
#include <iostream>
#include <string>
using namespace std;

extern int yylex(void);
extern int yyparse(void);
extern int yyerror(string);

%}
%token SELECT FROM WHERE WORD COMMA SEMICOLON COMPARE NUMBER LOGIC LEFTPARENTHESIS RIGHTPARENTHESIS

%%
/* YYSTYPE is TreeNode* (see head.h), so $$ and $n are used without casts. */

/* One statement: print the tree of the query and stop parsing. */
all:
    select SEMICOLON
    {
        $1->print_node(1);
        YYACCEPT;
    }
    ;

select:
    SELECT tnames FROM tables
    {
        $$ = new TreeNode("QUERY");
        $$->add( new TreeNode("SELECT") );
        $$->add( $2 );
        $$->add( new TreeNode("FROM") );
        $$->add( $4 );
    }

    | SELECT tnames FROM tables WHERE wheres
    {
        $$ = new TreeNode("QUERY");
        $$->add( new TreeNode("SELECT") );
        $$->add( $2 );
        $$->add( new TreeNode("FROM") );
        $$->add( $4 );
        $$->add( new TreeNode("WHERE") );
        $$->add( $6 );
    }
    ;

tnames:
    tname
    {
        $$ = new TreeNode("FIELD");
        $$->add( $1 );
    }
    | tnames COMMA tname
    {
        $$ = new TreeNode("FIELDS");
        $$->add( $1 );
        $$->add( new TreeNode(",") );
        $$->add( $3 );
    }
    ;

/* The lexer already allocated a node holding the word; reuse it rather than
   copying its text into a second node and leaking the first. */
tname:
    WORD
    {
        $$ = $1;
    }
    ;


tables:
    table
    {
        $$ = new TreeNode("TABLE");
        $$->add( $1 );
    }
    | tables COMMA table
    {
        $$ = new TreeNode("TABLES");
        $$->add( $1 );
        $$->add( new TreeNode(",") );
        $$->add( $3 );
    }
    ;


table:
    WORD
    {
        $$ = $1;
    }
    | LEFTPARENTHESIS select RIGHTPARENTHESIS
    {
        $$ = $2;
    }
    ;

wheres:
    comp
    {
        $$ = new TreeNode("CONDITION");
        $$->add( $1 );
    }
    | comp LOGIC wheres
    {
        $$ = new TreeNode("CONDITIONS");
        $$->add( $1 );
        $$->add( $2 );
        $$->add( $3 );
    }
    ;

comp:
    wordornum COMPARE wordornum
    {
        $$ = new TreeNode("COMPARE");
        $$->add( $1 );
        $$->add( $2 );
        $$->add( $3 );
    }
    ;
/* The lexer already allocated a node holding the token text; reuse it rather
   than copying the text into a second node and leaking the first. */
wordornum:
    WORD
    {
        $$ = $1;
    }
    | NUMBER
    {
        $$ = $1;
    }
    ;

%%

/* Parse one statement from stdin; the exit status is yyparse()'s result
   (0 on success, non-zero on a syntax error). */
int main()
{
    return yyparse();
}

--------------------------------------------------------------------------------
/py-sql-parser/yacc.py:
--------------------------------------------------------------------------------
import ply.lex as lex
import ply.yacc as yacc
import re
from math import *
from node import node

#TOKENS
tokens=('SELECT','FROM','WHERE','ORDER','BY','NAME','AND','OR','COMMA',
        'LP','RP','AVG','BETWEEN','IN','SUM','MAX','MIN','COUNT','NUMBER','AS','DOT')

literals = ['=','+','-','*', '^','>','<' ]

# Keywords are recognised inside t_NAME instead of by one regex rule each.
# A rule such as r'IN' also matches the first two letters of INDEX, so
# identifiers that merely start with a keyword used to be split in two.
reserved = frozenset((
    'SELECT', 'FROM', 'WHERE', 'ORDER', 'BY', 'AND', 'OR', 'AS',
    'AVG', 'BETWEEN', 'IN', 'SUM', 'MAX', 'MIN', 'COUNT',
))

#DEFINE OF TOKENS
def t_LP(t):
    r'\('
    return t

def t_RP(t):
    r'\)'
    return t

def t_DOT(t):
    r'\.'
    return t

def t_COMMA(t):
    r','
    return t

def t_NUMBER(t):
    r'[0-9]+'
    return t

def t_NAME(t):
    r'[a-zA-Z_][a-zA-Z0-9_]*'
    # A whole word that is a keyword becomes that keyword's token; anything
    # else (including names containing digits or underscores) is a NAME.
    if t.value in reserved:
        t.type = t.value
    return t

# IGNORED
t_ignore = " \t"

def t_error(t):
    """Report a character no rule matches, then skip it and keep lexing."""
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# LEX ANALYSIS
lex.lex()

#PARSING
def p_query(t):
    '''query : select
             | LP query RP
    '''
    # A parenthesised query yields the same tree as the bare query.
    if len(t)==2:
        t[0]=t[1]
    else:
        t[0]=t[2]

def p_select(t):
    '''select : SELECT list FROM table WHERE lst ORDER BY list
              | SELECT list FROM table WHERE lst
              | SELECT list FROM table ORDER BY list
              | SELECT list FROM table '''
    # len(t) identifies the production: 10 = WHERE + ORDER BY, 7 = WHERE only,
    # 8 = ORDER BY only, 5 = neither.
    query = node('QUERY')
    query.add(node('[SELECT]'))
    query.add(t[2])
    query.add(node('[FROM]'))
    query.add(t[4])
    if len(t) in (7, 10):
        query.add(node('[WHERE]'))
        query.add(t[6])
    if len(t) in (8, 10):
        query.add(node('[ORDER BY]'))
        # The ORDER BY list is always the last symbol of the production.
        query.add(t[len(t) - 1])
    t[0] = query

def p_table(t):
    '''table : NAME
             | LP query RP
             | NAME AS NAME
             | table AS NAME
             | table COMMA table'''
    if len(t)==2:
        t[0]=node('[TABLE]')
        t[0].add(node(t[1]))
    elif t[2]=='AS':
        # t[1] is already a node for "table AS NAME" and a plain string for
        # "NAME AS NAME".
        t[0]=node('[TABLE]')
        t[0].add(t[1] if isinstance(t[1], node) else node(t[1]))
        t[0].add(node('AS'))
        t[0].add(node(t[3]))
    elif t[2]==',':
        t[0]=node('[TABLES]')
        t[0].add(t[1])
        t[0].add(t[3])
    else:
        # LP query RP: the sub-query tree becomes the table.
        t[0]=node('[TABLE]')
        t[0].add(t[2])

182 | def p_lst(t): 183 | ''' lst : condition 184 | | condition AND condition 185 | | condition OR condition 186 | | NAME BETWEEN NUMBER AND NUMBER 187 | | NAME IN LP query RP 188 | | NAME '<' agg 189 | | NAME '>' agg 190 | | agg '>' NUMBER 191 | | NAME '=' agg 192 | | agg '=' NUMBER 193 | | agg '<' NUMBER 194 | ''' 195 | 196 | if len(t)==2: 197 | t[0]=node('[CONDITION]') 198 | t[0].add(t[1]) 199 | elif t[2]==',': 200 | t[0]=node('[CONDITIONS]') 201 | t[0].add(t[1]) 202 | t[0].add(t[3]) 203 | elif t[2]=='AND': 204 | t[0]=node('[CONDITIONS]') 205 | t[0].add(t[1]) 206 | t[0].add(node('[AND]')) 207 | t[0].add(t[3]) 208 | elif t[2]=='OR': 209 | t[0]=node('[CONDITIONS]') 210 | t[0].add(t[1]) 211 | t[0].add(node('[OR]')) 212 | t[0].add(t[3]) 213 | elif t[2]=='BETWEEN': 214 | temp='%s >= %s & %s <= %s'%(t[1],str(t[3]),t[1],str(t[5])) 215 | t[0]=node('[CONDITION]') 216 | t[0].add(node('[TERM]')) 217 | t[0].add(node(temp)) 218 | elif t[2]=='IN': 219 | t[0]=node('[CONDITION]') 220 | t[0].add(node(t[1])) 221 | t[0].add(node('[IN]')) 222 | t[0].add(t[4]) 223 | elif t[2]=='<' and len(t)==4: 224 | temp='%s < %s'%(str(t[1]),str(t[3])) 225 | t[0]=node('[CONDITION]') 226 | t[0].add(node('[TERM]')) 227 | t[0].add(node(temp)) 228 | elif t[2]=='=' and len(t)==4: 229 | temp='%s = %s'%(str(t[1]),str(t[3])) 230 | t[0]=node('[CONDITION]') 231 | t[0].add(node('[TERM]')) 232 | t[0].add(node(temp)) 233 | elif t[2]=='>' and len(t)==4: 234 | temp='%s > %s'%(str(t[1]),str(t[3])) 235 | t[0]=node('[CONDITION]') 236 | t[0].add(node('[TERM]')) 237 | t[0].add(node(temp)) 238 | else: 239 | t[0]=node('') 240 | 241 | 242 | def p_condition(t): 243 | ''' condition : NAME '>' NUMBER 244 | | NAME '>' agg 245 | | NAME '<' NUMBER 246 | | NAME '<' agg 247 | | NAME '=' NUMBER 248 | | NAME '=' agg 249 | | NAME '>' NAME 250 | | NAME '<' NAME 251 | | NAME '=' NAME 252 | | list '>' list 253 | | list '<' list 254 | | list '=' list 255 | | list '>' NUMBER 256 | | list '<' NUMBER 257 | | list '=' NUMBER ''' 258 | 
t[0]=node('[TERM]') 259 | if isinstance(t[1], node) : 260 | t[0].add(t[1]) 261 | else : 262 | t[0].add(node(str(t[1]))) 263 | t[0].add(node(t[2])) 264 | if isinstance(t[3], node) : 265 | t[0].add(t[3]) 266 | else : 267 | t[0].add(node(str(t[3]))) 268 | 269 | def p_agg(t): 270 | ''' agg : SUM LP NAME RP 271 | | AVG LP NAME RP 272 | | COUNT LP NAME RP 273 | | MIN LP NAME RP 274 | | MAX LP NAME RP 275 | | COUNT LP '*' RP ''' 276 | t[0]='%s(%s)'%(t[1],t[3]) 277 | 278 | def p_list(t): 279 | ''' list : '*' 280 | | NAME 281 | | NAME DOT NAME 282 | | list COMMA list 283 | | list AND NAME 284 | | list OR NAME 285 | | agg ''' 286 | if len(t)==2: 287 | t[0]=node('[FIELD]') 288 | t[0].add(node(t[1])) 289 | elif t[2]==',': 290 | t[0]=node('[FIELDS]') 291 | t[0].add(t[1]) 292 | t[0].add(t[3]) 293 | else: 294 | temp='%s.%s'%(t[1],t[3]) 295 | t[0]=node('[FIELD]') 296 | t[0].add(node(temp)) 297 | 298 | def p_error(t): 299 | print("Syntax error at '%s'" % t.value) 300 | 301 | yacc.yacc() 302 | 303 | while 1: 304 | try: 305 | s = raw_input('-> ') 306 | pass 307 | except EOFError: 308 | break 309 | parse=yacc.parse(s) 310 | parse.print_node(0) 311 | --------------------------------------------------------------------------------