├── .gitignore ├── gp ├── __init__.py ├── mediawiki.py ├── mysql.py └── client.py ├── README └── flatten.py /.gitignore: -------------------------------------------------------------------------------- 1 | SciTE.properties 2 | *.pyc 3 | -------------------------------------------------------------------------------- /gp/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 'client', 'mysql', 'mediawiki' ] 2 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Script for flattening parent categories of articles to make categories recursively searchable in elasticsearch. 2 | Currently used for testing on a single elasticsearch/mediawiki/graphserv host. Don't use this if you're not me. 3 | -J. 4 | -------------------------------------------------------------------------------- /flatten.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf-8 -*- 3 | import os, sys 4 | import time 5 | import pprint 6 | import traceback 7 | import argparse 8 | import json 9 | import MySQLdb 10 | import elasticsearch 11 | import elasticsearch.helpers 12 | from gp import client 13 | 14 | EL_HOST="localhost" 15 | EL_PORT=9200 16 | EL_INDEX="gptest1wiki" 17 | GP_HOST="localhost" 18 | GP_PORT=6666 19 | GP_GRAPH="gptest1wiki" 20 | SQL_HOST="localhost" 21 | SQL_PORT=3306 22 | SQL_DB="gptest1wiki" 23 | 24 | BULK_ENABLED=True 25 | BULK_CHUNK_SIZE=500 26 | TARGET_FIELD="parent_categories" 27 | 28 | pp= pprint.PrettyPrinter(indent=2) 29 | 30 | def getParentcats(categories, gp, cursor, include_titles): 31 | paramfmt= ','.join(['%s']*len(categories)) 32 | sqlstr= "select page_id from page where page_namespace=14 and page_title in (%s)" % paramfmt 33 | params= [ cat.encode('utf-8') for cat in categories ] 34 | #~ print sqlstr, params 35 | 
cursor.execute(sqlstr, params) 36 | sqlres= cursor.fetchall() 37 | #~ pp.pprint(sqlres) 38 | totalcats= dict() 39 | # get parent category ids for all categories 40 | for row in sqlres: 41 | cat_id= row[0] 42 | parentcats= gp.capture_traverse_predecessors_withdepth(cat_id, 1000) 43 | #~ print parentcats 44 | if parentcats!=None: 45 | for cat in parentcats: 46 | if not str(cat[0]) in totalcats or totalcats[str(cat[0])] > cat[1]+1: 47 | totalcats[str(cat[0])]= cat[1]+1 48 | # get category titles for all parent categories 49 | if include_titles and len(totalcats): 50 | ids= [] 51 | for id in totalcats: 52 | ids.append(id) 53 | sqlsel= ' or '.join( ["page_id=%s"]*len(ids) ) 54 | sqlstr= "select page_id, page_title from page where page_namespace=14 and (%s)" % sqlsel 55 | #~ print ids 56 | cursor.execute(sqlstr, ids) 57 | #~ print cursor.fetchall() 58 | for row in cursor.fetchall(): 59 | title= row[1] 60 | totalcats[title]= totalcats[str(row[0])] 61 | return totalcats 62 | 63 | # from http://stackoverflow.com/questions/1038824 64 | def strip_suffix(text, suffix): 65 | if not text.endswith(suffix): 66 | return text 67 | return text[:len(text)-len(suffix)] 68 | 69 | def makeBulkUpdateAction(hit, gp, cursor, include_titles): 70 | if "category" in hit["fields"]: 71 | parentcats= getParentcats(hit["fields"]["category"], gp, cursor, include_titles) 72 | else: 73 | parentcats= dict() 74 | #~ print("makeBulkUpdateAction: %s in index %s" % (hit["fields"]["title"], hit["_index"])) 75 | if len(parentcats)==0: 76 | parentcats["dummy"]= 1 # we add this because empty dicts confuse elasticsearch, end up as empty lists in the index, and are ignored in "q=_exists_" searches... 
77 | action= { 78 | "_op_type": "update", 79 | "_index": strip_suffix(hit["_index"], "_first"), 80 | "_type": "page", 81 | "_id": hit["_id"], 82 | "script": "ctx._source.remove(\"%s\"); ctx._source.%s= %s" % (TARGET_FIELD, TARGET_FIELD, json.dumps(parentcats, ensure_ascii=False, encoding='utf-8')) 83 | } 84 | return action 85 | 86 | def updateParents(hit, es, gp, cursor, include_titles): 87 | #~ print "%s (%s) in index %s" % (hit["fields"]["title"], hit["_id"], hit["_index"]) 88 | if "category" in hit["fields"]: 89 | totalcats= getParentcats(hit["fields"]["category"], gp, cursor, include_titles) 90 | else: 91 | totalcats= dict() 92 | if len(parentcats)==0: 93 | totalcats["dummy"]= 1 # we add this because empty dicts confuse elasticsearch, end up as empty lists in the index, and are ignored in "q=_exists_" searches... 94 | body= { "script": "ctx._source.remove(\"%s\"); ctx._source.%s= %s" % (TARGET_FIELD, TARGET_FIELD, str(totalcats)) } 95 | #~ body= { "script": "ctx._source.remove(\"%s\")" % TARGET_FIELD } 96 | es.update(index=strip_suffix(hit["_index"], "_first"), doc_type="page", id=hit["_id"], body=body) 97 | 98 | if __name__=='__main__': 99 | parser= argparse.ArgumentParser(description= 'flatten.py', formatter_class= argparse.RawDescriptionHelpFormatter) 100 | parser.add_argument('-i', '--init', dest="init", action="store_true", help="process all pages") 101 | parser.add_argument('-n', '--include-titles', dest="include_titles", action="store_true", help="include category titles in target field") 102 | parser.set_defaults(init=False, include_titles=False) 103 | 104 | args= parser.parse_args() 105 | 106 | 107 | es= elasticsearch.Elasticsearch(hosts=[ { "host": EL_HOST, "port": EL_PORT } ]) 108 | gp= client.Connection(client.ClientTransport(GP_HOST, int(GP_PORT))) 109 | gp.connect() 110 | gp.use_graph(GP_GRAPH) 111 | sql= MySQLdb.connect(read_default_file=os.path.expanduser("~/.my.cnf"), host=SQL_HOST, port=SQL_PORT, db=SQL_DB, use_unicode=True) 112 | cursor= 
sql.cursor() 113 | 114 | es.indices.put_mapping(index="_all", doc_type="page", body= { "dynamic": "true" }) 115 | 116 | 117 | if args.init: 118 | query= "*" 119 | else: 120 | query= "!_exists_:%s" % TARGET_FIELD 121 | 122 | res= es.count(index=EL_INDEX, doc_type="page", q="*") 123 | count= res["count"] 124 | print("approx. pages to process: %s" % count) 125 | scroll= elasticsearch.helpers.scan(es, doc_type="page", fields=["_id", "title", "category"], q=query) 126 | 127 | begintime= time.time() 128 | bulkactions= [] 129 | hits_processed= 0 130 | for hit in scroll: 131 | if BULK_ENABLED: 132 | bulkactions.append(makeBulkUpdateAction(hit, gp, cursor, args.include_titles)) 133 | else: 134 | updateParents(hit, es, gp, cursor, include_titles) 135 | print "%5d/%d... (%.2f/sec.) \r" % (hits_processed, count, hits_processed/(time.time()-begintime)), 136 | hits_processed+= 1 137 | if len(bulkactions)==BULK_CHUNK_SIZE: 138 | print(" * running bulk update...") 139 | sys.stdout.flush(); 140 | r= elasticsearch.helpers.bulk(es, bulkactions, request_timeout=60*2) 141 | print " bulk result: ", 142 | pp.pprint(r) 143 | print(" * processed %s of approx %s hits at %.2f/sec" % 144 | (hits_processed, count, hits_processed/(time.time()-begintime))) 145 | bulkactions= [] 146 | if len(bulkactions): 147 | print(" * running bulk update...") 148 | sys.stdout.flush(); 149 | r= elasticsearch.helpers.bulk(es, bulkactions, request_timeout=60*2) 150 | print " bulk result: ", 151 | pp.pprint(r) 152 | es.indices.flush(index="_all") 153 | bulkactions= [] 154 | print("") 155 | 156 | -------------------------------------------------------------------------------- /gp/mediawiki.py: -------------------------------------------------------------------------------- 1 | from client import * 2 | from mysql import * 3 | 4 | import re 5 | 6 | NS_MAIN = 0 7 | NS_TALK = 1 8 | NS_USER = 2 9 | NS_USER_TALK = 3 10 | NS_PROJECT = 4 11 | NS_PROJECT_TALK = 5 12 | NS_FILE = 6 13 | NS_FILE_TALK = 7 14 | NS_MEDIAWIKI = 8 
NS_MEDIAWIKI_TALK = 9
NS_TEMPLATE = 10
NS_TEMPLATE_TALK = 11
NS_HELP = 12
NS_HELP_TALK = 13
NS_CATEGORY = 14
NS_CATEGORY_TALK = 15


class MediaWikiGlue (MySQLGlue) :
    """MySQLGlue specialized for MediaWiki databases: knows the wiki table
    prefix, page-title normalization, and the page/categorylinks schema."""

    def __init__( self, transport, graphname = None ) :
        super(MediaWikiGlue, self).__init__(transport, graphname)
        # prefix prepended to all wiki table names (e.g. "wiki_")
        self.table_prefix = ""

    def set_table_prefix ( self, prefix ) :
        """Set the prefix used by wiki_table() for all wiki table names."""
        self.table_prefix = prefix

    def get_db_key ( self, name ) :
        """Normalize a page title to its database key form: trimmed, spaces
        replaced by underscores, first letter upper-cased.

        Raises gpUsageException if name is empty/None.
        """
        if name is None or name == False:
            raise gpUsageException("name must not be empty!")

        #TODO: use native MediaWiki method if available
        name = name.strip()

        if name == "":
            raise gpUsageException("name must not be empty!")

        name = re.sub(' ', '_', name)

        result = name[0].upper() + name[1:] #FIXME: unreliable, handle unicode!

        # BUG FIX: the capitalized form was computed but 'name' (without the
        # upper-cased first letter) was returned.
        return result

    def wiki_table ( self, name ) :
        """Return the prefixed name of a wiki table."""
        return self.table_prefix + name

    def get_page_id ( self, ns, title ) :
        """Look up a page id by namespace number and title; the title is
        normalized via get_db_key()."""
        sql = "select page_id from " + self.wiki_table( "page" )
        sql += " where page_namespace = %i" % int(ns)
        sql += " and page_title = " + self.quote_string( self.get_db_key(title) )

        id = self.mysql_query_value( sql )
        return id

    def add_arcs_from_category_structure ( self ) :
        """Load the wiki's full category graph (parent -> child arcs between
        category pages) into the graph core."""
        sql = "select C.page_id as parent, P.page_id as child"
        sql += " from " + self.wiki_table( "page" ) + " as P "
        sql += " join " + self.wiki_table( "categorylinks" ) + " as X "
        sql += " on X.cl_from = P.page_id "
        sql += " join " + self.wiki_table( "page" ) + " as C "
        sql += " on C.page_namespace = %i" % NS_CATEGORY
        sql += " and C.page_title = X.cl_to "
        sql += " where P.page_namespace = %i" % NS_CATEGORY

        src = self.make_source( MySQLSelect( sql ) )

        self.add_arcs( src )
        src.close()

    def get_subcategories ( self, cat, depth, without = None, without_depth = None ) :
        """Return the titles of all subcategories of 'cat' up to 'depth',
        optionally excluding the subtree under 'without' (to 'without_depth').

        Returns 'NONE' if the category does not exist.
        """
        sink = ArraySink()

        id = self.get_page_id( NS_CATEGORY, cat )
        if not id:
            return 'NONE'

        if without: without_id = self.get_page_id( NS_CATEGORY, without )
        else: without_id = False

        # collect reachable category ids in a temporary table
        temp = self.make_temp_sink( MySQLTable('?', 'id') )

        if without_id :
            if not without_depth: without_depth = depth
            status = self.traverse_successors_without( id, depth, without_id, without_depth, temp )
        else :
            status = self.traverse_successors( id, depth, temp )

        temp.close()

        if status == 'OK' :
            # resolve the collected ids back to page titles
            sql = "select page_title "
            sql += " from " + self.wiki_table( "page" )
            sql += " join " + temp.getTable().get_name()
            sql += " on id = page_id "
            sql += " where page_namespace = %i" % NS_CATEGORY # should be redundant
            sql += " order by page_id "

            self.select_into( sql , sink)

        temp.drop()

        return sink.getData()

    @staticmethod
    def new_client_connection( graphname, host = False, port = False ) :
        """Create a MediaWikiGlue talking to a graphserv instance."""
        return MediaWikiGlue( ClientTransport(host, port), graphname ) #FIXME: PORT graphname stuff to PHP!
123 | 124 | @staticmethod 125 | def new_slave_connection( command, cwd = None, env = None ) : 126 | return MediaWikiGlue( SlaveTransport(command, cwd, env), None ) 127 | 128 | 129 | 130 | 131 | class PageSet : 132 | 133 | def __init__ ( self, glue, table = "?", id_field = "page_id", namespace_field = "page_namespace", title_field = "page_title", big = True ) : 134 | self.big = big 135 | 136 | self.glue = glue 137 | self.table = table 138 | 139 | self.id_field = id_field 140 | self.namespace_field = namespace_field 141 | self.title_field = title_field 142 | 143 | self.table_obj = MySQLTable( self.table, self.id_field, self.namespace_field, self.title_field ) 144 | self.table_obj.set_field_definition( self.id_field, "INT NOT NULL") 145 | self.table_obj.set_field_definition( self.namespace_field, "INT DEFAULT NULL") 146 | self.table_obj.set_field_definition( self.title_field, "VARCHAR(255) BINARY DEFAULT NULL") 147 | self.table_obj.add_key_definition( "PRIMARY KEY (" + self.id_field + ")" ) 148 | self.table_obj.add_key_definition( "UNIQUE KEY (" + self.namespace_field + ", " + self.title_field + ")" ) 149 | 150 | self.table_id_obj = MySQLTable( self.table, self.id_field ) 151 | self.table_id_obj.add_key_definition( "PRIMARY KEY (" + self.id_field + ")" ) 152 | 153 | 154 | def set_expect_big ( self, big ) : 155 | self.big = big 156 | 157 | 158 | def get_table ( self, ) : 159 | return self.table_obj 160 | 161 | 162 | def create_table ( self, ) : 163 | table = self.table 164 | t = "" 165 | 166 | if ( not table or table == '?' 
) : 167 | table = "gp_temp_%s" % self.glue.next_id() 168 | t = " TEMPORARY " 169 | 170 | 171 | sql = "CREATE " + t + " TABLE " + table 172 | sql += "(" 173 | sql += self.table_obj.get_field_definitions() 174 | sql += ")" 175 | 176 | self._update(sql) 177 | 178 | self.table = table 179 | self.table_obj.set_name( self.table ) 180 | self.table_id_obj.set_name( self.table ) 181 | 182 | return table 183 | 184 | 185 | 186 | def _query( self, sql, **kwargs ) : 187 | if not 'unbuffered' in kwargs: 188 | kwargs['unbuffered'] = self.big 189 | 190 | return self.glue.mysql_query(sql, **kwargs) #TODO: port kwargs to PHP 191 | 192 | def _update( self, sql, **kwargs ) : #TODO: port to PHP; use in PHP! 193 | return self.glue.mysql_update(sql, **kwargs) 194 | 195 | def add_from_select ( self, select, comment = None ) : 196 | sql= "REPLACE INTO " + self.table + " " 197 | sql += "( " 198 | sql += self.id_field + ", " 199 | sql += self.namespace_field + ", " 200 | sql += self.title_field + " ) " 201 | sql += select 202 | 203 | return self._update( sql, comment = comment ) 204 | 205 | 206 | def delete_where ( self, where, comment = None ) : 207 | sql= "DELETE FROM " + self.table + " " 208 | sql += where 209 | 210 | return self._update( sql, comment = comment ) 211 | 212 | 213 | def delete_using ( self, using, tableAlias = "T", comment = None ) : 214 | sql= "DELETE FROM " + tableAlias + " " 215 | sql += "USING " + self.table + " AS " + tableAlias + " " 216 | sql += using 217 | 218 | return self._update( sql, comment = comment ) 219 | 220 | 221 | def resolve_ids ( self, comment = None ) : 222 | #NOTE: MySQL can't perform self-joins on temp tables. so we need to copy the ids to another temp table first. 
223 | t = MySQLTable("?", "page_id") 224 | t.add_key_definition("PRIMARY KEY (page_id)") 225 | 226 | tmp = self.glue.make_temp_table( t ) 227 | 228 | sql = tmp.get_insert(True) 229 | sql += "SELECT " + self.id_field 230 | sql += " FROM " + self.table 231 | sql += " WHERE page_title IS NULL" 232 | 233 | self._update( sql ); #copy page ids with no page title into temp table 234 | 235 | sql = "SELECT P.page_id, P.page_namespace, P.page_title " 236 | sql += " FROM " + self.glue.wiki_table("page") + " AS P " 237 | sql += " JOIN " + tmp.get_name() + " AS T ON T.page_id = P.page_id" 238 | 239 | self.add_from_select( sql, comment = comment ) #TODO: port comment to PHP 240 | 241 | self.glue.drop_temp_table( tmp ) 242 | return True 243 | 244 | 245 | def make_sink ( self, ) : 246 | sink = self.glue.make_sink( self.table_obj ) 247 | return sink 248 | 249 | 250 | def make_id_sink ( self, ) : 251 | sink = self.glue.make_sink( self.table_id_obj ) 252 | return sink 253 | 254 | 255 | def make_id_source ( self, ns = None ) : 256 | return self.make_source( ns, True ) 257 | 258 | 259 | def make_source ( self, ns = None, ids_only = False, auto_order = False ) : #TODO: PORT auto_order to PHP 260 | t = self.table_id_obj if ids_only else self.table_obj 261 | 262 | if ( ns is not None ) : 263 | select = t._get_select() 264 | 265 | if ( isinstance(ns, (tuple, list, set)) ): select += " where page_namespace in " + self.glue.as_list( ns ) 266 | else: select += " where page_namespace = %i" % int(ns) 267 | 268 | t = MySQLSelect(select) 269 | 270 | 271 | src = self.glue.make_source( t, big = self.big, auto_order = auto_order ) 272 | return src 273 | 274 | 275 | def capture ( self, ns = None, data = None ) : 276 | sink = ArraySink( data ) 277 | self.copy_to_sink( ns, sink ) 278 | return sink.getData() 279 | 280 | 281 | def capture_ids ( self, ns = None, data = None ) : 282 | sink = ArraySink( data ) 283 | self.copy_ids_to_sink( ns, sink ) 284 | return sink.getData() 285 | 286 | 287 | def 
copy_to_sink ( self, ns, sink ) : 288 | src = self.make_source(ns) 289 | c = self.glue.copy(src, sink, "~") 290 | src.close() 291 | return c 292 | 293 | 294 | def copy_ids_to_sink ( self, ns, sink ) : 295 | src = self.make_id_source(ns) 296 | c = self.glue.copy(src, sink, "~") 297 | src.close() 298 | return c 299 | 300 | 301 | def add_source ( self, src ) : 302 | sink = self.make_sink() 303 | c = self.glue.copy( src, sink, "+" ) 304 | sink.close() 305 | return c 306 | 307 | 308 | def add_page_set ( self, set ) : 309 | select = set.get_table()._get_select() 310 | return self.add_from_select( select ) 311 | 312 | 313 | def subtract_page_set ( self, set ) : 314 | t = set.get_table() 315 | return self.subtract_table( t ) 316 | 317 | 318 | def subtract_source ( self, src ): #XXX: must be a 1 column id source... 319 | t = MySQLTable("?", "page_id") 320 | sink = self.glue.make_temp_sink( t ) 321 | t = sink.getTable() 322 | 323 | self.glue.copy( src, sink, "+" ) 324 | 325 | ok = self.subtract_table(t, "page_id") 326 | 327 | self.glue.drop_temp_table(t) 328 | sink.close() 329 | 330 | return ok 331 | 332 | 333 | def retain_page_set ( self, set ) : 334 | t = set.get_table() 335 | return self.retain_table( t ) 336 | 337 | 338 | def retain_source ( self, src ) : #XXX: must be a 1 column id source... 339 | t = MySQLTable("?", "page_id") 340 | sink = self.glue.make_temp_sink( t ) 341 | t = sink.getTable() 342 | 343 | self.glue.copy( src, sink, "+" ) 344 | 345 | ok = self.retain_table(t, "page_id") 346 | 347 | self.glue.drop_temp_table(t) 348 | sink.close() 349 | 350 | return ok 351 | 352 | 353 | def subtract_table ( self, table, id_field = None ) : 354 | if ( not id_field ): id_field = table.get_field1() 355 | 356 | sql = "DELETE FROM T " 357 | sql += " USING " + self.table + " AS T " 358 | sql += " JOIN " + table.get_name() + " AS R " 359 | sql += " ON T." + self.id_field + " = R." 
+ id_field 360 | 361 | self._update(sql) 362 | return True 363 | 364 | 365 | def retain_table ( self, table, id_field = None ) : 366 | if ( not id_field ): id_field = table.get_field1() 367 | 368 | sql = "DELETE FROM T " 369 | sql += " USING " + self.table + " AS T " 370 | sql += " LEFT JOIN " + table.get_name() + " AS R " 371 | sql += " ON T." + self.id_field + " = R." + id_field 372 | sql += " WHERE R." + id_field + " IS NULL" 373 | 374 | self._update(sql) 375 | return True 376 | 377 | 378 | def remove_page ( self, ns, title ) : 379 | sql = "DELETE FROM " + self.table 380 | sql += " WHERE " + self.namespace_field + " = %i" % int(ns) 381 | sql += " AND " + self.title_field + " = " + self.glue.quote_string(title) 382 | 383 | self._update(sql) 384 | return True 385 | 386 | 387 | def remove_page_id ( self, id ) : 388 | sql = "DELETE FROM " + self.table 389 | sql += " WHERE " + self.id_field + " = %i" % int(id) 390 | 391 | self._update(sql) 392 | return True 393 | 394 | 395 | def strip_namespace ( self, ns, inverse = False ) : 396 | sql = "DELETE FROM " + self.table 397 | sql += " WHERE " + self.namespace_field 398 | 399 | if ( isinstance(ns, (tuple, list, set)) ): sql += ( " not in " if inverse else " in " ) + self.glue.as_list( ns ) 400 | else: sql += ( " != " if inverse else " = " ) + str(int(ns)) 401 | 402 | self._update(sql) 403 | return True 404 | 405 | 406 | def retain_namespace ( self, ns ) : 407 | return self.strip_namespace( ns, True ) 408 | 409 | 410 | def add_page ( self, id, ns, title ) : 411 | if ( not id ): id = self.glue.get_page_id( NS_CATEGORY, cat ) 412 | 413 | values = array(id, ns, title) 414 | 415 | sql = self.table_obj.insert_command() 416 | sql += " VALUES " 417 | sql += self.glue.as_list(values) 418 | 419 | self._update( sql ) 420 | return True 421 | 422 | 423 | def add_page_id ( self, id ) : 424 | values = array(id) 425 | 426 | sql = "INSERT IGNORE INTO " + self.table 427 | sql += " ( " + self.id_field + " ) " 428 | sql += " VALUES " 429 | 
sql += self.glue.as_list(values) 430 | 431 | self._update( sql ) 432 | return True 433 | 434 | 435 | def expand_categories ( self, ns = None, comment = None ) : 436 | #NOTE: MySQL can't perform self-joins on temp tables. so we need to copy the category names to another temp table first. 437 | t = MySQLTable("?", "cat_title") 438 | t.set_field_definition("cat_title", "VARCHAR(255) BINARY NOT NULL") 439 | t.add_key_definition("PRIMARY KEY (cat_title)") 440 | 441 | tmp = self.glue.make_temp_table( t ) 442 | 443 | sql = tmp.get_insert(True) 444 | sql += " select page_title " 445 | sql += " from " + self.table + " as T " 446 | sql += " where page_namespace = %i " % NS_CATEGORY 447 | 448 | self._update( sql ) 449 | #self.glue.dump_query("select * from " +tmp.get_name()) 450 | 451 | # ---------------------------------------------------------- 452 | sql = "select P.page_id, P.page_namespace, P.page_title " 453 | sql += " from " + self.glue.wiki_table( "page" ) + " as P " 454 | sql += " join " + self.glue.wiki_table( "categorylinks" ) + " as X " 455 | sql += " on X.cl_from = P.page_id " 456 | sql += " join " + tmp.get_name() + " as T " 457 | sql += " on T.cat_title = X.cl_to " 458 | 459 | if (ns is not None) : 460 | if ( isinstance(ns, (tuple, list, set)) ): sql += " where P.page_namespace in " + self.glue.as_list( ns ) 461 | else: sql += " where P.page_namespace = %i" % int(ns) 462 | 463 | 464 | #self.glue.dump_query(sql) 465 | self.add_from_select( sql, comment = comment ) #TODO: port comment to PHP 466 | 467 | #self.glue.dump_query("select * from " +self.table) 468 | self.glue.drop_temp_table( tmp ) 469 | return True 470 | 471 | 472 | def add_subcategories ( self, cat, depth, without = None, without_depth = None ) : 473 | self._add_subcategory_ids(cat, depth, without, without_depth) 474 | self.resolve_ids() 475 | return True 476 | 477 | 478 | def _add_subcategory_ids( self, cat, depth, without = None, without_depth = None ) : 479 | id = self.glue.get_page_id( 
NS_CATEGORY, cat ) 480 | if ( not id ): return False 481 | 482 | if ( without ): without_id = self.glue.get_page_id( NS_CATEGORY, without ) 483 | else: without_id = False 484 | 485 | sink = self.make_id_sink() 486 | 487 | if ( without_id ) : 488 | if ( not without_depth ): without_depth = depth 489 | status = self.glue.traverse_successors_without( id, depth, without_id, without_depth, sink ) 490 | else : 491 | status = self.glue.traverse_successors( id, depth, sink ) 492 | 493 | 494 | sink.close() 495 | return True 496 | 497 | def get_size(self): 498 | res = self._query("SELECT COUNT(*) FROM " + self.table) 499 | try: 500 | row = res.fetchone() 501 | finally: 502 | res.close() 503 | 504 | return row[0] 505 | 506 | def add_pages_in ( self, cat, ns, depth, comment = None ) : 507 | self.get_size() 508 | 509 | if ( not self.add_subcategories(cat, depth) ): 510 | return False 511 | 512 | self.get_size() # ?! 513 | 514 | self.expand_categories(ns, comment = comment) 515 | return True 516 | 517 | 518 | def add_pages_transclusing ( self, tag, ns = None, comment = None ) : 519 | if ( ns is None ): ns = NS_TEMPLATE 520 | tag = self.glue.get_db_key( tag ) 521 | 522 | sql = " SELECT page_id, page_namespace, page_title " 523 | sql += " FROM " + self.glue.wiki_table( "page" ) 524 | sql += " JOIN " + self.glue.wiki_table( "templatelinks" ) 525 | sql += " ON tl_from = page_id " 526 | sql += " WHERE tl_namespace = %i" % int(ns) 527 | sql += " AND tl_title = " + self.glue.quote_string(tag) 528 | 529 | return self.add_from_select(sql, comment = comment) 530 | 531 | 532 | def clear ( self, ) : 533 | sql = "TRUNCATE " + self.table 534 | self._update(sql) 535 | return True 536 | 537 | 538 | def dispose ( self, ) : 539 | sql = "DROP TEMPORARY TABLE " + self.table 540 | self._update(sql) 541 | return True 542 | 543 | 544 | 545 | -------------------------------------------------------------------------------- /gp/mysql.py: 
-------------------------------------------------------------------------------- 1 | from client import * 2 | from client import __function__ 3 | 4 | import types 5 | import re 6 | import MySQLdb, MySQLdb.cursors 7 | import warnings 8 | 9 | class MySQLSource (DataSource): 10 | 11 | def __init__(self, result, table): 12 | self.result = result 13 | self.table = table 14 | 15 | 16 | def next(self): 17 | # XXX: if we knew that the order of fields in the result set is the same 18 | # as the order given in self.table, we could just use result.fetchone() 19 | 20 | raw = _fetch_dict( self.result ) 21 | 22 | if not raw: 23 | raise StopIteration() 24 | 25 | row = ( raw.get( f ) for f in self.table.get_fields() ) 26 | 27 | return tuple( row ) 28 | 29 | 30 | def close (self): 31 | self.result.close() 32 | 33 | 34 | def strip_qualifier(self, n ): 35 | return re.sub(r'^.*\.', '', n) 36 | 37 | class MySQLTable (object): 38 | 39 | def __init__(self, name, *fields): 40 | self.name = name 41 | 42 | self.field_definitions = {} 43 | self.key_definitions = [] 44 | 45 | if ( isinstance(fields[0], (tuple, list) ) ): self.fields = fields[0] 46 | else: self.fields = fields 47 | 48 | for f in self.fields: 49 | if ( not f ): raise gpUsageException( "empty field name!" 
) 50 | 51 | 52 | #for ( i = count(self.fields) -1; i >= 0; i-- ): 53 | #if ( self.fields[i] ) break 54 | 55 | 56 | #if i+1 < count(self.fields) : 57 | #self.fields = array_slice(self.fields, 0, i+1) 58 | 59 | 60 | def set_name( self, name ): 61 | self.name = name 62 | 63 | 64 | def set_fields(self, field ): 65 | self.fields = fields 66 | 67 | 68 | def set_field_definition(self, field, decl ): 69 | self.field_definitions[field] = decl 70 | 71 | 72 | def add_key_definition(self, keyDef ): 73 | self.key_definitions.append( keyDef ) 74 | 75 | 76 | def get_name(self,): 77 | return self.name 78 | 79 | 80 | 81 | def get_field(self, n ): 82 | return self.fields[ n-1 ] 83 | 84 | 85 | def get_field1(self, basename_only = False ): 86 | if ( basename_only ): return strip_qualifier( self.get_field(1) ) 87 | else: return self.get_field(1) 88 | 89 | 90 | def get_field2(self, basename_only = False ): 91 | if ( basename_only ): return strip_qualifier( self.get_field(2) ) 92 | else: return self.get_field(2) 93 | 94 | 95 | def get_fields(self,): 96 | return self.fields 97 | 98 | 99 | def get_field_list(self,): 100 | return ", ".join( self.fields ) 101 | 102 | 103 | def get_field_definitions(self,): 104 | s = "" 105 | 106 | for f in self.fields: 107 | if ( not f ): continue #XXX: should not happen! 
108 | if ( len(s) > 0 ) : s+= ", " 109 | 110 | if ( f in self.field_definitions ) : 111 | s += " %s %s " % (f, self.field_definitions[f]) 112 | else: 113 | s += f + " INT NOT NULL " 114 | 115 | 116 | for k in self.key_definitions: 117 | if ( len(s)>0 ): s+= ", " 118 | s += k 119 | 120 | 121 | return s 122 | 123 | 124 | def _get_select(self,): 125 | return "SELECT " + self.get_field_list() + " FROM " + self.get_name() 126 | 127 | 128 | def get_insert(self, ignore = False ): 129 | ig = "IGNORE" if ignore else "" 130 | return "INSERT " + ig + " INTO " + self.get_name() + " ( " + self.get_field_list() + " ) " 131 | 132 | 133 | def get_order_by(self,): 134 | return "ORDER BY %s" % self.get_field_list() 135 | 136 | 137 | 138 | 139 | class MySQLSelect (MySQLTable): 140 | 141 | def __init__(self, select): 142 | m = re.search(r'^\s*select\s+(.*?)\s+from\s+([^ ]+)(?:\s+(.*))?', select, flags = re.IGNORECASE + re.DOTALL) 143 | 144 | if m: 145 | self.select = select 146 | 147 | n = m.group(2) 148 | ff = re.split(r'\s*,\s*', m.group(1) ) 149 | 150 | for i in range(len(ff)): 151 | f = ff[i] 152 | f = re.sub(r'^.*\s+AS\s+', '', f, flags = re.IGNORECASE) # use alias if defined 153 | ff[i] = f 154 | 155 | super(MySQLSelect,self).__init__(n, ff) 156 | else: 157 | raise gpUsageException("can't parse statement: %s" % select) 158 | 159 | 160 | 161 | def _get_select(self,): 162 | return self.select 163 | 164 | 165 | def get_insert(self, ignore = False ): 166 | raise gpUsageEsxception("can't create insert statement for: %s" % self.select) 167 | 168 | 169 | 170 | class MySQLInserter (object): 171 | def __init__ ( self, glue, table ): 172 | self.glue = glue 173 | self.table = table 174 | self.fields = None 175 | 176 | def insert(self, values ): 177 | raise NotImplementedError( "`insert()' not implemented by %s" % self.__class__ ) 178 | 179 | def flush (self): 180 | pass 181 | 182 | def close (self): 183 | self.flush() 184 | 185 | 186 | 187 | class MySQLSimpleInserter (MySQLInserter): 188 | 
189 | def as_list (self, values ): 190 | return self.glue.as_list( values ) 191 | 192 | 193 | def _insert_command(self): 194 | return self.table.get_insert() 195 | 196 | 197 | def insert (self, values ): 198 | sql = self._insert_command() 199 | sql += " VALUES " 200 | sql += self.as_list(values) 201 | 202 | self.glue.mysql_update( sql ) 203 | 204 | 205 | 206 | 207 | class MySQLBufferedInserter (MySQLSimpleInserter): 208 | 209 | def __init__(self, glue, table ): 210 | super(MySQLBufferedInserter,self).__init__(glue, table) 211 | self.buffer = "" 212 | 213 | 214 | def insert (self, values ): 215 | vlist = self.as_list(values) 216 | max = self.glue.get_max_allowed_packet() 217 | 218 | if len(self.buffer)>0 and ( len(self.buffer) + len(vlist) + 2 ) >= max : 219 | self.flush() 220 | 221 | 222 | if len(self.buffer) == 0: 223 | self.buffer = self._insert_command() 224 | self.buffer += " VALUES " 225 | else: 226 | self.buffer += ", " 227 | 228 | self.buffer += vlist 229 | 230 | if len(self.buffer) >= max : 231 | self.flush() 232 | 233 | 234 | 235 | def flush (self): 236 | if len(self.buffer)>0: 237 | self.glue.mysql_update( self.buffer ) 238 | self.buffer = "" 239 | 240 | 241 | class MySQLSink (DataSink): 242 | 243 | def __init__(self, inserter ): 244 | self.inserter = inserter 245 | 246 | 247 | def putRow (self, row ): 248 | self.inserter.insert( row ) 249 | 250 | 251 | def flush (self): 252 | self.inserter.flush() 253 | 254 | 255 | def close (self): 256 | super(MySQLSink, self).close() 257 | self.inserter.close() 258 | 259 | 260 | def drop (self): 261 | raise gpUsageException("only temporary sinks can be dropped") 262 | 263 | 264 | 265 | class MySQLTempSink (MySQLSink): 266 | def __init__( self, inserter, glue, table ): 267 | super(MySQLTempSink, self).__init__(inserter) 268 | 269 | self.glue = glue 270 | self.table = table 271 | 272 | 273 | def drop (self): 274 | sql = "DROP TEMPORARY TABLE IF EXISTS %s" % self.table.get_name() 275 | 276 | ok = self.glue.mysql_update( 
sql ) 277 | return ok 278 | 279 | 280 | def getTable (self): 281 | return self.table 282 | 283 | 284 | def getTableName (self): 285 | return self.table 286 | 287 | def _fetch_dict( cursor ): 288 | try: 289 | row = cursor.fetch_dict( ) 290 | return row 291 | except AttributeError: 292 | pass 293 | 294 | r = cursor.fetchone() 295 | if r is None: return None 296 | 297 | if hasattr(r, "has_key"): 298 | return r # it's a dict! 299 | 300 | row = {} 301 | 302 | for i in range(len(cursor.description)): 303 | d = cursor.description[ i ] 304 | row[ d[0] ] = r[ i ] 305 | 306 | return row 307 | 308 | class MySQLGlue (Connection): 309 | 310 | def __init__(self, transport, graphname = None ): 311 | super(MySQLGlue, self).__init__(transport, graphname) 312 | 313 | self.connection = None 314 | 315 | self.unbuffered = False 316 | self._update_cursor = None 317 | 318 | self.temp_table_prefix = "gp_temp_" 319 | self.temp_table_db = None 320 | 321 | self.addCallHandler( self.gp_mysql_call_handler ) 322 | 323 | self.max_allowed_packet = None 324 | 325 | def set_unbuffered(self, unbuffered ): 326 | self.unbuffered = unbuffered 327 | 328 | 329 | def mysql_connect( self, server, username, password, db, port = 3306 ): 330 | #FIXME: connection charset, etc! 331 | 332 | #try: 333 | self.connection = MySQLdb.connect(host=server, user=username, passwd=password, db = db, port = port) 334 | 335 | #XXX: would be nice to wrap the exception and provide additional info. 336 | # but without exception chaining, we lose the traceback. wich is bad. 337 | #except MySQLdb.Error, e: 338 | # try: 339 | # raise gpClientException( "Failed to connect! MySQL Error %s: %s" % (e.args[0], e.args[1]) ) 340 | # except IndexError: 341 | # raise gpClientException( "Failed to connect! MySQL Error: %s" % e ) 342 | 343 | if not self.connection : 344 | raise gpClientException( "Failed to connect! (unknown error)" ) 345 | 346 | # autocommit is the default. 
It's even needed when reading, if we want to 347 | # see changes during a persistent connection. 348 | self.mysql_autocommit(True) 349 | 350 | return True 351 | 352 | def mysql_unbuffered_query( self, sql, **kwargs ): #TODO: port kwargs to PHP 353 | return self.mysql_query( sql, unbuffered = True, **kwargs ) 354 | 355 | def mysql_update( self, sql, **kwargs ): #TODO: port to PHP; use in PHP! 356 | if 'cursor' not in kwargs or not kwargs['cursor']: 357 | if not self.update_cursor: 358 | self._update_cursor = MySQLdb.cursors.SSCursor(self.connection) 359 | 360 | kwargs['cursor'] = self._update_cursor 361 | 362 | self.mysql_query( sql, unbuffered = True, dict_rows = False, **kwargs ) 363 | 364 | return self.connection.affected_rows() 365 | 366 | def inject_query_markers( self, sql, *markers ): #TODO: port markers to PHP 367 | if markers: 368 | for m in markers: 369 | if not m: #handle explicit None, etc 370 | continue 371 | 372 | sql = re.sub( '^\s*(select|update|replace|insert|delete)\s+', '\\1 /* '+m+' */ ', sql, flags = re.IGNORECASE | re.DOTALL ) 373 | 374 | return sql 375 | 376 | def mysql_query( self, sql, unbuffered = None, dict_rows = False, cursor = None, comment = None ): #TODO: port markers to PHP 377 | if unbuffered is None: 378 | unbuffered = self.unbuffered 379 | 380 | sql = self.inject_query_markers(sql, comment) 381 | 382 | if cursor: 383 | using_new_cursor = False 384 | else: 385 | using_new_cursor = True 386 | 387 | if unbuffered: 388 | if dict_rows: 389 | # no buffering, returns dicts 390 | cursor = MySQLdb.cursors.SSDictCursor(self.connection) # TESTME 391 | else: 392 | # no buffering, returns tuples 393 | cursor = MySQLdb.cursors.SSCursor(self.connection) # TESTME 394 | else: 395 | if dict_rows: 396 | # buffers result, returns dicts 397 | cursor = MySQLdb.cursors.DictCursor(self.connection) # TESTME 398 | else: 399 | # default: buffered tuples 400 | cursor = MySQLdb.cursors.Cursor(self.connection) 401 | 402 | with warnings.catch_warnings(): 403 | 
#ignore MySQL warnings. use cursor.nfo() to get them. 404 | warnings.simplefilter("ignore") 405 | 406 | try: 407 | cursor.execute( sql ) 408 | except: 409 | if using_new_cursor: 410 | cursor.close() #NOTE: *always* close the cursor if an exception ocurred. 411 | raise 412 | 413 | if not dict_rows: 414 | # HACK: glue a fetch_dict method to a cursor that natively returns sequences from fetchone() 415 | # FIXME: if we do this, we for some reason retain a reference to the cursor forever! 416 | # 417 | #m = types.MethodType(_fetch_dict, cursor, cursor.__class__) 418 | #setattr(cursor, "fetch_dict", m) 419 | pass 420 | else: 421 | # make fetch_dict an alias for fetchone 422 | cursor.fetch_dict = cursor.fetchone # TESTME 423 | 424 | return cursor 425 | 426 | #XXX: would be nice to wrap the exception and provide additional info. 427 | # but without exception chaining, we lose the traceback. wich is bad. 428 | #except MySQLdb.Error as e: 429 | #q = sql.replace('/\s+/', ' ') 430 | #if ( len(q) > 255 ): q = q[:252] + '...' 431 | 432 | #try: 433 | # raise gpClientException( "Query failed! MySQL Error %s: %s\nQuery was: %s" % (e.args[0], e.args[1], q) ) 434 | #except IndexError: 435 | # raise gpClientException( "Query failed! 
MySQL Error: %s\nQuery was: %s" % (e, q) ) 436 | 437 | 438 | def set_mysql_connection(self, connection ): 439 | self.connection = connection 440 | 441 | 442 | def gp_mysql_call_handler( self, gp, params ): 443 | # params: cmd, args, source, sink, capture, result 444 | 445 | cmd = params['command'] 446 | args = params['arguments'] 447 | source = params['source'] 448 | sink = params['sink'] 449 | capture = params['capture'] 450 | result = params['result'] 451 | 452 | m = re.search( r'-(from|into)$', cmd ) 453 | 454 | if m: 455 | cmd = re.sub(r'-(from|into)?$', '', cmd) 456 | action = m.group(1) 457 | 458 | c = len(args) 459 | if not c : 460 | raise gpUsageException("expected last argument to be a table spec; args: %s" % (args, )) 461 | 462 | 463 | t = args[c-1] 464 | args = args[0:c-1] 465 | 466 | if isinstance(t, (str, unicode)) : 467 | if ( re.search( r'^.*select\s+', t, flags = re.IGNORECASE) ): 468 | t = MySQLSelect(t) 469 | else: 470 | t = re.split( r'\s+|\s*,\s*', t ) 471 | 472 | 473 | if ( isinstance(t, (list, tuple)) ): t = MySQLTable( t[0], t[1:] ) 474 | if ( not isinstance(t, MySQLTable) ): raise gpUsageException("expected last argument to be a table spec; found %s" % get_class(t)) 475 | 476 | if action == 'into' : 477 | if ( not t.get_name() or t.get_name() == "?" 
): sink = self.make_temp_sink( t ) 478 | else: sink = self.make_sink( t ) 479 | 480 | result = sink #XXX: quite useless, but consistent with -from 481 | else: 482 | source = self.make_source( t ) 483 | 484 | result = source #XXX: a bit confusing, and only useful for temp sinks 485 | 486 | params['command'] = cmd 487 | params['arguments'] = args 488 | params['source'] = source 489 | params['sink'] = sink 490 | params['capture'] = capture 491 | params['result'] = result 492 | 493 | return True 494 | 495 | 496 | def __make_mysql_closure( self, name ): 497 | rc = False 498 | 499 | def call_mysql( *args ): 500 | if not self.connection: 501 | raise gpUsageException( "not connected to mysql, can't run mysql function %s" % (name,) ) 502 | 503 | if not hasattr(self.connection, name): 504 | raise gpUsageException( "unknown mysql function: %s, not in %s" % (name, self.connection.__class__.__name__) ) 505 | 506 | f = getattr(self.connection, name) 507 | 508 | #try: 509 | res = f( *args ) # note: f is bound to self.connection 510 | return res 511 | 512 | #XXX: would be nice to wrap the exception and provide additional info. 513 | # but without exception chaining, we lose the traceback. wich is bad. 514 | #except MySQLdb.Error, e: 515 | #try: 516 | # raise gpClientException( "MySQL %s failed! Error %s: %s" % (name, e.args[0], e.args[1]) ) 517 | #except IndexError: 518 | # raise gpClientException( "MySQL %s failed! Error: %s" % (name, e) ) 519 | 520 | return call_mysql 521 | 522 | def __getattr__( self, name ): 523 | if name.startswith('mysql_'): 524 | f = self.__make_mysql_closure(name[6:]) 525 | 526 | setattr(self, name, f) #re-use closure! 
527 | 528 | return f 529 | else: 530 | return super(MySQLGlue, self).__getattr__(name) 531 | 532 | def quote_string (self, s ): #TODO: charset 533 | if type(s) not in (str, unicode): 534 | s = "%s" % s 535 | 536 | return "'" + self.connection.escape_string( s ) + "'" 537 | 538 | def as_list (self, values ): 539 | sql = "(" 540 | 541 | first = True 542 | for v in values: 543 | if ( not first ): sql += "," 544 | else: first = False 545 | 546 | t = type(v) 547 | if ( v is None ): sql+= "None" 548 | elif ( t == int ): sql+= "%i" % v 549 | elif ( t == float ): sql+= "%d" % v 550 | elif ( t == str or t == unicode ): sql+= self.quote_string(v) #TODO: charset... 551 | else: raise gpUsageException("bad value type: %s" % gettype(v)) 552 | 553 | 554 | sql += ")" 555 | 556 | return sql 557 | 558 | id = 1 559 | 560 | def next_id (self): 561 | MySQLGlue.id += 1 562 | return MySQLGlue.id 563 | 564 | def drop_temp_table (self, spec ): 565 | sql = "DROP TEMPORARY TABLE %s" % spec.get_name() 566 | self.mysql_update(sql) 567 | 568 | 569 | def make_temp_table (self, spec ): 570 | table = spec.get_name() 571 | 572 | if ( not table or table == '?' ): 573 | table = "%s%d" % (self.temp_table_prefix, self.next_id()) 574 | 575 | if self.temp_table_db: 576 | table = "%s.%s" % (self.temp_table_db, table); 577 | 578 | sql = "CREATE TEMPORARY TABLE %s" % table 579 | sql += "(" 580 | sql += spec.get_field_definitions() 581 | sql += ")" 582 | 583 | self.mysql_update(sql) 584 | 585 | return MySQLTable(table, spec.get_fields()) 586 | 587 | def mysql_select_db ( self, db ): 588 | #NOTE: the native select_db "sometimes" triggers an InterfaceError. 589 | # This is a strange issue with MySQLdb 590 | 591 | sql = "USE %s" % re.sub('[^\w]', '', db) #TODO: apply real identifier quoting! 
592 | 593 | self.mysql_update( sql ) 594 | 595 | def mysql_query_value (self, sql, **kwargs ): 596 | r = self.mysql_query_record( sql, **kwargs ) #TODO: port kwargs to PHP 597 | 598 | if not r: return None 599 | else: return r[0] 600 | 601 | def mysql_query_record (self, sql, **kwargs ): 602 | cursor = self.mysql_query( sql, unbuffered = True, dict_rows = False, **kwargs ) #TODO: port kwargs to PHP 603 | 604 | try: 605 | a = cursor.fetchone() 606 | finally: 607 | cursor.close() 608 | 609 | if ( not a ): return None 610 | else: return a 611 | 612 | def set_max_allowed_packet (self, size ): 613 | self.max_allowed_packet = size 614 | 615 | def get_max_allowed_packet (self): 616 | if self.max_allowed_packet is None: 617 | self.max_allowed_packet = self.mysql_query_value("select @@max_allowed_packet") 618 | 619 | if self.max_allowed_packet is None: 620 | self.max_allowed_packet = 16 * 1024 * 1024 #fall back to MySQL's default of 16MB 621 | 622 | return self.max_allowed_packet 623 | 624 | 625 | def select_into (self, query, sink, **kwargs ): #TODO: port kwargs to PHP 626 | if isinstance(query, (str, unicode)) : 627 | table = MySQLSelect( query ) 628 | sql = query 629 | else: 630 | table = query 631 | sql = src._get_select() 632 | 633 | 634 | res = self.mysql_query( sql, **kwargs ) 635 | src = MySQLSource( res, table ) 636 | 637 | c = self.copy( src, sink, '+' ) 638 | src.close() 639 | 640 | return c 641 | 642 | 643 | def _new_inserter(self, table ): 644 | return MySQLBufferedInserter( self, table ) 645 | 646 | 647 | def make_temp_sink (self, table ): 648 | table = self.make_temp_table(table) 649 | 650 | ins = self._new_inserter(table) 651 | sink = MySQLTempSink( ins, self, table ) 652 | 653 | return sink 654 | 655 | 656 | def make_sink (self, table ): 657 | inserter = self._new_inserter(table) 658 | sink = MySQLSink( inserter ) 659 | 660 | return sink 661 | 662 | 663 | def make_source (self, table, big = False, auto_order = False, **kwargs ): #TODO: PORT auto_order to 
PHP 664 | sql = table._get_select() 665 | 666 | if auto_order and not re.search(r'\s+ORDER\s+BY\s+', sql, flags = re.IGNORECASE | re.DOTALL ) : #TODO: PORT auto_order to PHP 667 | sql += ' ' + table.get_order_by() 668 | 669 | if not 'unbuffered' in kwargs: 670 | kwargs['unbuffered'] = big 671 | 672 | res = self.mysql_query(sql, **kwargs) #TODO: port kwargs to PHP 673 | 674 | src = MySQLSource( res, table ) 675 | return src 676 | 677 | 678 | def query_to_file (self, query, file, remote = False, **kwargs ): 679 | r = "" if remote else "LOCAL" #TESTME 680 | 681 | query += " INTO %s DATA OUTFILE " % r #TESTME 682 | query += self.quote_string(file) 683 | 684 | cursor = self.mysql_query(query, **kwargs) #TODO: port kwargs to PHP 685 | cursor.close() 686 | 687 | return self.connection.affected_rows() 688 | 689 | 690 | def insert_from_file (self, table, file, remote = False, **kwargs ): 691 | r = "" if remote else "LOCAL" #TESTME 692 | 693 | query = "" 694 | query += " LOAD %s DATA INFILE " % r #TESTME 695 | query += self.quote_string(file) 696 | query += " INTO TABLE %s " % table 697 | 698 | cursor = self.mysql_query(query, **kwargs) #TODO: port kwargs to PHP 699 | cursor.close() 700 | 701 | return self.connection.affected_rows() 702 | 703 | 704 | def close(self): 705 | if self._update_cursor: 706 | try: 707 | self._update_cursor.close() 708 | except Exception as e: 709 | self._trace(__function__(), "failed to close mysql cursor: %s" % e) 710 | #XXX: do we really not care? can we go on? could there have been a commit pending? 711 | 712 | if self.connection: 713 | try: 714 | self._trace(__function__(), "closing mysql connection") 715 | self.mysql_close() 716 | except Exception as e: 717 | self._trace(__function__(), "failed to close mysql connection: %s" % e) 718 | #XXX: do we really not care? can we go on? could there have been a commit pending? 
719 | 720 | return super(MySQLGlue, self).close() 721 | 722 | 723 | @staticmethod 724 | def new_client_connection(graphname, host = False, port = False ): 725 | return MySQLGlue( ClientTransport(host, port), graphname ) #FIXME: PORT graphname stuff to PHP! 726 | 727 | 728 | @staticmethod 729 | def new_slave_connection(command, cwd = None, env = None ): 730 | return MySQLGlue( SlaveTransport(command, cwd, env), None ) 731 | 732 | 733 | def dump_query (self, sql ): 734 | print "*** %s ***" % sql 735 | 736 | res = self.mysql_query( sql ) 737 | if ( not res ): return False 738 | 739 | c = self.dump_result( res ) 740 | res.close() 741 | 742 | return c 743 | 744 | 745 | def dump_result (self, res ): 746 | keys = None 747 | c = 0 748 | 749 | print "" 750 | while True: 751 | row = _fetch_dict(res) 752 | if not row: break 753 | 754 | if keys is None : 755 | s = "" 756 | for k in row.keys(): 757 | s += k 758 | s += "\t" 759 | 760 | 761 | print s 762 | 763 | s = "" 764 | for v in row: 765 | s += v 766 | s += "\t" 767 | 768 | print s 769 | c += 1 770 | 771 | 772 | print "-----------------------------" 773 | print "%i rows" % c 774 | 775 | return c 776 | 777 | 778 | -------------------------------------------------------------------------------- /gp/client.py: -------------------------------------------------------------------------------- 1 | """Graph Processor Client Library by Daniel Kinzler 2 | Translated from PHP to Python by Philipp Zedler 3 | Copyright (c) 2011 by Wikimedia Deutschland e.V. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 
10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of Wikimedia Deutschland nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY WIKIMEDIA DEUTSCHLAND ''AS IS'' AND ANY 18 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL WIKIMEDIA DEUTSCHLAND BE LIABLE FOR ANY 21 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | 29 | NOTE: This software is not released as a product. It was written primarily for 30 | Wikimedia Deutschland's own use, and is made public as is, in the hope it may 31 | be useful. Wikimedia Deutschland may at any time discontinue developing or 32 | supporting this software. There is no guarantee any new versions or even fixes 33 | for security issues will be released. 34 | 35 | This version of the Graph Processor Client Library 36 | is a Python interface to a GraphServ or GraphCore instance. 37 | 38 | @author Daniel Kinzler 39 | @author Philipp Zedler (translation) 40 | @copyright 2011, Wikimedia Deutschland 41 | 42 | @package WikiTalk #? Stimmt das? 
43 | 44 | """ 45 | 46 | import re 47 | import os 48 | import socket 49 | import time 50 | import subprocess 51 | import inspect 52 | import types 53 | 54 | LINEBREAK = "\r\n" 55 | """Linebreak to use when talking to GraphServ or GraphCore instances. 56 | This is \\r\\n per spec. but \\n alone should also work.""" 57 | 58 | PORT = 6666 59 | """Default GraphServ port""" 60 | 61 | CLIENT_PROTOCOL_VERSION = 4 62 | """Implemented GraphServ protocol version. May be used to determin which 63 | features are supported. Is not used to validate the peer's protocol version, 64 | see MIN_PROTOCOL_VERSION and MAX_PROTOCOL_VERSION for that.""" 65 | 66 | MIN_PROTOCOL_VERSION = 2.0 67 | """Minimum GraphServ protocol version. If GraphServ (resp. GraphCore) 68 | reports a lower protocol version, the connection will be aborted.""" 69 | 70 | MAX_PROTOCOL_VERSION = 4.99 71 | """Maximum GraphServ protocol version. If GraphServ (resp. GraphCore) 72 | reports a higher protocol version, the connection will be aborted.""" 73 | 74 | 75 | def __function__ (shift = 1): #XXX: wtf? 76 | caller = inspect.stack()[shift] 77 | return caller[3] 78 | 79 | class gpException(Exception): 80 | """Base class for exceptions in this module.""" 81 | def __init__(self, msg): 82 | """ 83 | @param msg: the message to be displayed in case of an exception 84 | """ 85 | self.msg = msg 86 | 87 | def __str__(self): 88 | """Show type and name in the error message.""" 89 | return type(self).__name__ + ": " + self.msg 90 | 91 | def getMessage(self): 92 | """Returns the error message.""" 93 | return self.msg 94 | 95 | 96 | class gpProcessorException(gpException): 97 | """Exceptions for errors reported by the remote grap database""" 98 | def __init__(self, status, msg, command=False ): 99 | if command: 100 | msg = msg + " Command was %s" % command 101 | gpException.__init__(self, msg) 102 | self.command = command 103 | self.status = status 104 | #? self.status wird nirgendwo ausgegeben. 
105 | 106 | 107 | class gpProtocolException(gpException): 108 | """Exception for the communication with the remote graph database.""" 109 | pass 110 | 111 | 112 | class gpClientException( gpException ): 113 | """Exception when gpClient encounters a problem on the client side.""" 114 | pass 115 | 116 | 117 | class gpUsageException( gpClientException ): 118 | """Exception raised when gpClient is used incorrectly.""" 119 | pass 120 | 121 | 122 | 123 | 124 | 125 | class DataSource (object): 126 | """Represents an interator of rows in a tabular data set. 127 | 128 | Data sources are used in the gpClient framework to represent origin of 129 | a data transfer. Typically, a data source is used to provide data to a 130 | GraphCore command, such as add_arcs. Derived classes must implement the 131 | next() method, which returns one row of data after another. 132 | 133 | """ 134 | def __iter__(self): 135 | """Return the iterator object. Required for the iterator protocol.""" 136 | return self 137 | 138 | def next( self ): 139 | """Returns the next row. 140 | 141 | The row is represented as an indexed array. Successive calls on the 142 | same data source should return rows of the same size, with the same 143 | array keys. 144 | @return an array representing the next row. 145 | 146 | """ 147 | raise NotImplementedError( "`next()' not implemented by %s" % self.__class__ ) 148 | 149 | def close( self ): 150 | """ Close the data source and free resources allocated by this object. 151 | 152 | close() should always be called when a data source is no longer 153 | needed, usually on the same level as the data source object was 154 | created. After close() has been called on a data source object, 155 | the behavior of calling next() on that object is undefined. 156 | 157 | """ 158 | pass # noop 159 | 160 | def drain( self ): #TODO: PORT TO PHP 161 | """ Drains the source and returns all data rows as an array. 
162 | """ 163 | 164 | data = [] 165 | 166 | for r in self: 167 | data.append(r) 168 | 169 | self.close() 170 | return data 171 | 172 | 173 | class NullSource( DataSource ): 174 | """An empty data source.""" 175 | 176 | def next( self ): 177 | """Stop the iteration.""" 178 | raise StopIteration() 179 | 180 | instance = None 181 | """kind of singleton instance of NullSource""" 182 | 183 | NullSource.instance = NullSource() 184 | """A global variable of the module containing a NullSource instance. 185 | 186 | It hould be used in order to avoid the costs of crating new instaces 187 | which are not necessary for this class. 188 | 189 | """ 190 | 191 | 192 | class ArraySource( DataSource ): 193 | """A data source that iterates over an array. 194 | 195 | This is useful to use programmatically generated data as the input 196 | to some GraphCore command. 197 | 198 | The ArraySource maintains a current index pointing into the data 199 | array. Every call to next() increments that index to the next row. 200 | 201 | """ 202 | 203 | def __init__( self, data ): 204 | """ Initializes a ArraySource from the table contained in data. 205 | 206 | @param data: a list of lists or tuples, each representing a 207 | row in the data source. If the list contains integers 208 | or strings, they are wrapped and returned as one-tuples. 209 | 210 | """ 211 | self.data = data 212 | self.data_length = len(data) 213 | self.index = 0 214 | 215 | def next( self ): 216 | """Return the next row of the list provided to the constructor.""" 217 | if self.index < self.data_length: 218 | row = self.data[self.index] 219 | self.index = self.index + 1 220 | 221 | if not isinstance(row, (list,tuple)): 222 | if not isinstance(row, (str, unicode, int, long)): 223 | raise gpUsageException("data must consist of strings or integers") 224 | 225 | row = (row, ) 226 | return row 227 | else: 228 | raise StopIteration() 229 | 230 | def makeSink(self): 231 | """Returns a new instance of ArraySink. 
232 | 233 | The sink can be used to write to and to fill the data list of 234 | this ArraySource. 235 | 236 | """ 237 | return ArraySink(self.data) 238 | 239 | 240 | class LimitedSource( DataSource ): #TODO: PORT to PHP 241 | """A data source that wraps another data source to limit the number 242 | of rows returned from it. 243 | 244 | This is useful to limit the number of arcs transmitted graphserv in 245 | a single command. 246 | """ 247 | 248 | def __init__( self, src, limit ): 249 | """ Initializes a LimitedSource using the given original data source. 250 | 251 | @param src: a DataSource object 252 | @param limit: the number of rows to return. 253 | """ 254 | 255 | self.source = src 256 | self.limit = limit 257 | self.index = 0 258 | 259 | def next( self ): 260 | 261 | """Return the next row of the DataSource provided to the constructor.""" 262 | 263 | if self.index < self.limit: 264 | row = self.source.next() 265 | self.index = self.index + 1 266 | 267 | return row 268 | else: 269 | raise StopIteration() 270 | 271 | def limit_reached( self ): 272 | """ returns True if next() has already been called sucessfully as many times 273 | as allowed by the limit parameter passed to the constructor. After 274 | iterating over this LimitedSource instance (i.e. after StopIteration() 275 | has been thrown by next()), this method may be used to determine 276 | whether there may be more data in the original data source. If 277 | iteration was terminated but limit_reached() returns false, then the 278 | original source was depleted and there is no more data available from it. 279 | """ 280 | 281 | return ( self.index >= self.limit ) 282 | 283 | class PipeSource( DataSource ): 284 | """Data source based on a file handle. 285 | 286 | Each line read from the file handle is interpreted as (and converted 287 | to) a data row. 288 | Note: calling close() on a PipeSource does *not* close the 289 | underlying file handle. 
The idea is that the handle should be closed 290 | by the same code that also opened the file. 291 | 292 | """ 293 | def __init__( self, hin ): 294 | """Initializes a new PipeSource 295 | 296 | @param resource hin a handle of an open file that allows read 297 | access, as returned by file() or ... #? translate fsockopen()! 298 | 299 | """ 300 | self.hin = hin 301 | 302 | def next(self): 303 | """Returns the next line from the file handle (using readline). 304 | 305 | The line is split using Connection.splitRow() and the result is 306 | returned as the next row. 307 | 308 | @return array the next data row, extracted from the next line 309 | read from the file handle. 310 | 311 | """ 312 | s = self.hin.readline() 313 | s = s.strip() 314 | if s: 315 | row = Connection.splitRow( s ) 316 | return row 317 | else: 318 | raise StopIteration() 319 | 320 | 321 | class FileSource( PipeSource ): 322 | """Data source based on reading from a file. 323 | 324 | Extends PipeSource to handle an actual local file. 325 | 326 | Note: calling close() on a FileSource *does* close the 327 | underlying file handle. The idea is that the handle should be closed 328 | by the same code that also opened the file. 329 | 330 | """ 331 | def __init__(self, path, mode='r'): 332 | """Creates a data source for reading from the given file. 333 | 334 | The file is opened using file(path, mode). 335 | 336 | @param string path the path of the file to read from 337 | @param string mode (default: 'r') the mode with which the file 338 | should be opened. 339 | @throws gpClientException if file() failed to open the file 340 | given by path. 
341 | 342 | """ 343 | self.mode = mode 344 | self.path = path 345 | 346 | try: 347 | handle = file( self.path, self.mode ) 348 | except IOError: 349 | raise gpClientException( "failed to open " + self.path ) 350 | PipeSource.__init__(self, handle) 351 | 352 | def close(self): 353 | """Close the file handle.""" 354 | self.hin.close() 355 | 356 | 357 | 358 | 359 | 360 | class DataSink(object): #abstract 361 | """Abstract base class for "data sinks". 362 | 363 | The gpClient framework uses data sink objects to represent the 364 | endpoint of a data transfer. That is, a data sink accepts one row 365 | of tabular data after another, and handles them in some way. 366 | How the row is processed is specific to the concrete implementation. 367 | 368 | """ 369 | def putRow(self, row): 370 | raise NotImplementedError( "`putRow()' called in abstract class" ) 371 | 372 | def flush(self): 373 | """Write buffered data. 374 | 375 | In case any output has been buffered (or some other kind of 376 | action has been deferred), it should be written now (resp. 377 | deferred actions should be performed and made permanent). 378 | 379 | The default implementation of this method does nothing. Any 380 | subclass that applies any kind of buffereing to the output 381 | should override it to make all pending changes permanent. 382 | 383 | """ 384 | 385 | pass 386 | 387 | def close(self): 388 | """Close this data output and releases allocated resources. 389 | 390 | The behavior of calls to putRow() is undefined after close() 391 | was called on the same object. 392 | 393 | The default implementation of this method calls flush(). Any 394 | subclass that allocates any external resources should override 395 | this method to release those resources. 
396 | 397 | """ 398 | 399 | self.flush() 400 | 401 | 402 | class NullSink( DataSink ): 403 | """A data sink that simply ignores all incoming data.""" 404 | 405 | def putRow(self, row): 406 | pass 407 | 408 | def flush(self): 409 | pass 410 | 411 | instance = None 412 | 413 | NullSink.instance = NullSink() 414 | """A global variable of the module containing a NullSink instance. 415 | 416 | It hould be used in order to avoid the costs of crating new instaces 417 | which are not necessary for this class. 418 | 419 | """ 420 | 421 | class ArraySink(DataSink): 422 | """A data sink that appends each row to a data array. 423 | 424 | This is typically used to make the data returned from a GraphCore 425 | command available for programmatic processing. It should however not 426 | be used in situations where large amounts of data are expected to be 427 | returned. 428 | 429 | """ 430 | def __init__(self, data=None): #? data war Zeiger! 431 | """Initializes a new ArraySink. 432 | 433 | @param array data (optional) an array the rows 434 | should be appended to. If not given, a new array will be 435 | created, and can be accessed using the getData() method. 436 | 437 | """ 438 | 439 | if data is None: 440 | #NOTE: don't use [] as a default param, otherwise we'll be 441 | # using the same list instance for all calls! 442 | data = [] 443 | 444 | self.data = data 445 | 446 | def putRow(self, row): 447 | """Appends the given row to the table maintained by this ArraySink. 448 | 449 | The data can be accessed using the getData() method. 450 | 451 | """ 452 | 453 | self.data.append(row) 454 | 455 | def getData(self): 456 | """Returns the array that contains this ArraySink's tabular data. 457 | 458 | This method is typically used to access the data collected by this 459 | data sink. 460 | 461 | """ 462 | return self.data 463 | 464 | def makeSource(self): 465 | """Return a new instance of ArraySource. 
466 | 467 | It may be used to read the rows from the array of tabular data 468 | maintained by this ArraySink. 469 | 470 | """ 471 | return ArraySource(self.data) 472 | 473 | def getMap(self): 474 | """Return the maintained tabular data as an associative array. 475 | 476 | This only works for two column data, where each column is 477 | interpreted as a pair of key and value. The first column is 478 | used as the key and the second column is used as the value. 479 | 480 | @rtype: dictionary 481 | @return: an associative array created under the assumption 482 | that the tabular data in this ArraySink consists of 483 | key value pairs. 484 | 485 | """ 486 | return pairs2map(self.data) 487 | 488 | 489 | class PipeSink (DataSink): 490 | """Data sink based on a file handle. 491 | 492 | Each data row is written as a line to the file handle. 493 | 494 | Note: calling close() on a PipeSink does *not* close the 495 | underlying file handle. The idea is that the handle should be closed 496 | by the same code that also opened the file. 497 | 498 | """ 499 | 500 | def __init__(self, hout, linebreak=None): 501 | """Initializes a new pipe sink with the given file handle. 502 | 503 | @param resource $hout a file handle that can be written to, such as 504 | returned by fopen or fsockopen. 505 | @param string $linebreak character(s) to use to separate rows in the 506 | output (default: LINEBREAK) 507 | 508 | """ 509 | if not linebreak: 510 | linebreak = LINEBREAK 511 | self.hout = hout 512 | self.linebreak = linebreak 513 | 514 | def putRow(self, row): 515 | """Writes the given data row to the file handle. 516 | 517 | Connection.joinRow() is used to encode the data row into a 518 | line of text. PipeTransport.send_to() is used to write the 519 | line to the file handle. 520 | 521 | Note that the rows passed to successive calls to putRow() should 522 | have the same number of fields and use the same array keys. 
523 | 524 | @type row: list/tuple of int/str types 525 | @param row: representation of a data row. 526 | 527 | """ 528 | s = Connection.joinRow(row) 529 | PipeTransport.send_to(self.hout, s + self.linebreak) 530 | 531 | def flush(self): 532 | """Flushes any pending data on the file handle (using fflush).""" 533 | self.hout.flush() 534 | 535 | 536 | class FileSink (PipeSink): 537 | """Data sink based on writing to a file. 538 | 539 | Extends PipeSink to handle an actual local file. 540 | 541 | Note: calling close() on a FileSink *does* close the underlying 542 | file handle. The idea is that the handle should be closed by the 543 | same code that also opened the file. 544 | 545 | """ 546 | 547 | def __init__(self, path, append=False, linebreak=None): 548 | """Creates a new FileSink around the given file. 549 | 550 | The file given by path is opened using file(). 551 | 552 | @param string path the path to the local file to write to. 553 | @param boolean append whether to append to the file, or override it 554 | @param string linebreak character(s) to use to separate lines in the 555 | resulting file (default: os.linesep) 556 | @throws gpClientException if the file could not be opened. 557 | 558 | """ 559 | if append == True: 560 | self.mode = 'a' 561 | elif append == False: 562 | self.mode = 'w' 563 | else: 564 | self.mode = append 565 | if not linebreak: 566 | linebreak = os.linesep 567 | self.path = path 568 | try: 569 | h = file(self.path, self.mode) 570 | except Error: 571 | raise gpClientException( "failed to open %s" % self.path ) 572 | PipeSink.__init__(self, h, linebreak ) 573 | 574 | def close(self): 575 | """closes the file handle (after flushing it).""" 576 | PipeSink.close(self) 577 | self.hout.close() 578 | 579 | 580 | 581 | 582 | 583 | class Transport(object): # abstract 584 | """Abstract base class of all transports used by the gpClient framework. 585 | 586 | A transport abstracts the way the framework communicates with the 587 | remote peer (i.e. 
the instance of GraphServ resp. GrahCore). It 588 | also implements to logic to connect to the remote instance. 589 | 590 | """ 591 | 592 | def __init__(self, *otherArguments): 593 | """The constructor.""" 594 | self.closed = False 595 | self.debug = False 596 | self._eof = False 597 | 598 | def trace(self, context, msg, obj='nothing878423really'): 599 | """Trace an error.""" 600 | if ( self.debug ): 601 | if obj != 'nothing878423really': 602 | msg = msg + ': ' + re.sub('\s+', ' ', str(obj)) 603 | 604 | print "[Transport] %s: %s" % (context, msg) 605 | 606 | def isClosed(self): 607 | """Return True if this Transport is closed, e.g. with close()""" 608 | return self.closed 609 | 610 | def close(self): 611 | """Closes this Transport 612 | 613 | Disconnect from the peer and free any resources that this object 614 | may have allocated. After close() has been called, isClosed() 615 | must always return True when called on the same object. 616 | The default implementation just marks this object as closed. 617 | 618 | """ 619 | self.closed = True 620 | 621 | def connect(self): 622 | """Connects this gptransport to its peer. 623 | 624 | Its peer is the remote instance of GraphServ resp. graphCore. 625 | The information required to connect is typically provided to the 626 | constructor of the respective subclass. 627 | 628 | """ 629 | raise NotImplementedError("`connect()' not implemented by %s" % self.__class__) 630 | 631 | def send(self, s): 632 | """Sends a string to the peer. 633 | 634 | This is the an operation of the line based communication protocol. 635 | 636 | """ 637 | raise NotImplementedError("`send()' not implemented by %s" % self.__class__) 638 | 639 | def receive(self): 640 | """Receives a string from the peer. 641 | 642 | This is the a operation of the line based communication protocol. 
643 | 644 | """ 645 | raise NotImplementedError("`receive()' not implemented by %s" % self.__class__) 646 | 647 | def eof(self): 648 | """True after detection of end of data stream from the peer""" 649 | return self._eof 650 | 651 | def make_source(self): 652 | """Creates an instance of DataSource 653 | 654 | for reading data from the current position in the data stream 655 | coming from the peer. 656 | 657 | """ 658 | raise NotImplementedError("`make_source()' not implemented by %s" % self.__class__) 659 | 660 | def make_sink(self): 661 | """Create an instance of DataSink 662 | 663 | for writing data to the data stream going to the peer. 664 | 665 | """ 666 | raise NotImplementedError("`make_sink()' not implemented by %s" % self.__class__) 667 | 668 | def checkPeer(self): 669 | """Attempts to check if the peer is still responding. 670 | 671 | A static function. 672 | The default implementation does nothing. 673 | 674 | """ 675 | pass # noop 676 | 677 | def setDebug(self,debug): 678 | """Sets the debug mode on this transport object. 679 | 680 | When debugging is enabled, details about all data send or 681 | received is deumpted to stdout. 682 | 683 | """ 684 | self.debug = debug 685 | 686 | 687 | class PipeTransport(Transport): # abstract 688 | """Abstract base for file handle based implementations of Transport.""" 689 | 690 | def __init__(self): 691 | self.hout = None 692 | self.hin = None 693 | 694 | self.out_chunk_size = None 695 | 696 | Transport.__init__(self) 697 | 698 | @staticmethod 699 | def send_to(hout, s, chunk_size = None): 700 | """Utility function for sending data to a file handle. 701 | 702 | This is essentially a wrapper around file.write(), which makes sure 703 | that s is written in its entirety. After s was written out using 704 | file.write(), file.flush() is called to commit all data to the peer. 
705 | 706 | @param resource hout: the file handle to write to 707 | @param string s: the data to write 708 | @raise gpProtocolException if writing fails. 709 | 710 | """ 711 | try: 712 | if chunk_size: # write small chunks 713 | i = 0 714 | while i <= len(s): 715 | hout.write(s[i:i+chunk_size]) 716 | hout.flush() # try to write the buffer 717 | i += chunk_size 718 | else: # write all at once 719 | hout.write(s) 720 | hout.flush() # try to write the buffer 721 | except IOError: 722 | raise gpClientException( 723 | "failed to send data to peer, broken pipe! " 724 | + "(Writing to the file failed.)") 725 | except: 726 | raise gpClientException( "failed to send data to peer, broken pipe! " 727 | + "(A strange error occured.)") 728 | raise 729 | 730 | def send(self, s): 731 | """Sends the given data string to the peer 732 | 733 | by writing it to the output file handle created by the connect() 734 | method. Uses PipeTransport.send_to() to send the data. 735 | 736 | """ 737 | return PipeTransport.send_to(self.hout, s, self.out_chunk_size) 738 | 739 | def receive(self): 740 | """Receives a string of data from the peer 741 | by reading a line from the input file handle created by the 742 | connect() method. Uses readline to send the data. 743 | 744 | @todo: remove hardcoded limit of 1024 bytes per line! 745 | #? Here any problem? 746 | 747 | """ 748 | re = self.hin.readline() 749 | if not re: 750 | self._eof = True 751 | return re 752 | 753 | def setTimeout(self, seconds): 754 | """Sets a read timeout on input file handle 755 | 756 | which is created by the connect() method. 757 | 758 | """ 759 | self.hin.settimeout(seconds) 760 | 761 | def make_source(self): 762 | """Returns a new instance of PipeSource 763 | 764 | and this reads from the input file handle created by the 765 | connect method(). 
766 | 767 | """ 768 | return PipeSource( self.hin ) 769 | 770 | def make_sink(self): 771 | """Returns a new instance of PipeSink 772 | 773 | that writes to the output file handle created by the connect method(). 774 | 775 | """ 776 | return PipeSink(self.hout) 777 | 778 | def close(self): #TODO: port to PHP! 779 | self.trace(__function__(), "closing pipes") 780 | 781 | if self.hin: 782 | try: 783 | self.hin.close() 784 | except: 785 | pass 786 | 787 | if self.hout: 788 | self.hout.flush() 789 | 790 | try: 791 | self.hout.close() 792 | except: 793 | pass 794 | 795 | Transport.close(self) 796 | 797 | class ClientTransport(PipeTransport): 798 | """Communicate with a remote instance of GraphServ over TCP. 799 | 800 | An implementation of PipeTransport. 801 | @var host 802 | @var port 803 | @var socket = False 804 | """ 805 | 806 | def __init__(self, host='localhost', port=PORT): #OK 807 | """Initialize a new instance of ClientTransport. 808 | 809 | Responsable for a connection with GraphServ. 810 | 811 | @param string host (default: 'localhost') the host the GraphServ 812 | process is located at 813 | @param int port (default: PORT) the TCP port the GraphServ 814 | process is listening at 815 | 816 | """ 817 | self.port = port 818 | self.host = host 819 | #FIXME: PORT removal of self.graphname to php 820 | self.socket = False 821 | PipeTransport.__init__(self) 822 | 823 | def connect(self): 824 | """Connects to a remote instance of GraphServ 825 | 826 | using the host and port provided to the constructor. 827 | If the connection could be established, opens the graph 828 | specified to the constructor. 829 | B{#? In PHP werden hier noch $errno und $errstr uebergeben. Philipp.} 830 | 831 | @throws gpProtocolException if the connection failed or another 832 | communication error ocurred. 833 | 834 | """ 835 | self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 836 | try: 837 | self.socket.connect((self.host, self.port)) 838 | #XXX: configure timeout? 
839 | except socket.error as (value, message): 840 | raise gpProtocolException( 841 | "failed to connect to %s:%s: %s %s" % (self.host, self.port, value, message) ) 842 | 843 | self.hin = self.socket.makefile("r") 844 | self.hout = self.socket.makefile("w") 845 | 846 | return True 847 | 848 | def close(self): 849 | """Closes the transport. 850 | 851 | Disconnect the TCP socket to the remote GraphServ instance (using 852 | fclose). Subsequent calls to close() have no further effect. 853 | @throws gpProtocolException if the connection failed or another 854 | communication error ocurred. 855 | 856 | """ 857 | 858 | PipeTransport.close(self) 859 | 860 | if not self.socket: 861 | return False 862 | 863 | self.trace(__function__(), "closing socket") 864 | 865 | #manual sais: use shutdown() before close() to close socket in a "timely fashion". 866 | try: 867 | self.socket.shutdown(socket.SHUT_RDWR) 868 | except socket.error as e: 869 | self.trace(__function__(), "socket.shutdown() failed: %s" % e) 870 | 871 | try: 872 | self.socket.close() 873 | except socket.error as e: 874 | self.trace(__function__(), "socket.close() failed: %s" % e) 875 | 876 | self.closed = True 877 | 878 | 879 | class SlaveTransport(PipeTransport): 880 | """A transport implementation for communicating 881 | 882 | with a GraphCore instance running in a local child process (i.e. as 883 | a slave to the current PHP script). 884 | 885 | @var process 886 | @var command 887 | 888 | """ 889 | 890 | def __init__(self, command, cwd=None, env=None): 891 | """Initialize a new instance of SlaveTransport. 892 | 893 | Launch a slave instance of GraphCore. 894 | 895 | @param mixed command the command line to start GraphCore. 896 | May be given as a string or as an array. If given as a 897 | string, all parameters must be duely escaped #?. If given as 898 | an array, command[0] must be the path to the GraphCore 899 | executable. See Slavetransport.makeCommand() for more 900 | details. 
901 | @param string cwd (default: None) the working dir to run the 902 | slave process in. Defaults to the current working 903 | directory. #? Check! 904 | @param int $env (default: null) the environment variables to 905 | pass to the slave process. Defaults to inheriting the PHP 906 | script's environment. 907 | 908 | """ 909 | self.command = command 910 | self.cwd = cwd 911 | self.env = env 912 | self.process = None 913 | PipeTransport.__init__(self) 914 | 915 | @staticmethod 916 | def makeCommand(command): 917 | """Utility function for creating a valid command line. 918 | 919 | It is called before executing a program as a child process. 920 | 921 | @type command: str or list or tuple 922 | @param command: the command, including the executable and any 923 | parameters. If given as a string, all parameters must be 924 | duely escaped. #? If given as an array, command[0] must be 925 | the path to an executable. 926 | @rtype: str 927 | @return: A valid command line. The first part of 928 | the command is the executable, any following parts are 929 | passed as arguments to the executable. 930 | @raise: gpClientException if the command did not point to a readable, 931 | executable file. 932 | 933 | """ 934 | if not command: 935 | raise Exception('empty command given') 936 | 937 | path = None 938 | if isinstance(command, (list, tuple)): 939 | for i in command: 940 | if i == 0: 941 | cmd = command[i] 942 | # In the php-Version, escapeshellcmd is called here. 943 | # Python claims to handle arguments securely. 
944 | path = command[i] 945 | else: 946 | cmd = cmd + ' ' + str(command[i]) 947 | # Here the same with escapeshellarg 948 | else: 949 | m = re.search( 950 | '!^ *([-_a-zA-Z0-9.\\\\/]+)( [^"\'|<>]$|$)!', command) 951 | if m: 952 | path = m.group(1) 953 | cmd = command.strip() 954 | 955 | 956 | if path: 957 | if not os.path.exists(path): 958 | raise gpClientException('file does not exist: ' + path) 959 | if not os.access(path, os.R_OK): 960 | raise gpClientException('file is not readable: ' + path) 961 | if not os.access(path, os.X_OK): 962 | raise gpClientException('file is not executable: ' + path) 963 | 964 | return cmd 965 | 966 | def connect(self): 967 | """Connects to the slave instance of GraphCore 968 | 969 | launched using the command provided to the constructor. 970 | proc_open() is used to launch the child process. 971 | 972 | @throws gpClientException if the command executable could not be found. 973 | @throws gpProtocolException if the child process could not be launched. 974 | 975 | @todo handle output to stderr! 
976 | @todo get rid of the "wait 1/10 of a second and check" hack 977 | 978 | """ 979 | cmd = self.makeCommand(self.command) 980 | try: 981 | #pexpect.spawn(cmd,cwd=self.cwd,env=self.env) 982 | # pty.spawn(cmd, self.hin, self.hout) 983 | self.process = subprocess.Popen(cmd, cwd=self.cwd, env=self.env, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) 984 | 985 | self.hin = self.process.stdout 986 | self.hout = self.process.stdin 987 | 988 | except Exception as ex: 989 | self.trace(__function__(), "failed to execute %s" % str(self.command)) 990 | raise gpProtocolException("failed to execute %s" % str(self.command)) 991 | 992 | self.trace(__function__(), "executing command " 993 | + str(self.command) + " as %s" % str(self.process)) 994 | 995 | self.trace(__function__(), "reading from %s" % str(self.hin)) 996 | self.trace(__function__(), "writing to %s" % str(self.hout)) 997 | 998 | #time.sleep(0.1) 999 | # XXX: NASTY HACK! 1000 | # wait 1/10th of a second to see if the command actually starts 1001 | 1002 | self.checkPeer() 1003 | 1004 | return True 1005 | 1006 | @staticmethod 1007 | def send_to(a,b): 1008 | raise NotImplementdError( 1009 | "send_to should not be called for a SlaveTransport object.") 1010 | 1011 | def close(self): 1012 | """Close transport by terminating slave process using Popen.terminate().""" 1013 | 1014 | #PipeTransport.close(self) #XXX: call parent to close pipes?! 1015 | 1016 | if not self.process: 1017 | return False 1018 | 1019 | self.process.terminate() 1020 | self.process.wait() 1021 | 1022 | self.process = False 1023 | self.closed = True 1024 | 1025 | def checkPeer(self): 1026 | """Check if slave process is still alive using Popen.poll(). 1027 | 1028 | @throws gpProtocolException if the slave process is dead. 1029 | 1030 | """ 1031 | 1032 | code = self.process.poll() 1033 | if code is not None: 1034 | raise gpProtocolException('slave process is not running! 
exit code ' + code) 1035 | 1036 | 1037 | class Connection(object): 1038 | """This class represents an active connection to a graph. 1039 | 1040 | It can be seen 1041 | as the local interface to the graph that allows the graph to be 1042 | queried and manipulated, using the command set specified for GraphCore 1043 | and GraphServ. The communication with the peer process that manages 1044 | the actual graph (a slave GraphCore instance or a remote GraphCore 1045 | server) is performed by an instance of the appropriate subclass of 1046 | Transport. 1047 | 1048 | Instances of Connection that use the appropriate transport can be 1049 | created conveniently using the static factory methods called 1050 | new_xxx_connection. 1051 | 1052 | Besides some methods for managing the connection and some utility 1053 | functions, Connection exposes the GraphCore and GraphServ command 1054 | sets. The commands are exposed as "virtual" methods: they are not 1055 | implemented explicitely, instead, the __getattr__() method is used to map 1056 | method calls to GraphCore commands. No local checks are performed on 1057 | the command call, so it's up to the peer to decide which commands 1058 | exist. Note that this means that a Connection actually exposes more 1059 | commands if it is connected to a GraphServ instance (simply because 1060 | the peer then supports more commands). 1061 | 1062 | The mapping of method calls to commands is performed as follows: 1063 | 1064 | * underscores are converted to dashes: the method add_arcs 1065 | corresponds to the add-arcs command in GraphCore. 1066 | 1067 | * any int or string parameters passed to the method are passed on 1068 | to the command call, in the order they were specified. 1069 | 1070 | * parameters that are instances of DataSource will be used to pass 1071 | a data set to the command. That is, rows from the data source are 1072 | passed to the command as input. 
1073 | 1074 | * parameters that are instances of DataSink will be used to handle 1075 | any data the command outputs. That is, rows from the command's 1076 | output data set will be passed to the data sink, one by one. 1077 | 1078 | * parameters that are arrays are wrapped in a new instance of 1079 | ArraySource and used as input for the command, as described 1080 | above. This is convenient for passing data directly to the 1081 | command. 1082 | 1083 | * parameters given as None or False are ignored. 1084 | 1085 | * other types of arguments trigger a gpUsageException 1086 | 1087 | A command called in this way, using its "plain" method counterpart, 1088 | always returns the status string from the peer's response upon 1089 | successful execution. The status may be "OK" or "NONE". Any failure on 1090 | the server side triggers a gpProcessorException. Any output of the 1091 | command is passed to the DataSink that was provided as a parameter 1092 | (or ignored if no sink was provided). 1093 | 1094 | However, modifiers can be attached to the method name to cause the 1095 | command's outcome to be treated differently: 1096 | 1097 | * if the method name is prefixed with "try_", no 1098 | gpProcessorException are thrown. Instead, errors reported by the 1099 | peer cause the method to return false. The cause of the error may 1100 | be examined using the getStatus() and getStatusMessage() methods. 1101 | Note that other exceptions like gpProtocollException or 1102 | gpUsageException are still thrown as usual. 1103 | 1104 | * if the method name is prefixed with "capture_", the command's 1105 | output is collected and returned as an array of arrays, 1106 | representing the rows of data. If the command fails, a 1107 | gpProcessorException is raised, as usual (or, if try_ is also 1108 | specified, the method returns false). 
1109 | 1110 | * if the method name is suffixed with "_map" AND prefixed with 1111 | "capture_", the command's output is collected and returned as an 1112 | associative array. This is especially useful for commands like 1113 | "stats" that provide values for set of well known properties. 1114 | To build the associative array, rows from the output are 1115 | interpreted as a key-value pairs. If the _map suffix is used 1116 | without the capture_ prefix, a gpUsageException is raised. 1117 | 1118 | Modifiers can also be combined. For instance, try_capture_stats_map 1119 | would return GraphCore stats as an associative array, or null of the 1120 | call failed. 1121 | 1122 | Additional modifiers or extra virtual methods can be added by 1123 | subclasses by overriding the __getattr__() method or by registering 1124 | handlers with the addCallHandler() or addExecHandler() methods. 1125 | 1126 | """ 1127 | 1128 | 1129 | 1130 | def __init__( self, transport=None, graphname = None ): 1131 | """Initialize a new connection with the given instance of 1132 | Transport. 1133 | 1134 | Note: Instances of Connection that use the appropriate 1135 | transport can ba e created conveniently using the static 1136 | factory methods called new_xxx_connection. 1137 | 1138 | @rtype: None 1139 | 1140 | """ 1141 | self.transport = transport 1142 | """The transport used to communicate with the peer that manages the 1143 | actual graph.""" 1144 | #? Should the type of transport be checked? 1145 | #? EAFP: no. 
(easier to ask for forgiveness than permission) 1146 | self.tainted = False 1147 | """If true, the protocol session is "out of step" and no further 1148 | commands can be processed.""" 1149 | self.status = None 1150 | """The status string returned by the last command call.""" 1151 | self.statusMessage = None 1152 | """The status message returned by the last command call.""" 1153 | self.response = None 1154 | """The response from the last command call, including the status 1155 | string and status message.""" 1156 | self.call_handlers = [] 1157 | """call handlers, see addCallHandler()""" 1158 | self.exec_handlers = [] 1159 | """Exec handlers, see addExecHandler().""" 1160 | self.allowPipes = False 1161 | """If peer-side input and output redirection should be allowed. For 1162 | security reasons, and to avoid confusion, i/o redirection is disabled 1163 | per default.""" 1164 | self.strictArguments = True 1165 | """Whether arguments should be restricted to alphanumeric strings. 1166 | Enabled by default.""" 1167 | self.__command_has_output = None 1168 | """wheather the command performed by execute() has generated 1169 | output""" 1170 | self.debug = False 1171 | """Debug mode enables lots of output to stdout.""" 1172 | self.graphname = graphname #TODO: port this to PHP 1173 | 1174 | self._protocol_version = None 1175 | 1176 | def connect(self): 1177 | """ Connect to the peer. 1178 | 1179 | For connecting, this method relies solely on the transport 1180 | instance, which in turn uses the information passed to its 1181 | constructor to establish the connection. 1182 | 1183 | After connecting, this method calls checkProtocolVersion() 1184 | to make sure the peer speaks the correct protocol version. 1185 | If not, a gpProtocolException is raised. 
1186 | 1187 | @rtype: None 1188 | 1189 | """ 1190 | self.transport.connect() 1191 | self.checkProtocolVersion() 1192 | 1193 | if self.graphname: #TODO: port this to PHP 1194 | self.use_graph( self.graphname ) 1195 | 1196 | def addCallHandler( self, handler ): #OK 1197 | """Register a call handler. 1198 | 1199 | The handler will be called before __getattr__ interprets a 1200 | method call as a GraphCore command, and can be used to add 1201 | support for additional virtual methods or extra modifiers. 1202 | 1203 | The handler must be a callable with the following signature: 1204 | handler(connection, {'command': ..., 'args': ..., 'source': ..., 1205 | 'sink': ..., 'capture': ..., 'result': ...}) 1206 | i.e. it accepts the varible connection 1207 | (this Connection instance) and a dictionary with the 1208 | following keys: 1209 | 1210 | * 'command' a reference to the command name, as a string, with 1211 | the try_, capture_ and _map modifiers removed. 1212 | * 'arguments' a reference to the argument array, unprocessed, as 1213 | passed to the method. 1214 | * 'source' a reference to a DataSource (or None), may be 1215 | altered to change the command's input. 1216 | * 'sink' a reference to a DatSink (or null), may be altered to 1217 | change the output handling for the command. 1218 | * 'capture' a reference to the capture flag. If true, output 1219 | will be captured and returned as an array. 1220 | * 'result' the result to return from the method call, used 1221 | only if the handler returns false. 1222 | * If the handler returns false, the value of 'result' will be 1223 | returned from __getattr__ and no further action is taken. 1224 | 1225 | 1226 | @rtype: None 1227 | 1228 | """ 1229 | self.call_handlers.append( handler ) 1230 | 1231 | def addExecHandler(self, handler): #OK 1232 | """Register a call handler. 1233 | 1234 | # handler(connection, {'command': ..., 'source': ..., 1235 | # 'sink': ..., 'has_output': ..., 'status': ...}) 1236 | #? 
I don't understand the first argument. It's not used in gpMySQL.php. 1237 | 1238 | The handler will be called before __getattr__ passes a command 1239 | to the execute() method, and can thus be used to add support 1240 | for additional "artificial" commands that use the same parameter 1241 | handling as is used for "real" GraphCore commands. 1242 | 1243 | The handler must be a callable that accepts the varible connection 1244 | (this Connection instance) and a dictionary with the 1245 | following keys: 1246 | 1247 | * 'command' a refference to the command, as an array. 1248 | The first field is the command name, 1249 | the remaining fields contain the parameters for the command. 1250 | * 'source' a reference to a gpDatSource (or null), may be altered to 1251 | change the command's input. 1252 | * 'sink' a reference to a gpDatSink (or null), may be altered to 1253 | change the output handlking for the command. 1254 | * 'status' the commands return status, used of the handler 1255 | returns false. 1256 | 1257 | If the handler returns false, the value of $status will be used as the 1258 | command's result, and no command will be sent to the peer. The value 1259 | of $status is treated the same way the status returned from the peer is: 1260 | e.g. a gpProcessorException is thrown if the status is "FAILED", etc. 1261 | Also, modifiers like capture_ are applied to the output in the same way 1262 | as they are for "normal" commands. 1263 | 1264 | """ 1265 | self.exec_handlers.append(handler) 1266 | 1267 | 1268 | def getStatus(self): #? Not consistent with command call 1269 | """Return the status string that resulted from the last command call. 1270 | 1271 | The status string is 'OK' or 'NONE' for successfull calls, or 'FAILED', 1272 | 'ERROR' or 'DENIED' for unsuccessful calls. Refer to the GraphCore and 1273 | GraphServ documentation for details. 
1274 | 1275 | @rtype: str 1276 | @return: the status string 1277 | 1278 | """ 1279 | return self.status 1280 | 1281 | 1282 | def isClosed(self): # OK 1283 | """Tell if the connection is closed. 1284 | 1285 | Returns true if close() was called on this connection, or it was closed for 1286 | some other reason. No commands can be called on a closed connection. 1287 | 1288 | """ 1289 | return self.transport.isClosed() 1290 | 1291 | 1292 | def getStatusMessage(self): #? Check if consistent with commandcall 1293 | """Return the status message that resulted from the last command call. 1294 | 1295 | The status message is the informative message that follows the status 1296 | string in the response from a command call. It may be useful for human 1297 | eyes, but should not be processed programmatically. 1298 | 1299 | """ 1300 | return self.statusMessage 1301 | 1302 | 1303 | def getResponse(self): #? Ceck command call. 1304 | """Return the response that the last command call evoked. 1305 | 1306 | This consists of the status string and the status message. 1307 | 1308 | """ 1309 | return self.response 1310 | 1311 | 1312 | def _trace(self, context, msg, obj_type='nothing878423really'): #halbwegs OK 1313 | """Print messages to stdout when debug mode is enabled.""" 1314 | if self.debug: 1315 | if obj_type != 'nothing878423really' and obj_type != type(None): 1316 | #? and... appears not in the php-version 1317 | #? introduced due to lack of ...?...:... in Python. 1318 | #? Makes all other code shorter. Philipp. 1319 | msg = msg + ': ' + re.sub('\s+', ' ', str(obj_type)) 1320 | #? Check if the substitution is really necessary! 1321 | print "[gpClient] %s: %s" % (context, msg) 1322 | 1323 | def checkPeer(self): #? OK 1324 | """Attempt to check if the peer is still alive.""" 1325 | self.transport.checkPeer() 1326 | 1327 | 1328 | def setDebug(self, debug): #OK 1329 | """Enable or disable the debug mode. 1330 | 1331 | In debug mode, tons of diagnostic information are written to stdout. 
1332 | @param bool debug 1333 | 1334 | """ 1335 | self.debug = debug 1336 | self.transport.setDebug(debug) 1337 | 1338 | 1339 | def getProtocolVersion(self): 1340 | """Return the protocol version reported by the peer.""" 1341 | 1342 | if not self._protocol_version: 1343 | self.protocol_version() 1344 | self._protocol_version = self.statusMessage.strip() 1345 | 1346 | return self._protocol_version 1347 | 1348 | def supportsProtocolVersion(self, min_version, max_version = None): #TODO: port to PHP 1349 | """returns True if the peer's protocol version is at least 1350 | min_version and, if given, no grater than max_version. 1351 | """ 1352 | 1353 | version = self.getProtocolVersion() 1354 | version = float(version) 1355 | 1356 | if min_version and version < min_version: 1357 | return False 1358 | 1359 | if max_version and version > max_version: 1360 | return False 1361 | 1362 | return True 1363 | 1364 | 1365 | def checkProtocolVersion(self): 1366 | """Can raise a gpProtocolException. 1367 | 1368 | It raises a gpProtocolException if the protocol version reported by the 1369 | peer is not compatible with MIN_PROTOCOL_VERSION and MAX_PROTOCOL_VERSION. 1370 | 1371 | """ 1372 | 1373 | version = self.getProtocolVersion() 1374 | version = float(version) 1375 | 1376 | if version < MIN_PROTOCOL_VERSION: 1377 | raise gpProtocolException( 1378 | "Bad protocol version: expected at least " 1379 | + str(MIN_PROTOCOL_VERSION) 1380 | + ", but peer uses %s" % str(version) ) 1381 | 1382 | if version > MAX_PROTOCOL_VERSION: 1383 | raise gpProtocolException( 1384 | "Bad protocol version: expected at most " 1385 | + str(MAX_PROTOCOL_VERSION) 1386 | + ", but peer uses %s" % str(version) ) 1387 | 1388 | 1389 | def ping(self): #? ? 
1390 | """Attempt to check if the peer is still responding.""" 1391 | theVersion = self.protocol_version() 1392 | self._trace(__function__(), theVersion) 1393 | 1394 | return theVersion 1395 | 1396 | 1397 | def __getattr__(self, name): # fast OK 1398 | """Creates a closure that, when called, executes the command given 1399 | as the attribute name on the peer instanceof graphcore resp graphserv. 1400 | 1401 | Refer to the class level documentation of Connection for details 1402 | on how method calls are mapped to graphserv commands. 1403 | """ 1404 | 1405 | if re.search('_impl$', name): 1406 | raise AttributeError("no such impl: %s" % name) 1407 | 1408 | #TODO: do command name normalization outside the closure! 1409 | #TODO: allow named arguments! 1410 | 1411 | # A closure: 1412 | def exec_command(*arguments): 1413 | """Maps calls to undeclared methods on calls to graph commands. 1414 | 1415 | Refer to the class level documentation of Connection for details. 1416 | 1417 | @param arguments: the arguments passed to the method 1418 | 1419 | """ 1420 | cmd = re.sub('_', '-', name) 1421 | cmd = re.sub('^-*|-*$', '', cmd) 1422 | 1423 | source = None 1424 | sink = None 1425 | row_munger = None #TODO: PORT TO PHP 1426 | 1427 | if re.match('^try-', cmd): 1428 | cmd = cmd[4:] 1429 | try_it = True 1430 | else: 1431 | try_it = False 1432 | 1433 | 1434 | if re.match( '^capture-', cmd ): 1435 | cmd = cmd[8:] 1436 | sink = ArraySink() 1437 | capture = True 1438 | else: 1439 | capture = False 1440 | 1441 | 1442 | if re.search( '-map$', cmd ): 1443 | if not capture: 1444 | raise gpUsageException( 1445 | "using the _map suffix without the capture_ prefix" 1446 | + " is meaningless" ) 1447 | cmd = cmd[:-4] 1448 | map_it = True 1449 | else: 1450 | map_it = False 1451 | 1452 | if re.search( '-value$', cmd ): 1453 | if capture: 1454 | raise gpUsageException( "using the _value suffix together with the capture_ prefix is meaningless" ) 1455 | 1456 | cmd = cmd[:-6] 1457 | val = True 1458 | 
else: 1459 | val = False 1460 | 1461 | result = None 1462 | 1463 | if self.call_handlers: 1464 | handler_vars = {'command': cmd, 'arguments': arguments, 1465 | 'source': source, 'sink': sink, 1466 | 'capture': capture, 'result': result} 1467 | 1468 | for handler in self.call_handlers: 1469 | go_on = handler( self, handler_vars ) 1470 | if not go_on: 1471 | return handler_vars['result'] 1472 | 1473 | cmd = handler_vars['command'] 1474 | arguments = handler_vars['arguments'] 1475 | source = handler_vars['source'] 1476 | sink = handler_vars['sink'] 1477 | capture = handler_vars['capture'] 1478 | result = handler_vars['result'] 1479 | 1480 | command = [cmd] 1481 | 1482 | for arg in arguments: 1483 | if isinstance(arg, (tuple, list, set)): 1484 | source = ArraySource(arg) 1485 | elif type(arg) == types.GeneratorType: 1486 | source = ArraySource(arg) 1487 | elif isinstance(arg, (DataSource, DataSink)): 1488 | if isinstance(arg, DataSource): 1489 | source = arg 1490 | elif isinstance(arg, DataSink): 1491 | sink = arg 1492 | else: 1493 | raise gpUsageException( 1494 | "arguments must be primitive or a DataSource" 1495 | + " or DataSink. Found %s" % str(type(arg))) 1496 | elif not arg: 1497 | continue 1498 | elif callable(arg): 1499 | row_munger = arg 1500 | elif isinstance(arg, (str, unicode, int, long)): 1501 | command.append(arg) 1502 | else: 1503 | raise gpUsageException( 1504 | "arguments must be objects, strings or integers. " 1505 | + "Found %s" % type(arg)) 1506 | 1507 | if try_it: 1508 | catchThis = gpProcessorException 1509 | #XXX: catch more exceptions? ClientException? Protocolexception? 
1510 | else: 1511 | catchThis = None 1512 | 1513 | try: 1514 | do_execute = True 1515 | self.__command_has_output = None 1516 | 1517 | if self.exec_handlers: 1518 | handler_vars = {'command': command, 'source': source, 1519 | 'sink': sink, 'has_output': has_output, 1520 | 'status': status, 'row_munger': row_munger} 1521 | 1522 | for handler in self.exec_handlers: 1523 | go_on = handler( self, handler_vars ) 1524 | 1525 | if not go_on: 1526 | do_execute = False 1527 | break 1528 | 1529 | command = handler_vars['command'] 1530 | source = handler_vars['source'] 1531 | sink = handler_vars['sink'] 1532 | has_output = handler_vars['has_output'] 1533 | status = handler_vars['status'] 1534 | row_munger = handler_vars['row_munger'] 1535 | 1536 | if do_execute: 1537 | func = re.sub('-', '_', command[0] + '_impl') 1538 | if hasattr(self, func ): 1539 | args = command[1:] 1540 | args.append(source) 1541 | args.append(sink) 1542 | 1543 | f = getattr(self, func) 1544 | status = f( *args ) 1545 | else: 1546 | status = self.execute(command, source, sink, row_munger = row_munger) 1547 | 1548 | 1549 | except catchThis as e: 1550 | return False 1551 | 1552 | #note: call modifiers like capture change the return type! 1553 | if capture: 1554 | 1555 | if status == 'OK' or status == 'VALUE': 1556 | if self.__command_has_output: 1557 | if map_it: 1558 | return sink.getMap() 1559 | else: 1560 | return sink.getData() 1561 | else: 1562 | return True 1563 | 1564 | 1565 | elif status == 'NONE': 1566 | return None 1567 | else: 1568 | return False 1569 | else: 1570 | if result: 1571 | status = result # from handler 1572 | 1573 | if val: 1574 | if status == "VALUE" or status == "OK": 1575 | return self.statusMessage; #XXX: not so pretty 1576 | else: 1577 | raise gpUsageException( "Can't apply _value modifier: command " + command + " did not return a VALUE or OK status, but this: " + status ) 1578 | 1579 | return status 1580 | 1581 | setattr(self, name, exec_command) #re-use closure! 
# --- tail of the command-dispatch closure built by the enclosing factory
# method (its head lies above this chunk): cache the closure on the
# instance so later look-ups of the same virtual command re-use it.
setattr(self, name, exec_command) #re-use closure!

# Return the closure.
return exec_command


def execute(self, command, source=None, sink=None, row_munger=None):
    """ Applies a command to the graph, i.e. runs the command on the peer.

    Note: this method implements the protocol used to interact with the peers,
    based upon the line-by-line communication provided by the transport
    instance. Interaction with the peer is stateless between calls to this
    function (except of course for the contents of the graph itself).

    If the command generates output, the instance variable
    __command_has_output will be set True, otherwise False.

    @type command: mixed
    @param command: the command, as a single string or as an array
           containing the command name and any arguments.
    @type source: DataSource
    @param source: the data source to take the commands input from
           (default: None)
    @type sink: DataSink
    @param sink: the data sink to pass the commands output to
           (default: None)
    @param row_munger: a callback function to be invoked for every row
           copied (optional). The return value from the munger
           replaces the original row. If the munger function returns
           None or False, the row is skipped.
    @rtype: string
    @return: the status string returned by the command
    @raise: gpProtocolException if a communication error occurred while
            talking to the peer
    @raise: gpProcessorException if the peer reported an error
    @raise: gpUsageException if command does not conform to the rules
            for commands. Note that self.strictArguments and
            self.allowPipes influence which commands are allowed.

    """
    self._trace(__function__(), "BEGIN")

    # A previous protocol error leaves the stream in an undefined state;
    # refuse to re-use a poisoned connection.
    if self.tainted:
        raise gpProtocolException(
            "connection tainted by previous error!")

    if self.isClosed():
        raise gpProtocolException("connection already closed!")

    if self.transport.eof(): # closed by peer
        self._trace(__function__(),
            "connection closed by peer, closing our side too.")
        self.close()
        self.tainted = True
        raise gpProtocolException("connection closed by peer!")

    # A list/tuple/set command is validated element-wise, then flattened
    # into a single wire-format command line.
    if isinstance(command, (list, tuple, set)):
        if not command:
            raise gpUsageException("empty command!")

        c = command[0]
        if not isinstance(c, (str, unicode)): #? Less restrictive in php.
            raise gpUsageException(
                "invalid command type: %s" % type(c).__name__)

        if not self.isValidCommandName(c):
            raise gpUsageException("invalid command name: %s" % c)

        strictArgs = self.strictArguments

        if c == "set-meta" or c == "authorize": #XXX: ugly hack for wellknown commands
            strictArgs = False

        for c in command:
            if not isinstance(c, (str, unicode, int, long)):
                raise gpUsageException(
                    "invalid argument type: %s" % type(c).__name__)

            # NOTE(review): re.match is applied to c even when c is an
            # int/long, which would raise TypeError -- presumably integer
            # arguments never co-occur with allowPipes; confirm.
            if self.allowPipes and re.match('^[<>]$', c):
                strictArgs = False
                # pipe, allow lenient args after that

            if self.allowPipes and re.match('^[|&!:<>]+$', c):
                continue
                #operator

            if not self.isValidCommandArgument(c, strictArgs):
                raise gpUsageException("invalid argument: %s" % c)

        command = ' '.join("%s" % el for el in command)

    if not command:
        raise gpUsageException("command is empty!")

    command = command.strip()

    if command == "":
        raise gpUsageException("command is empty!")

    self._trace(__function__(), "command", command )

    if not self.isValidCommandString(command):
        raise gpUsageException("invalid command: %s" % command)

    if (not self.allowPipes) and re.search('[<>]', command):
        raise gpUsageException(
            "command denied, pipes are disallowed by allowPipes = false; "
            + "command: %s" % command);

    # A trailing ':' on the command line announces an inline data set:
    # add it when we have a local source, and supply an empty source when
    # the command already demands one.
    if source and (not re.search(':$', command)):
        command = command + ':'

    if (not source) and re.search(':$', command):
        source = NullSource.instance

    if source and re.search('<', command):
        raise gpUsageException(
            "can't use data input file and a local data source "
            + "at the same time! %s" % command)

    if sink and re.search('>', command):
        raise gpUsageException(
            "can't use data output file and a local data sink "
            + "at the same time! %s" % command)

    self._trace(__function__(), ">>> ", command)
    self.transport.send( command + LINEBREAK )
    self._trace(__function__(), "source", type(source))

    if ( source ):
        self._copyFromSource( source, row_munger = row_munger )

    rec = self.transport.receive()
    self._trace(__function__(), "<<< ", rec)


    if not rec:
        # No status line at all: the peer is gone or broken; poison the
        # connection so it cannot be re-used.
        self.tainted = True;
        self.status = None;
        self.statusMessage = None;
        self.response = None;

        self._trace(__function__(),
            "peer did not respond! Got value %s" % rec)
        self.transport.checkPeer()

        raise gpProtocolException(
            "peer did not respond! Got value %s" % str(rec))

    rec = rec.strip()
    self.response = rec

    # Status line shape: "<STATUS>[.:!]<message>[:]" -- a trailing ':'
    # means a data set follows on the stream.
    match = re.match('^([a-zA-Z]+)[.:!](.*?):?$', rec)
    if not match or not match.group(1):
        self.tainted = True
        self.close()
        raise gpProtocolException(
            "response should begin with status string like `OK`. Found: `"
            + rec + "'")

    self.status = match.group(1)
    self.statusMessage = match.group(2).strip()

    if self.status != 'OK' and self.status != 'NONE' and self.status != 'VALUE':
        raise gpProcessorException(
            self.status, self.statusMessage, command)

    self._trace(__function__(), "sink", type(sink))

    if re.search(': *$', rec ):
        if not sink:
            sink = NullSink.instance

        # note: we need to slurp the result in any case!
        self._copyToSink(sink, row_munger = row_munger)

        self.__command_has_output = True
    else:
        self.__command_has_output = False


    if self.transport.eof(): # closed by peer
        self._trace(__function__(),
            "connection closed by peer, closing our side too.")
        self.close()

    return self.status




def traverse_successors_without_impl(
    self, id, depth, without, without_depth, source, sink, row_munger = None):
    """Implements a 'fake' command traverse-successors-without

    which returns all descendants of one node minus the descendants
    of some other node. This is a convenience function for a common
    case that could otherwise only be covered by implementing the
    set operation locally, or by using execute().

    This method should not be called directly.
    Instead, use the virtual method traverse_successors_without
    in the same way as normal commands are called.
    This includes support for modifiers and flexible
    handling of method parameters.

    """
    if not without_depth:
        without_depth = depth
    # "A &&! B" on the wire: set difference of the two traversals.
    return self.execute(
        ( "traverse-successors %s %s " +
        " &&! traverse-successors %s %s " ) % (id, depth, without, without_depth),
        source, sink, row_munger = row_munger)
traverse-successors %s %s " ) % (id, depth, without, without_depth), 1794 | source, sink, row_munger = row_munger) 1795 | 1796 | 1797 | @staticmethod 1798 | def isValidCommandName(name): #static #OK 1799 | """Check if the given name is a valid command name. 1800 | 1801 | Command names consist of a letter followed by any number of letters, 1802 | numbers, or dashes. 1803 | 1804 | """ 1805 | 1806 | if type(name) != str: 1807 | return False 1808 | 1809 | return re.match('^[a-zA-Z_][-\w]*$', name) 1810 | 1811 | 1812 | @staticmethod 1813 | def isValidCommandString(command): #static # fast OK 1814 | """Check if the given string passes some sanity checks. 1815 | 1816 | The command string must start with a valid command, and it must not 1817 | contain any non-printable or non-ascii characters. 1818 | 1819 | """ 1820 | 1821 | if type(command) != str: 1822 | return False 1823 | 1824 | #~ if not re.match('^[a-zA-Z_][-\w]*\s*(:?\s*$|[\s!&]+\w|[|<>#])', command): 1825 | if not re.match('^[a-zA-Z_][-\w]*\s*', command): 1826 | return False # must start with a valid command 1827 | 1828 | if re.search('[\0-\x1F\x80-\xFF]', command): 1829 | return False # bad characters 1830 | 1831 | return True 1832 | 1833 | 1834 | @staticmethod 1835 | def isValidCommandArgument(arg, strict=True): #static #OK 1836 | """ Check if the given string is a valid argument. 1837 | 1838 | If strict is set, it checks if arg consists of an alphanumeric 1839 | character followed by any number of alphanumerics, colons or dashes. 1840 | If strict is not set, this just checks that arg doesn't contain 1841 | any non-printable or non-ascii characters. 1842 | 1843 | @param string arg the argument to check 1844 | @param bool strict whether to perform a strict check (default: True). 
1845 | 1846 | """ 1847 | 1848 | if not arg: 1849 | return False 1850 | 1851 | if not type(arg) in (str, unicode, int, long): 1852 | return False 1853 | 1854 | if strict: 1855 | return re.match('^\w[-\w]*$', str(arg)) 1856 | else: 1857 | return not re.search('[\s\0-\x1F\x80-\xFF|<>!&#]', str(arg)) 1858 | # low chars, high chars, and operators. 1859 | 1860 | @staticmethod 1861 | def splitRow(s): 1862 | """Convert a line from a data set into a tuple. 1863 | 1864 | If s is empty, this method returns False. if s starts with "#", it's 1865 | considered to consist of a single string field. Otherwise, the 1866 | string is split on ocurrances of TAB, semikolon or comma. Numeric 1867 | field values are converted to int, other fields remain strings. 1868 | 1869 | @param string s the row from the data set, as a string 1870 | @return array containing the fields in s, or false if s is empty. 1871 | 1872 | """ 1873 | if not s: 1874 | return False 1875 | if s[0] == '#': 1876 | row = (s[1],) #full line string text 1877 | else: 1878 | row = re.split(' *[;,\t] *', s) 1879 | 1880 | for i, entry in enumerate(row): 1881 | if re.match('^\d{1,9}$', entry): #TODO: port to python: no more than 9 chars for int conversion! 1882 | row[i] = int(entry) 1883 | 1884 | row = tuple(row) 1885 | 1886 | return row 1887 | 1888 | @staticmethod 1889 | def joinRow(row): # fertig. 1890 | """Create a string representing the data set `row'. 1891 | 1892 | joinRow tries to convert `row' to a reasonable string 1893 | representation. Numbers can be passed either as int or as str 1894 | types. If a string is passed or the list/tuple has only one 1895 | string which represents no number, this string will be marked 1896 | with a leading `#' and then be returned. 1897 | 1898 | If `row' is a tulple/list containing str or int types, those 1899 | will be returned as comma seperated values. 
1900 | 1901 | @type row: str, or list/tuple of int/str types 1902 | @param row: The data row 1903 | @rtype: str 1904 | @return: string containing the fields from row 1905 | 1906 | """ 1907 | #if not row: 1908 | # return '' #? This case is covered by join(...). 1909 | if isinstance(row, str): 1910 | return '#' + row 1911 | if len(row) == 1 and isinstance(row[0], str) and \ 1912 | not re.match('^\d+$', row[0]): 1913 | return '#' + row[0] 1914 | try: 1915 | s = ','.join("%s" % el for el in row) 1916 | except: 1917 | #print row 1918 | raise 1919 | return s 1920 | 1921 | def _copyFromSource(self, source, row_munger = None): 1922 | """Pass data from source to client line by line. 1923 | 1924 | Copies all data from the given source into the current command stream, 1925 | that is, passes them to the client line by line. 1926 | 1927 | Note that this must only be called after passing a command line 1928 | terminated by ":" to the peer, so the peer expects a data set. 1929 | 1930 | This is implemented by calling the transport's make_sink() method 1931 | to create a sink for writing to the command stream, and then using 1932 | the copy() method to transfer the data. 1933 | 1934 | Note that source is not automatically closed by this method. 1935 | 1936 | """ 1937 | sink = self.transport.make_sink() 1938 | self._trace(__function__(), "source", type(source)) 1939 | self.copy(source, sink, ' > ', row_munger = row_munger) 1940 | # source.close() # to close or not to close... 1941 | self.transport.send( LINEBREAK ) #XXX: flush again?? 1942 | self._trace(__function__(), "copy complete.") 1943 | 1944 | # # 1945 | # while ( $row = $source->nextRow() ) 1946 | # $s = Connection::joinRow( $row ); 1947 | # 1948 | # fputs(self.hout, $s . LINEBREAK); 1949 | # 1950 | # 1951 | # fputs(self.hout, LINEBREAK); // blank line 1952 | 1953 | 1954 | 1955 | def _copyToSink(self, sink=None, row_munger = None): 1956 | """Pass data from peer to sink line by line. 
def _copyToSink(self, sink=None, row_munger = None):
    """Pass data from the peer to `sink` line by line.

    Copies all rows of the command response into the given sink. Must
    only be called after the peer sent a response line ending with ":",
    so we know a data set is waiting on the stream.

    Uses the transport's make_source() to obtain a source reading from
    the command stream, then copy() to transfer the rows. The sink is
    flushed but not closed before this method returns.

    """
    wire_source = self.transport.make_source()
    self._trace(__function__(), "sink", type(sink))
    self.copy(wire_source, sink, ' < ', row_munger = row_munger )
    self._trace(__function__(), "copy complete.")
def copy(self, source, sink=None, indicator = '<>', row_munger = None):
    """Transfer all rows from a data source to a data sink.

    Utility method. If sink is None, all rows are read from the source
    and discarded. Before returning, the sink is flushed to commit any
    pending data.

    @type source: DataSource
    @param source: source of the data rows
    @type sink: DataSink
    @param sink: sink the rows are transferred to.
    @type indicator: str
    @param indicator: the message prefix to show in debug-mode
    @param row_munger: a callback invoked for every row copied; its
           return value replaces the original row. A None/False return
           drops the row.

    """
    for record in source:
        if row_munger:
            record = row_munger(record)

        if not record:
            continue  # munger vetoed the row, or the source yielded an empty one

        if sink:
            self._trace(__name__, indicator, record)
            sink.putRow(record)
        else:
            self._trace(Connection.copy, "#", record)

    if sink:
        sink.flush()

def close(self): #OK
    """Close this connection by closing the underlying transport."""
    self.transport.close()

@staticmethod
def new_client_connection(graphname, host=False, port=False): # static
    """Return a new ClientTransport connection.

    Create a new connection for accessing a remote graph managed by a
    GraphServ service. Returns a Connection that uses a ClientTransport
    to talk to the remote graph.

    @param graphname: the name of the graph to connect to
    @param host: (default: 'localhost') the host the GraphServ
           process is located on.
    @param port: (default: PORT) the TCP port the GraphServ
           process is listening on.

    """
    return Connection( ClientTransport(host, port), graphname )
@staticmethod
def new_slave_connection(command, cwd=None, env=None): #static
    """Return a new SlaveTransport connection.

    Create a new connection for accessing a graph managed by a slave
    GraphCore process. Returns a Connection that uses a SlaveTransport
    to talk to the local graph.

    @param command: the command line to start GraphCore, given as a
           string (duly escaped) or as an array whose first element is
           the path to the GraphCore executable. See
           SlaveTransport.makeCommand() for details.
    @param cwd: (default: None) the working dir to run the slave
           process in. Defaults to the current working directory.
    @param env: (default: None) the environment variables to pass to
           the slave process. Defaults to inheriting this process's
           environment.

    """
    return Connection(SlaveTransport(command, cwd, env))

def array_column(a, col):
    """Extract a single column from a tabular structure.

    @type a: list/tuple
    @param a: an array of equal-sized rows (sequences or mappings).
    @type col: usually int or str
    @param col: the key of the column to extract

    @rtype: list
    @return: the value of column `col` from each row in `a`

    """
    return [ row[col] for row in a ]
def pairs2map( pairs, key_col=0, value_col=1):
    """Convert a list of key-value pairs to a dictionary.

    Later pairs with a duplicate key overwrite earlier ones.

    @type pairs: array
    @param pairs: an array of key-value pairs, representing a map
           as a list of tuples.
    @type key_col: mixed
    @param key_col: the column that contains the key (default: 0)
    @type value_col: mixed
    @param value_col: the column that contains the value (default: 1)

    @rtype: dictionary
    @return: the key-value pairs from `pairs`.

    """

    m = {}
    for p in pairs:
        m[ p[key_col] ] = p[value_col]

    return m

def escapeshellcmd(command):
    """Backslash-escape shell metacharacters in `command` and wrap it
    in double quotes (modelled on PHP's escapeshellcmd).

    FIX: ';' was previously replaced with '\.' -- an obvious typo that
    silently turned the semicolon into a literal backslash-dot and
    corrupted any command containing one; it is now escaped as '\;'.

    NOTE(review): backslashes in the input are not themselves escaped,
    matching the original behavior -- confirm before hardening further.
    """
    return '"%s"' % (
        command
        .replace('#', '\#')
        .replace('&', '\&')
        .replace(';', '\;')
        .replace('`', '\`')
        .replace('|', '\|')
        .replace('*', '\*')
        .replace('~', '\~')
        .replace('<', '\<')
        .replace('>', '\>')
        .replace('^', '\^')
        .replace('(', '\(')
        .replace(')', '\)')
        .replace('[', '\[')
        .replace(']', '\]')
        .replace('{', '\{')
        .replace('}', '\}')
        .replace('$', '\$')
        .replace(',', '\,')
        .replace('\'', '\\\'')
        .replace('\"', '\\"')
    )

def escapeshellarg(arg):
    """Quote `arg` for safe use as a single POSIX shell argument:
    wrap it in single quotes and rewrite each embedded single quote
    as the '"'"'-style sequence '\\'' (close quote, escaped quote,
    reopen quote), as PHP's escapeshellarg does.
    """
    return "'" + arg.replace("'", "'\\''") + "'"