├── gp
│   ├── tests
│   │   ├── __init__.py
│   │   ├── gp.test.data
│   │   ├── test_config.py
│   │   ├── client_test.py
│   │   ├── crud_fuzzer.py
│   │   ├── arc_fuzzer.py
│   │   ├── tcp_fuzzer.py
│   │   ├── fuzzer_base.py
│   │   ├── core_test.py
│   │   ├── test_base.py
│   │   ├── slave_test.py
│   │   ├── mysql_test.py
│   │   ├── server_test.py
│   │   └── mediawiki_test.py
│   ├── __init__.py
│   ├── mediawiki.py
│   └── mysql.py
├── README.md
└── gpfeeder.py
--------------------------------------------------------------------------------
/gp/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/gp/__init__.py:
--------------------------------------------------------------------------------
__all__ = [ 'client', 'mysql', 'mediawiki' ]
--------------------------------------------------------------------------------
/gp/tests/gp.test.data:
--------------------------------------------------------------------------------
1,11
1, 12
11, 111
11 112
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Catgraph Service
Service for catgraph

## Issue tracker
Please file bugs and feature requests on [Phabricator](https://phabricator.wikimedia.org/maniphest/task/create/?projects=tcb-team,catgraph&title=%5BCatGraph%5D).
--------------------------------------------------------------------------------
/gp/tests/test_config.py:
--------------------------------------------------------------------------------
from gp.client import *

# graphcore binary for slave tests
test_graphcore_path = '/home/daniel/src/graphserv/graphcore/graphcore'

# graphserv coordinates for client tests
test_graphserv_host = 'localhost'
test_graphserv_port = PORT

# logins for test accounts
test_admin = 'fred'
test_admin_password = 'test'

test_master = 'jules'
test_master_password = 'test'

# mysql login for testing MySQLGlue
test_mysql_host = 'localhost'
test_mysql_user = 'gptest'
test_mysql_password = 'gptest'
test_mysql_database = 'gptest'

# mediawiki database info
test_mediawiki_table_prefix = 'mw_'
--------------------------------------------------------------------------------
/gp/tests/client_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8

import unittest
import os
import tempfile
from test_base import *
from gp.client import *

test_graph_name = 'test' + str(os.getpid())
TestFilePrefix = '/tmp/gptest-' + str(os.getpid())

class ClientTest (ClientTestBase, unittest.TestCase):
    """Test the TCP client connection.

    Client Connection Tests
    currently none. Could test handling of TCP issues, etc.

    @TODO: (optionally) start a server instance here!
    let it die when the test script dies.

    @TODO: CLI interface behaviour of server (port config, etc)

    """


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/gp/tests/crud_fuzzer.py:
--------------------------------------------------------------------------------
from fuzzer_base import *
from gp.client import *
from test_config import *
import sys

fuzz_commands = ( "list-roots", "list-successors", "add-arcs", "stats",
                  "22", "-11111", "xyz", "0", "!", "&", "%", "#", "",)

fuzz_args = ( "1", "2", "11", "22", "-11111", "xyz", "0", "!",
              "&", "%", "#", "",)


class CrudFuzzer (FuzzerBase):
    """Fuzzes basic graph commands with random command/argument combinations"""

    def prepare(self):
        self.gp.add_arcs(((1, 2), (1, 11), (2, 22),))


    def doFuzz(self):
        global fuzz_commands
        global fuzz_args

        cmd = ""
        cmd = cmd + fuzz_pick(fuzz_commands)
        cmd = cmd + " "
        cmd = cmd + fuzz_pick(fuzz_args)
        cmd = cmd + " "
        cmd = cmd + fuzz_pick(fuzz_args)

        try:
            self.gp.execute(cmd)
            return True
        except gpProcessorException, ex:
            pass # noop
        except gpUsageException, ex:
            pass # noop

        return False

if __name__ == '__main__':

    fuzzer = CrudFuzzer()

    fuzzer.run(sys.argv)
--------------------------------------------------------------------------------
/gp/tests/arc_fuzzer.py:
--------------------------------------------------------------------------------
from fuzzer_base import FuzzerBase, test_graph_name
from gp.client import Connection
from test_config import *
import random
import sys

class ArcFuzzer(FuzzerBase):
    """Fuzzes arc manipulation commands with randomly generated arcs"""

    def __init__(self):
        self.offset = 1
        FuzzerBase.__init__(self)

    def prepare(self):
        # in case we use a persistent graph, find an unused offset
        Range = range(self.offset, 10)
        for i in Range:
            if not self.gp.capture_list_successors(i):
                self.gp.add_arcs(((i, i+1),))
                print "fuzz offset: %d (%s)" % (i, test_graph_name)
                return

            self.offset = i + 1
            #? understand self.offset!
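            # Reading of the loop above (the author's "#?" note is left in
            # place): every occupied offset i records the next candidate, so
            # when a free i is found, self.offset == i. random_node() below
            # then only generates node ids congruent to that offset mod 10,
            # keeping runs with different offsets disjoint.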

        exit("no free offset left (or "
             + test_graph_name + " needs purging)")


    def random_node(self):
        return random.randint(10, 1000) * 10 + self.offset


    def random_arcs(self, n=0):
        if not n:
            n = random.randint(2, 80)
        arcs = []
        for i in range(0, n):
            a = self.random_node()
            b = self.random_node()
            arcs.append((a, b))
        return arcs

    def random_set(self, n=0):
        if not n:
            n = random.randint(2, 80)
        arcs = []
        for i in range(0, n):
            x = self.random_node()
            arcs.append(x)
        return arcs

    def doFuzz(self):
        self.gp.add_arcs(self.random_arcs())
        self.gp.remove_arcs(self.random_arcs())

        self.gp.replace_successors(self.random_node(), self.random_set())
        self.gp.replace_predecessors(self.random_node(), self.random_set())

        return True


if __name__ == '__main__':

    fuzzer = ArcFuzzer()

    fuzzer.run(sys.argv)
--------------------------------------------------------------------------------
/gp/tests/tcp_fuzzer.py:
--------------------------------------------------------------------------------
from crud_fuzzer import *
from gp.client import *
from test_config import *
import sys
import threading
import random
import time

class KillerThread ( threading.Thread ):
    """randomly kills the client socket"""

    def __init__(self, fuzzer):
        super(KillerThread, self).__init__()

        self.stopped = False
        self.fuzzer = fuzzer
        self.delay = (0.01, 0.2)

    def kill_connection(self):
        if ( self.fuzzer.gp
             and self.fuzzer.gp.transport.socket ):

            try:
                self.fuzzer.kill_lock.acquire()

                self.fuzzer.gp.transport.hin.close()
                self.fuzzer.gp.transport.hout.close()
                self.fuzzer.gp.transport.socket.close()

                self.fuzzer.killed = True
                self.fuzzer.blip("!")
            finally:
                self.fuzzer.kill_lock.release()

    def run ( self ):
        while not self.stopped:
            d = random.random() * ( self.delay[1] - self.delay[0] ) + self.delay[0]
            time.sleep( d )

            if not self.stopped:
                self.kill_connection()

    def stop(self):
        self.stopped = True


class TcpFuzzer (CrudFuzzer):
    """Test server stability with an unstable TCP client connection"""

    def __init__(self):
        super(TcpFuzzer, self).__init__()

        self.kill_lock = threading.Lock()

        self.killer = KillerThread( self )
        self.killer.daemon = True

        self.killed = False

    def run(self, argv):
        try:
            super(TcpFuzzer, self).run(argv)
        finally:
            self.killer.stop()

    def connect(self):
        super(TcpFuzzer, self).connect()

        # force 1 byte chunks, to increase the probability of
        # incomplete writes.
        self.gp.transport.chunk_size = 1

    def doFuzz(self):
        try:
            self.kill_lock.acquire()

            if not self.killer.is_alive():
                self.killer.start()

            if self.killed:
                # the killer closed our socket: reconnect and start over
                self.connect()
                self.prepare()
                self.killed = False

            self.blip("*")
        finally:
            self.kill_lock.release()

        try:
            return super(TcpFuzzer, self).doFuzz()
        except:
            pass # any failure here just counts as an unsuccessful round

        return False

if __name__ == '__main__':
    fuzzer = TcpFuzzer()

    fuzzer.run(sys.argv)
--------------------------------------------------------------------------------
/gp/tests/fuzzer_base.py:
--------------------------------------------------------------------------------
from gp.client import Connection
from gp.client import gpException
from test_config import *
import test_config
import os, sys
import random
import time

test_graph_name = 'test' + str(os.getpid())

def fuzz_pick( a ):
    i = random.randint(0, len(a)-1)
    return a[i]


class FuzzerBase (object): # abstract
    """Abstract base class for fuzz tests run against a graphserv instance"""

    def __init__(self):
        self.graph = None
        self.useTempGraph = True

    def blip( self, s ):
        sys.stdout.write( s )
        sys.stdout.flush()

    def newConnection(self):
        gp = Connection.new_client_connection(None,
            test_graphserv_host, test_graphserv_port )
        gp.connect()
        return gp

    def connect(self):
        if not self.graph:
            self.graph = test_graph_name

        try:
            self.gp = self.newConnection()
        except gpException as ex:
            print("Unable to connect to "
                  + test_graphserv_host + ":" + str(test_graphserv_port)
                  + ", please make sure the graphserv process is running "
                  + "and check the test_graphserv_host and "
                  + "test_graphserv_port configuration options in "
                  + "test_config.py.")
            print("Original error: " + str(ex))
            quit(11)

        try:
            self.gp.authorize( 'password',
                test_admin + ":" + test_admin_password)
        except gpException as ex:
            print("Unable to authorize as "
                  + test_admin + ", please check the test_admin and "
                  + "test_admin_password configuration options in "
                  + "test_config.py.")
            print("Original error: " + str(ex))
            quit(12)

        if self.useTempGraph:
            self.gp.try_create_graph( self.graph )

        try:
            self.gp.use_graph( self.graph )
        except gpException as ex:
            print("Unable to use graph %s, please check the " % self.graph
                  + "test_graph_name configuration option in test_config.py "
                  + "as well as the privileges of user " + test_admin + ".")
            print("Original error: " + str(ex))
            quit(13)

    def disconnect(self):
        global test_admin, test_admin_password

        if self.useTempGraph and self.graph:
            self.gp.try_drop_graph(self.graph) #? is self.gp still usable here?

    def prepare(self):
        pass # noop

    def doFuzz(self): # abstract
        raise NotImplementedError(
            "FuzzerBase.doFuzz() not implemented.")

    def run(self, argv):
        self.connect()
        self.prepare()

        n = None
        if len(argv) > 1:
            n = int(argv[1])
        if not n:
            n = 100

        for k in range(n):
            for i in range(100):
                ok = self.doFuzz()
                if ok:
                    self.blip("+")
                else:
                    self.blip("-")

            self.blip("\n")
            # time.sleep(1) #? this needs to go back in!

        self.disconnect()
--------------------------------------------------------------------------------
/gp/tests/core_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8

import unittest
import os
import tempfile
from test_base import *
from test_config import *
from gp.client import *

class CoreTest (SlaveTestBase, unittest.TestCase):
    """
    Tests core functions via the client lib
    """

    #### Core Functions ###############################

    def test_addArcs(self):
        self.gp.add_arcs( (
            ( 1, 11 ),
            ( 1, 12 ),
            ( 11, 111 ),
            ( 11, 112 ),
        ) )

        self.assertStatsValue( 'ArcCount', 4 )

        arcs = self.gp.capture_list_successors( 1 )

        self.assertTrue( TestBase.setEquals( arcs, (
            ( 11, ),
            ( 12, ),
        ) ), "successors of (1)" )

        arcs = self.gp.capture_list_successors( 11 )
        self.assertTrue( TestBase.setEquals( arcs, (
            ( 111, ),
            ( 112, ),
        ) ), "successors of (11)" )

        # ------------------------------------------------------

        self.gp.add_arcs( (
            ( 1, 11 ),
            ( 11, 112 ),
            ( 2, 21 ),
        ) )

        self.assertStatsValue( 'ArcCount', 5 )

        arcs = self.gp.capture_list_successors( 2 )
        self.assertTrue( TestBase.setEquals( arcs, (
            ( 21, ),
        ) ), "successors of (2)" )


    def test_clear(self):
        self.gp.add_arcs( (
            ( 1, 11 ),
            ( 1, 12 ),
            ( 11, 111 ),
            ( 11, 112 ),
        ) )

        self.assertStatsValue( 'ArcCount', 4 )

        self.gp.clear()

        arcs = self.gp.capture_list_successors( 1 )

        self.assertEmpty( arcs )
        self.assertStatsValue( 'ArcCount', 0 )

        #--------------------------------------------
        self.gp.add_arcs( (
            ( 1, 11 ),
            ( 1, 12 ),
            ( 11, 111 ),
            ( 11, 112 ),
        ) )

        self.assertStatsValue( 'ArcCount', 4 )


    def test_traverseSuccessors(self):
        self.gp.add_arcs( (
            ( 1, 11 ),
            ( 1, 12 ),
            ( 11, 111 ),
            ( 11, 112 ),
            ( 111, 1111 ),
            ( 111, 1112 ),
            ( 112, 1121 ),
        ) )

        self.assertStatsValue( 'ArcCount', 7 )

        #--------------------------------------------
        succ = self.gp.capture_traverse_successors( 11, 5 )

        self.assertEquals( [ (11,), (111,), (112,), (1111,), (1112,), (1121,), ], succ )


    def test_traverseSuccessorsWithout(self):
        self.gp.add_arcs( [
            ( 1, 11 ),
            ( 1, 12 ),
            ( 11, 111 ),
            ( 11, 112 ),
            ( 111, 1111 ),
            ( 111, 1112 ),
            ( 112, 1121 ),
        ] )

        self.assertStatsValue( 'ArcCount', 7 )

        #--------------------------------------------
        succ = self.gp.capture_traverse_successors_without( 11, 5, 111, 5 )

        self.assertEquals( [ (11,), (112,), (1121,), ], succ )

    def test_setMeta(self):
        # define var
        self.gp.set_meta("foo", 1234)
        val = self.gp.get_meta_value("foo")
        self.assertEquals( "1234", val )

        # redefine var
        self.gp.set_meta("foo", "bla/bla")
        val = self.gp.get_meta_value("foo")
        self.assertEquals( "bla/bla", val )

        # test bad -----------------------------------------
        try:
            self.gp.set_meta("...", 1234)
            self.fail( "exception expected" )
        except gpException as ex:
            pass

        try:
            self.gp.set_meta("x y", 1234)
            self.fail( "exception expected" )
        except gpException as ex:
            pass

        try:
            self.gp._set_meta(" ", 1234)
            self.fail( "exception expected" )
        except gpException as ex:
            pass

        try:
            self.gp.set_meta("foo", "bla bla")
            self.fail( "exception expected" )
        except gpException as ex:
            pass

        try:
            self.gp.set_meta("foo", "2<3")
            self.fail( "exception expected" )
        except gpException as ex:
            pass

    def test_getMeta(self):
        # get undefined
        val = self.gp.try_get_meta_value("foo")
        self.assertEquals( False, val )

        # set var, and get value
        self.gp.set_meta("foo", "xxx")
        val = self.gp.get_meta_value("foo")
        self.assertEquals( "xxx", val )

        # remove var, then get value
        self.gp.remove_meta("foo")
        val = self.gp.try_get_meta_value("foo")
        self.assertEquals( False, val )

    def test_removeMeta(self):
        # remove undefined
        ok = self.gp.try_remove_meta("foo")
        self.assertEquals( False, ok )

        # set var, then remove it
        self.gp.set_meta("foo", "xxx")
        ok = self.gp.try_remove_meta("foo")
        self.assertEquals( "OK", ok )

    def test_listMeta(self):
        # assert empty
        meta = self.gp.capture_list_meta()
        self.assertEmpty( meta )

        # add one, assert list
        self.gp.set_meta("foo", 1234)
        meta = self.gp.capture_list_meta_map()
        self.assertEquals( { "foo": 1234}, meta )

        # remove one, assert empty
        self.gp.remove_meta("foo")
        meta = self.gp.capture_list_meta()
        self.assertEmpty( meta )


    #TODO: add all the tests we have in the talkback test suite


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/gp/tests/test_base.py:
--------------------------------------------------------------------------------
import unittest
import os, sys, traceback
from gp.client import *
from test_config import *

test_graph_name = 'test' + str(os.getpid())

def suicide( code = 1 ):
    os._exit(code)

class TestBase:
    """A few static methods to compare lists and sets recursively,
    so the equality of entire data structures can be asserted.
    """

    @staticmethod
    def setContains( a, w ):
        """Checks if w is an element of a. If an element of a is a list,
        tuple, set or dict, a recursive comparison is performed.
        """

        found = False
        for v in a:
            if type(v) in (tuple, list):
                try:
                    if TestBase.arrayEquals( v, w ):
                        found = True
                        break
                except (TypeError, AttributeError) as e:
                    # perhaps w wasn't iterable
                    pass
            elif type(v) == set:
                try:
                    if TestBase.setEquals( v, w ):
                        found = True
                        break
                except (TypeError, AttributeError) as e:
                    # perhaps w wasn't iterable
                    pass
            elif type(v) == dict:
                raise Exception("deep dictionary comparison not yet implemented")
            else:
                if v == w:
                    found = True

        return found


    @staticmethod
    def setEquals( a, b ):
        """Determines if a and b contain the same elements. a and b may be
        sets, lists or tuples, but do not need to have the same type.
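
        For example (illustrative):
            setEquals( [ (11,), (12,) ], ( (12,), (11,) ) )  # True: order-insensitive
            setEquals( [ (11,), (12,) ], [ (11,), (13,) ] )  # False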
        """

        if len(a) != len(b):
            return False

        for v in a:
            if not TestBase.setContains(b, v):
                return False

        return True

    @staticmethod
    def arrayEquals( a, b ):
        if len(a) != len(b):
            return False

        for k in range(len(a)):
            v = a[k]
            w = b[k]

            if type(v) in (tuple, list, set):
                #WARNING: no protection against circular array references

                try:
                    if not TestBase.arrayEquals( w, v ):
                        return False
                except (TypeError, AttributeError):
                    # perhaps w wasn't iterable
                    return False
            elif type(v) == dict:
                try:
                    if not TestBase.dictEquals( w, v ):
                        return False
                except (TypeError, AttributeError):
                    # perhaps w wasn't a dict
                    return False
            elif w != v:
                return False

        return True

    @staticmethod
    def dictEquals( a, b ):
        if len(a) != len(b):
            return False

        for k, v in a.items():
            if not k in b:
                return False

            w = b[k]

            if type(v) in (tuple, list, set):
                #WARNING: no protection against circular array references

                try:
                    if not TestBase.arrayEquals( w, v ):
                        return False
                except (TypeError, AttributeError):
                    # perhaps w wasn't iterable
                    return False
            elif type(v) == dict:
                try:
                    if not TestBase.dictEquals( w, v ):
                        return False
                except (TypeError, AttributeError):
                    # perhaps w wasn't a dict
                    return False
            elif w != v:
                return False

        return True


class ConnectionTestBase(TestBase):
    """Abstract base class with basic Connection tests.

    These need to pass for all types of connections.
    @Note: lib functionality like try and capture is tested in
    SlaveTest, because it doesn't need to be tested separately
    for all types of connections

    """

    def setUp(self): # abstract
        raise NotImplementedError('subclasses must override setUp() to'
                                  + ' store a Connection in self.gp')

    def tearDown(self):
        if self.gp:
            self.gp.close()

    def test_ping(self):
        pong = self.gp.ping()
        #? no assertion here?
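        # A minimal check could be asserted here -- assuming a successful
        # ping leaves the status at 'OK', as the other tests expect:
        # self.assertStatus('OK')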

    def test_stats(self):
        stats = self.gp.capture_stats()
        stats = pairs2map(stats)
        self.assertEqual(stats['ArcCount'], 0, "arc count should be zero")

    def test_dataSetHandling(self):
        self.gp.add_arcs((( 1, 11 ),( 1, 12 ),( 11, 111 ),( 11, 112 ),))
        self.assertStatus('OK')
        self.assertStatsValue('ArcCount', 4)
        arcs = self.gp.capture_list_successors(1)
        self.assertTrue(ConnectionTestBase.setEquals(
            arcs, [(11,), (12,),]), "successors of (1)" )
        arcs = self.gp.capture_list_successors(11)
        self.assertTrue(ConnectionTestBase.setEquals(
            arcs, [(111,), (112,),]), "successors of (11)" )


    #### utility ######################################################
    def assertNone(self, value, msg = None):
        if value is not None:
            if msg is None:
                msg = "expected None, found %s" % value

            self.fail(msg)

    def assertEmpty(self, value, msg = None):
        if value:
            if msg is None:
                msg = "expected value to be empty, found %s" % value

            self.fail(msg)

    def assertNotNone(self, value, msg = None):
        if value is None:
            if msg is None:
                msg = "found None where not expected"

            self.fail(msg)

    def assertContains(self, k, array, msg = None):
        if not k in array:
            if msg is None:
                msg = "Key %s not found in %s" % (k, array)

            self.fail(msg)

    def assertStatsValue(self, field, value):
        stats = self.gp.capture_stats()
        stats = pairs2map(stats)
        self.assertEquals(value, stats[field], "stats[" + field + "]")

    def assertSessionValue(self, field, value):
        stats = self.gp.capture_session_info()
        stats = pairs2map(stats)
        self.assertEquals(value, stats[field], "session_info[" + field + "]")

    def assertStatus(self, value, message=None):
        status = self.gp.getStatus()
        self.assertEquals(value, status, message)


class SlaveTestBase(ConnectionTestBase): # abstract

    def setUp(self):
        self.dump = PipeSink(sys.stdout)

        try:
            self.gp = Connection.new_slave_connection(test_graphcore_path)
            self.gp.connect()
        except gpException as ex:
            print("Unable to launch graphcore instance from "
                  + "%s, please make sure graphcore is " % test_graphcore_path
                  + "installed and check the test_graphcore_path "
                  + "configuration options in test_config.py.")
            print("Original error: " + str(ex))
            traceback.print_exc()
            suicide(10)


class ClientTestBase(ConnectionTestBase): # abstract

    def setUp(self):
        try:
            self.gp = self.newConnection()
        except gpException as ex:
            print("Unable to connect to "
                  + "%s:%s, please make sure " % (test_graphserv_host, test_graphserv_port)
                  + "the graphserv process is running and check the "
                  + "test_graphserv_host and test_graphserv_port configuration "
                  + "options in test_config.py.")
            print("Original error: " + str(ex))
            traceback.print_exc()
            suicide(11)
        try:
            self.gp.authorize(
                'password', test_admin + ":" + test_admin_password)
        except gpException as ex:
            print("Unable to authorize as %s" % test_admin
                  + ", please check the test_admin and test_admin_password "
                  + "configuration options in test_config.py.")
            print("Original error: " + str(ex))
            traceback.print_exc()
            suicide(12)
        try:
            self.gp.create_graph(test_graph_name)
        except gpException as ex:
            print("Unable to create graph %s" % test_graph_name
                  + ", please check the test_graph_name configuration option "
                  + "in test_config.py as well as the privileges of user "
                  + test_admin + ".")
            print("Original error: " + str(ex))
            traceback.print_exc()
            suicide(13)

        self.gp.use_graph(test_graph_name)
        # if use_graph throws an error, let it rip. it really shouldn't
        # happen, and it's not a configuration problem

    def newConnection(self):
        gp = Connection.new_client_connection(
            None, test_graphserv_host, test_graphserv_port)
        gp.connect()
        return gp

    def tearDown(self):
        try:
            self.gp.drop_graph(test_graph_name)
        except gpProtocolException as ex:
            # failed to remove graph, maybe the connection is gone? try again.
            try:
                gp = self.newConnection()
                gp.authorize('password',
                    test_admin + ":" + test_admin_password)
                gp.drop_graph(test_graph_name)
            except gpException as ex:
                # give up and report the error
                raise ex

        ConnectionTestBase.tearDown(self)
--------------------------------------------------------------------------------
/gp/tests/slave_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8

import unittest
import re
import tempfile
from test_base import *
from gp.client import *


def read_lines( filepath ):
    f = open(filepath, 'r')
    lines = [ s for s in f ]

    f.close()
    return lines


# Tests a connection to a slave process, as well as general client
# lib functionality.

class SlaveTest (SlaveTestBase, unittest.TestCase):
    """Client Lib Functions

    Tested here, not in ConnectionTestBase, because we only need to test
    it once, not for every type of connection.

    @TODO: test getStatusMessage, isClosed, etc

    """

    def test_try(self):
        x = self.gp.try_foo()
        self.assertFalse(x)
        self.assertEquals('FAILED', self.gp.getStatus())

    def test_capture(self):
        """empty data"""
        a = self.gp.capture_list_roots()
        self.assertStatus( 'OK' )
        self.assertNotNone( a )
        self.assertIsInstance(a, (list,tuple))

        self.assertEquals( 0, len(a),
            "number of items in the result should be 0!" )

        # single column data
        self.gp.add_arcs(((1, 11 ),(1, 12 ), (11, 111 ), (11, 112 ),))
        a = self.gp.capture_list_successors( 1 )
        self.assertStatus( 'OK' )
        self.assertNotNone( a )
        self.assertIsInstance(a, (list,tuple))

        a = array_column( a, 0 )
        self.assertEquals( [11, 12], a,
            "unexpected response for list_successors(1): %s" % a )

        # two column data
        a = self.gp.capture_stats()
        self.assertNotNone( a )

        a = pairs2map( a )
        self.assertContains( 'ArcCount', a,
            "contents: %s" % a )
        self.assertEquals( a['ArcCount'], 4 )

        # two column data as map
        a = self.gp.capture_stats_map()
        self.assertContains( 'ArcCount', a,
            "contents: %s" % a )
        self.assertEquals( a['ArcCount'], 4 )

        # capture none
        a = self.gp.capture_traverse_successors( 77, 5 )
        self.assertStatus( 'NONE' )
        self.assertNone( a )

        # capture on command with no output
        a = self.gp.capture_clear()
        self.assertStatus( 'OK' )
        self.assertTrue( a )

        # capture throwing error
        try:
            x = self.gp.capture_foo()
            self.fail("capturing output of an unknown command should "
                      + "trigger an error")
        except gpProcessorException as e:
            # this is the expected outcome: the connection is closed.
            pass

        # capture with try
        x = self.gp.try_capture_foo()
        # should not trigger an exception...

    def dummyCallHandler(self, gp, params):
        if ( params['command'] == 'dummy' ):
            params['result'] = "test"
            return False
        return True


    def test_callHandler(self):
        h = self.dummyCallHandler
        self.gp.addCallHandler(h)

        st = self.gp.capture_stats()

        if st == False:
            self.fail("capture_stats failed and returned False")
        elif st == True:
            self.fail("capture_stats failed to capture, returned True")

        self.assertTrue(type(st) in (list, tuple),
            'capture_stats is expected to return a list, got %s!' % type(st))

        x = self.gp.dummy()
        self.assertEquals('test', x)


    def assertCommandAccepted(self, cmd, src=None, sink=None):
        s = re.sub(r'\s+', ' ', str( cmd ))

        try:
            x = self.gp.execute(cmd, src, sink)
            raise Exception(
                "dummy command should have failed in core: %s" % s)
        except gpUsageException as e:
            self.fail("command syntax should be accepted by client: %s" % s)
        except gpProcessorException as e:
            # ok, should fail in core, but be accepted by the client-side validator
            pass

    def assertCommandRejected(self, cmd, src=None, sink=None):
        s = re.sub(r'\s+', ' ', str(cmd))

        try:
            x = self.gp.execute(cmd, src, sink)
            self.fail("bad command should be detected: %s" % s)
        except gpUsageException as e:
            pass # ok
        except gpProcessorException as e:
            self.fail(
                ( "bad command should have been detected by the client: %s"
                  + "; core message: %s" ) % (s, e.getMessage()) )

    def test_commandValidation(self):
        self.assertCommandRejected( '' )
        self.assertCommandRejected(('', 23) )

        self.assertCommandRejected( None )
        self.assertCommandRejected((None, 23) )

        self.assertCommandRejected( False )
        self.assertCommandRejected((False, 23) )

        self.assertCommandRejected(() )
        self.assertCommandRejected((('foo',), ) )

        self.assertCommandRejected( '123' )
        self.assertCommandRejected(('123',) )

        self.assertCommandAccepted( ' x ' )
        self.assertCommandRejected((' x ',) )

        self.assertCommandRejected( 'y' )
        self.assertCommandRejected((' y ',) )

        self.assertCommandRejected(('a:b') )
        # 'a:b' is legal as an argument, but not as a command name!

        self.assertCommandAccepted( 'x' )
        self.assertCommandAccepted(('x',) )

        self.assertCommandAccepted( 'xyz' )
        self.assertCommandAccepted(('xyz',) )

        self.assertCommandAccepted( 'x7y' )
        self.assertCommandAccepted(('x7y',) )

        chars = "\r\n\t\0\x09^\"§\$%/()[]\{\}=?'`\\*+~.,;@\xDD"
        for ch in chars:
            s = "a " + ch + " b"

            self.assertCommandRejected( s )
            self.assertCommandRejected((s,) )


        chars = " !&<>|#:"
        for ch in chars:
            s = "a " + ch + " b"

            self.assertCommandRejected((s,) )


        # operators -----------------------------------------
        self.assertCommandAccepted( 'clear && clear' )
        self.assertCommandAccepted( 'clear !&& clear' )


        # pipes disallowed -----------------------------------------
        self.gp.allowPipes = False

        self.assertCommandRejected( 'clear > /tmp/test' )
        self.assertCommandRejected( 'clear < /tmp/test' )

        # pipes allowed -----------------------------------------
        self.gp.allowPipes = True

        self.assertCommandAccepted( 'clear > /tmp/test' )
        self.assertCommandAccepted( 'clear < /tmp/test' )

        # pipes conflict -----------------------------------------
        self.assertCommandRejected( 'clear > /tmp/test', None,
                                    NullSink.instance )
        self.assertCommandRejected( 'clear < /tmp/test',
                                    NullSource.instance, None )


    def assertArgumentAccepted(self, arg ):
        s = re.sub(r'\s+', ' ', str(arg))

        try:
            x = self.gp.execute(('foo', arg) )
            raise Exception("dummy command should have failed in core: "
                            + "foo %s" % s)
        except gpUsageException as e:
            self.fail("argument should be accepted by client: %s" % s)
        except gpProcessorException as e:
            # ok, should fail in core, but be accepted by the client-side
            # validator
            pass

    def assertArgumentRejected( self, arg ):
        s = re.sub(r'\s+', ' ', str(arg))
        try:
            x = self.gp.execute(('foo', arg) )
            self.fail("malformed argument should be detected: %s" % s)
        except gpUsageException as e:
            pass # ok
        except gpProcessorException as e:
            self.fail("malformed argument should have been detected "
                      + "by the client: %s; core message: %s"
                      % (s, e.getMessage()) )


    def test_argumentValidation(self):
        self.assertArgumentRejected( '' )
        self.assertArgumentRejected( None )
        self.assertArgumentRejected( False )
        self.assertArgumentRejected( ' x ' )

        # self.gp.setTimeout(2) # has no effect for pipes
        self.assertArgumentAccepted( 'x:y' )
        # needed for password auth!
        # NOTE: This is broken in graphcore (but works via graphserv)!

        self.assertArgumentAccepted( '123' )
        self.assertArgumentAccepted( 'x' )
        self.assertArgumentAccepted( 'xyz' )
        self.assertArgumentAccepted( 'x7y' )
        self.assertArgumentAccepted( '7x7' )

        chars = " \r\n\t\0\x09^!\"§\$%&/()[]\ \ =?'#`\\*+~., ;<>|@\xDD"
        for ch in chars:
            s = "a " + ch + " b"

            self.assertArgumentRejected(s)



    # // Client Lib I/O ///////////////////////////////////////////////////////////////
    # Tested here, not in ConnectionTestBase, because we only need to test it
    # once, not for every type of connection.
    # Note: ArraySource and ArraySink are used implicitly all the time in the
    # tests, no need to test them separately.

    def test_fileSource(self):
        f = os.path.dirname(os.path.abspath(__file__)) + '/gp.test.data'
        src = FileSource(f)

        self.gp.add_arcs( src )

        self.assertStatus( 'OK' )
        self.assertStatsValue( 'ArcCount', 4 )

        arcs = self.gp.capture_list_successors( 1 )

        self.assertTrue( ConnectionTestBase.setEquals(
            arcs, [(11, ), (12, ),]),
            "successors of (1): expected [(11,), (12,)], got %s" % arcs )

        arcs = self.gp.capture_list_successors( 11 )
        self.assertTrue( ConnectionTestBase.setEquals(
            arcs, [(111, ), (112, ),]),
            "successors of (11): expected [(111,), (112,)], got %s" % arcs )


    def test_fileSink(self):

        # set up the sink
        f = tempfile.mktemp(suffix='gpt')
        sink = FileSink(f, False, "\n")

        # generate output
        self.gp.add_arcs(((1, 11 ), (1, 12 ), (11, 111 ), (11, 112 ),))

        ok = self.gp.traverse_successors(1, 2, sink)
        sink.close()

        # make sure we can read the file
        self.assertStatus('OK')
        self.assertEquals('OK', ok)

        # compare actual file contents
        rows = read_lines(f)

        self.assertNotEquals(False, rows,
            "could not get file contents of %s" % f)
        self.assertNotNone(rows, "could not get file contents of %s" % f)

        expected = ("1\n", "11\n", "12\n", "111\n", "112\n",)
        self.assertTrue( ConnectionTestBase.setEquals(
            expected, rows), 'bad content in outfile: %s, expected %s'
            % ( rows, expected ) )

        # cleanup
        try:
            os.unlink(f)
        except:
            pass

    def test_nullSource(self):
        self.gp.add_arcs( NullSource.instance )
        self.assertStatus( 'OK' )


    def test_nullSink(self):
        # generate output
        self.gp.add_arcs(((1, 11 ), (1, 12 ), (11, 111 ), (11, 112 ),))

        ok = self.gp.traverse_successors(1, 2, NullSink.instance)
        self.assertStatus('OK')


    # //// Slave Connection Tests ///////////////////////////////////////////////////
    # currently none.
    # could check if the process really dies after quit, etc.
    # TODO: test checkPeer, etc

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/gp/tests/mysql_test.py:
--------------------------------------------------------------------------------
from test_base import *
from gp.client import *
from gp.mysql import *
from gp.mysql import _fetch_dict

import unittest
import sys

class MySQLTest (SlaveTestBase, unittest.TestCase):
    mysql = None

    def setUp(self):
        self.dump = PipeSink( sys.stdout )

        try:
            self.gp = MySQLGlue.new_slave_connection( test_graphcore_path )
            self.gp.connect()
        except gpException as ex:
            print("Unable to launch graphcore instance from %s, "
                  "please make sure graphcore is installed and check the "
                  "test_graphcore_path configuration options in test_config.py."
                  "\nOriginal error: %s" % (test_graphcore_path, ex.getMessage()))
            suicide(10)

        try:
            self.gp.mysql_connect(test_mysql_host, test_mysql_user,
                                  test_mysql_password, test_mysql_database)
        except gpException as ex:
            print("Unable to connect to database %s on MySQL host %s as %s, "
                  "please make sure MySQL is running and check the "
                  "test_mysql_host and related configuration options in "
                  "test_config.py.\nOriginal error: %s"
                  % (test_mysql_database, test_mysql_host, test_mysql_user,
                     ex.getMessage()))
            suicide(10)

    def _make_table( self, table, fieldSpec ):
        sql = "CREATE TEMPORARY TABLE IF NOT EXISTS " + table
        sql += "("
        sql += fieldSpec
        sql += ")"

        self.gp.mysql_query(sql)

        sql = "TRUNCATE TABLE " + table
        self.gp.mysql_query(sql)


    def test_Source(self):
        self._make_table( "test", "a INT NOT NULL, b INT NOT NULL" )
        self.gp.mysql_query( "INSERT INTO test VALUES (3, 8)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (7, 9)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (11, 11)" )

        #-----------------------------------------------------------
        src = self.gp.make_source( MySQLTable("test", "a", "b") )

        self.assertEquals(( 3, 8 ), src.next(), "expected row to be 3,8" )
        self.assertEquals(( 7, 9 ), src.next(), "expected row to be 7,9" )
        self.assertEquals(( 11, 11 ), src.next(), "expected row to be 11,11" )

        try:
            r = src.next()
            self.fail( "expected no more rows, got %s" % (r, ) )
        except StopIteration:
            pass

        src.close()

        #-----------------------------------------------------------
        src = self.gp.make_source( MySQLSelect("select a from test where a > 7") )

        self.assertEquals((11,), src.next(), "expected row to be 11" )

        try:
            r = src.next()
            self.fail( "expected no more rows, got %s" % (r, ) )
        except StopIteration:
            pass

        src.close()


    def test_SelectInto(self):
        self._make_table( "test", "a INT NOT NULL, b INT NOT NULL" )
        self.gp.mysql_query( "INSERT INTO test VALUES (3, 8)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (7, 9)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (11, 11)" )

        #-----------------------------------------------------------
        sink = ArraySink()
        self.gp.select_into( "select a, b from test order by a, b", sink )

        data = sink.getData()

        self.assertEquals([( 3, 8 ), ( 7, 9 ), ( 11, 11 )], data )

    def test_UnbufferedSelectInto(self):
        self._make_table( "test", "a INT NOT NULL, b INT NOT NULL" )
        self.gp.set_unbuffered(True)
        self.gp.mysql_query( "INSERT INTO test VALUES (3, 8)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (7, 9)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (11, 11)" )

        #-----------------------------------------------------------
        sink = ArraySink()
        self.gp.select_into( "select a, b from test order by a, b", sink )

        data = sink.getData()

        self.assertEquals( [ ( 3, 8 ), ( 7, 9 ), ( 11, 11 ) ], data )

    def assertNextRowEquals(self, expected, res):
        row = _fetch_dict(res)
        self.assertTrue( TestBase.dictEquals(expected, row),
            "expected row to be %s, got %s" % (expected, row) )

    def test_TempSink(self):
        snk = self.gp.make_temp_sink( MySQLTable("?", "a", "b") )
        table = snk.getTable()

        snk.putRow( (4,5) )
        snk.putRow( (6,7) )
        snk.close()

        res = self.gp.mysql_query( "SELECT a, b FROM " + table.get_name()
                                   + " ORDER BY a, b")

        self.assertNextRowEquals({ 'a': 4, 'b': 5 }, res )
        self.assertNextRowEquals({ 'a': 6, 'b': 7 }, res )
        self.assertFalse( res.fetchone(), "expected next row to be false" )

        res.close()

        snk.drop()


    def test_AddArcsFromSourceObject(self):
        self._make_table( "test", "a INT NOT NULL, b INT NOT NULL" )
        self.gp.mysql_query( "INSERT INTO test VALUES (1, 11)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (1, 12)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (11, 111)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (11, 112)" )

        #-----------------------------------------------------------
        src = self.gp.make_source( MySQLTable("test", "a", "b") )
        self.gp.add_arcs( src )
        src.close()

        self.assertStatus( 'OK' )
        self.assertStatsValue( 'ArcCount', 4 )

        arcs = self.gp.capture_list_successors( 1 )
        self.assertTrue( ConnectionTestBase.setEquals( arcs, [
            ( 11, ),
            ( 12, ),
        ] ), "successors of (1)" )

        arcs = self.gp.capture_list_successors( 11 )
        self.assertTrue( ConnectionTestBase.setEquals( arcs, [
            ( 111, ),
            ( 112, ),
        ] ), "successors of (11)" )


    def test_AddArcsFromSourceShorthand(self):
        self._make_table( "test", "a INT NOT NULL, b INT NOT NULL" )
        self.gp.mysql_query( "INSERT INTO test VALUES (1, 11)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (1, 12)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (11, 111)" )
        self.gp.mysql_query( "INSERT INTO test VALUES (11, 112)" )

        #-----------------------------------------------------------
        src = self.gp.add_arcs_from( "test a b" )
        src.close()

        self.assertStatus( 'OK' )
        self.assertStatsValue( 'ArcCount', 4 )

        arcs = self.gp.capture_list_successors( 1 )
        self.assertTrue( ConnectionTestBase.setEquals( arcs, [
            ( 11, ),
            ( 12, ),
        ] ), "successors of (1)" )

        arcs = self.gp.capture_list_successors( 11 )
        self.assertTrue( ConnectionTestBase.setEquals( arcs, [
            ( 111, ),
            ( 112, ),
        ] ), "successors of (11)" )

        #-----------------------------------------------------------
        self.gp.clear()
        stats = self.gp.capture_stats_map()
        self.assertEquals( 0, stats['ArcCount'], "ArcCount" )

        #self.gp.setDebug(True)
        src = self.gp.add_arcs_from( "select a, b from test" )
        src.close()

        stats = self.gp.capture_stats_map()
        self.assertEquals( 4, stats['ArcCount'], "ArcCount" )

        #-----------------------------------------------------------
        self.gp.clear()

        src = self.gp.add_arcs_from( ("test", "a", "b") )
        src.close()

        self.assertStatsValue( 'ArcCount', 4 )

        #-----------------------------------------------------------
        self.gp.clear()

        src = self.gp.add_arcs_from( MySQLTable("test", "a", "b") )
        src.close()

        self.assertStatsValue( 'ArcCount', 4 )


    def test_SuccessorsToSinkObject(self):
        self.gp.add_arcs( [
            ( 1, 11 ),
            ( 1, 12 ),
            ( 11, 111 ),
            ( 11, 112 ),
        ] )

        #-----------------------------------------------------------
        snk = self.gp.make_temp_sink( MySQLTable("?", "n") )
        src = self.gp.traverse_successors( 1, 8, snk )
        snk.close()
        table = snk.getTable()

        res = self.gp.mysql_query( "SELECT n FROM " + table.get_name()
                                   + " ORDER BY n")

        self.assertNextRowEquals({ 'n': 1L }, res )
        self.assertNextRowEquals({ 'n': 11L }, res )
        self.assertNextRowEquals({ 'n': 12L }, res )
        self.assertNextRowEquals({ 'n': 111L }, res )
        self.assertNextRowEquals({ 'n': 112L }, res )
        self.assertFalse( res.fetchone(), "expected next row to be False" )

        res.close()

        #-----------------------------------------------------------
        self.gp.set_max_allowed_packet(6)
        # force the inserter to flush intermittently

        snk = self.gp.make_temp_sink( MySQLTable("?", "n") )
        src = self.gp.traverse_successors( 1, 8, snk )
        snk.close()
        table = snk.getTable()

        res = self.gp.mysql_query( "SELECT n FROM " + table.get_name()
                                   + " ORDER BY n")

        self.assertNextRowEquals({ 'n': 1 }, res )
        self.assertNextRowEquals({ 'n': 11 }, res )
        self.assertNextRowEquals({ 'n': 12 }, res )
        self.assertNextRowEquals({ 'n': 111 }, res )
        self.assertNextRowEquals({ 'n': 112 }, res )
        self.assertFalse( res.fetchone(), "expected next row to be False" )

        res.close()


    def test_SuccessorsToSinkShorthand(self):
        self.gp.add_arcs( [
            ( 1, 11 ),
            ( 1, 12 ),
            ( 11, 111 ),
            ( 11, 112 ),
        ] )

        #-----------------------------------------------------------
        snk = self.gp.traverse_successors_into( 1, 8, "? n" )
        snk.close()
        table = snk.getTable()

        res = self.gp.mysql_query( "SELECT n FROM " + table.get_name()
                                   + " ORDER BY n")

        self.assertNextRowEquals({ 'n': 1 }, res )
        self.assertNextRowEquals({ 'n': 11 }, res )
        self.assertNextRowEquals({ 'n': 12 }, res )
        self.assertNextRowEquals({ 'n': 111 }, res )
        self.assertNextRowEquals({ 'n': 112 }, res )
        self.assertFalse( res.fetchone(), "expected next row to be False" )

        res.close()
        snk.drop()

        #---------------------------------------------------------
        snk = self.gp.traverse_successors_into( 1, 8, ( "?", "n" ) )
        snk.close()
        table = snk.getTable()

        res = self.gp.mysql_query( "SELECT n FROM " + table.get_name()
                                   + " ORDER BY n")

        self.assertNextRowEquals({ 'n': 1 }, res )
        self.assertNextRowEquals({ 'n': 11 }, res )
        self.assertNextRowEquals({ 'n': 12 }, res )
        self.assertNextRowEquals({ 'n': 111 }, res )
        self.assertNextRowEquals({ 'n': 112 }, res )
        self.assertFalse( res.fetchone(), "expected next row to be False" )

        res.close()
        snk.drop()

        #---------------------------------------------------------
        snk = self.gp.traverse_successors_into( 1, 8, MySQLTable("?", "n") )
        snk.close()
        table = snk.getTable()

        res = self.gp.mysql_query( "SELECT n FROM " + table.get_name()
                                   + " ORDER BY n")

        self.assertNextRowEquals({ 'n': 1 }, res )
        self.assertNextRowEquals({ 'n': 11 }, res )
        self.assertNextRowEquals({ 'n': 12 }, res )
        self.assertNextRowEquals({ 'n': 111 }, res )
        self.assertNextRowEquals({ 'n': 112 }, res )
        self.assertFalse( res.fetchone(), "expected next row to be False" )

        res.close()
        snk.drop()

        #---------------------------------------------------------
        self._make_table( "test_n", "n INT NOT NULL" )

        table = MySQLTable("test_n", "n")
        snk = self.gp.traverse_successors_into( 1, 8, table )
        snk.close()

        res = self.gp.mysql_query( "SELECT n FROM " + table.get_name()
                                   + " ORDER BY n")

        self.assertNextRowEquals({ 'n': 1 }, res )
        self.assertNextRowEquals({ 'n': 11 }, res )
        self.assertNextRowEquals({ 'n': 12 }, res )
        self.assertNextRowEquals({ 'n': 111 }, res )
        self.assertNextRowEquals({ 'n': 112 }, res )
        self.assertFalse( res.fetchone(), "expected next row to be False" )

        res.close()

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/gp/tests/server_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8

import unittest
import os
import tempfile
from test_base import *
from gp.client import *

test_graph_name = 'test' + str(os.getpid())
TestFilePrefix = '/tmp/gptest-' + str(os.getpid())


class ServerTest (ClientTestBase, unittest.TestCase):
    """Test server functions via client lib."""

    def test_createGraph(self):
        """Graph management functions"""

        global test_graph_name

        name = test_graph_name + "_2"

        # create the graph
        self.gp.create_graph(name)

        # make sure we can't create it twice
        ok = self.gp.try_create_graph(name)
        self.assertFalse( ok, "should not be able to create graph again "
                          "when it already exists" )

        # see if we can use the graph from another connection
        gp2 = self.newConnection()

        gp2.use_graph(name)

        # see if we can drop the graph while it's used
        self.gp.drop_graph(name)

        #TODO: gp2 should now report errors, because the graph is gone. test that.

        # see if we can re-create the graph after it was dropped
        self.gp.create_graph(name)
        self.gp.drop_graph(name)

        #TODO: test name restrictions


    def test_createNameRestrictions(self):
        global test_graph_name

        self.gp.strictArguments = False
        # disable strict client-side validation

        try:
            n = ''
            ok = self.gp.create_graph(n)
            self.fail("empty graph names should be forbidden!" )
        except gpException as ex:
            pass # ok


        n = '1337'
        ok = self.gp.try_create_graph(n)
        self.assertFalse(ok, "numeric graph names should be forbidden! "
                         "(name: `" + n + "`)" )

        n = '1337' + test_graph_name
        ok = self.gp.try_create_graph(n)
        self.assertFalse( ok,
            "graph names starting with a number should be forbidden! (name: `"
            + n + "`)" )

        chars = " \r\n\t\0\x09^!\"§\$%&/()[]\ \ =?'#`\\*+~.:, ;<>|@"
        for ch in chars:
            try:
                n = test_graph_name + ch + "test"
                ok = self.gp.create_graph(n)
                self.fail("graph names containing `"
                          + ch + "` should be forbidden! (name: `"
                          + n + "`)" )
            except gpException as ex:
                pass # ok

            try:
                n = ch + test_graph_name
                ok = self.gp.create_graph(n)
                self.fail("graph names starting with `"
                          + ch + "` should be forbidden! (name: `" + n + "`)")
            except gpException as ex:
                pass # ok

        n = 'test1337' + test_graph_name
        ok = self.gp.try_create_graph(n)
        self.assertEquals( 'OK', ok,
            "graph names containing numbers should be allowed! (name: `"
            + n + "`)")
        self.gp.try_drop_graph(n)

        chars = '-_8'
        for ch in chars:
            n = 'test' + ch + test_graph_name
            ok = self.gp.try_create_graph(n)
            self.assertEquals( 'OK', ok, "graph names containing `"
                + ch + "` should be allowed! (name: `" + n + "`)")
            self.gp.try_drop_graph(n)

    def test_dropGraph(self):
        global test_graph_name

        name = test_graph_name + "_2"

        self.gp.create_graph(name)
        self.gp.drop_graph(name)

        ok = self.gp.try_use_graph(name)
        self.assertFalse( ok,
            "should not be able to use graph after dropping it" )
        ok = self.gp.try_drop_graph(name)
        self.assertEquals( 'NONE', ok, "should not be able to drop "
            + "graph again after it was already dropped." )

    def test_listGraphs(self):
        global test_graph_name

        gp2 = self.newConnection()
        graphs = gp2.capture_list_graphs()
        graphs = array_column(graphs, 0)
        self.assertTrue( test_graph_name in graphs,
            "test graph test_graph_name should be in the list" )

        self.gp.drop_graph(test_graph_name)
        graphs = gp2.capture_list_graphs()
        # print("graphs: %r" % graphs)

        graphs = array_column( graphs, 0 )

        # print("graphs: %r" % graphs)

        # print("contains: %r" % ConnectionTestBase.setContains(graphs, test_graph_name))
"\n" 141 | 142 | self.assertFalse( 143 | ConnectionTestBase.setContains(graphs, test_graph_name), 144 | "test table test_graph_name should no longer be in the list" ) 145 | 146 | def test_shutdown(self): 147 | global test_graph_name 148 | 149 | gp2 = self.newConnection() 150 | gp2.use_graph(test_graph_name) 151 | gp2.stats() 152 | 153 | self.assertSessionValue('ConnectedGraph', test_graph_name) 154 | 155 | self.gp.shutdown() # <------------------ 156 | # self.assertSessionValue('ConnectedGraph', 'None'); 157 | # nice, but not reliable. race condition. 158 | 159 | self.gp.try_stats() 160 | self.assertEquals( 'FAILED', self.gp.getStatus(), 161 | 'fetching stats should fail after shutdown' ) 162 | 163 | gp2.try_stats() 164 | self.assertEquals( 'FAILED', gp2.getStatus(), 165 | 'fetching stats should fail after shutdown' ) 166 | gp2.close() 167 | 168 | gp3 = self.newConnection() 169 | gp3.try_use_graph(test_graph_name) 170 | self.assertEquals( 'FAILED', gp3.getStatus(), 171 | 'graph should be unavailable after shutdown' ) 172 | gp3.close() 173 | 174 | 175 | def test_quit(self): 176 | global test_graph_name 177 | 178 | gp2 = self.newConnection() 179 | gp2.use_graph(test_graph_name) 180 | gp2.stats() 181 | 182 | self.assertSessionValue('ConnectedGraph', test_graph_name) 183 | 184 | self.gp.quit() # <------------------ 185 | self.assertStatus('OK') 186 | 187 | try: 188 | self.gp.try_stats() 189 | self.fail( 'connection should be unusable after quit' ) 190 | except gpProtocolException, e: 191 | pass 192 | # ok 193 | 194 | 195 | gp2.stats() 196 | self.assertEquals( 'OK', gp2.getStatus(), 197 | 'connection should still be usable by others after quit; response: %s' % gp2.getResponse() ) 198 | gp2.close() 199 | 200 | gp3 = self.newConnection() 201 | gp3.use_graph(test_graph_name) 202 | self.assertEquals( 'OK', gp3.getStatus(), 203 | 'graph should still be available to others after quit; response: %s' % gp2.getResponse() ) 204 | gp3.close() 205 | 206 | 207 | # privileges 208 | def test_createGraphPrivilege(self): 209 | global test_graph_name 210 | global test_admin, test_admin_password 211 | global test_master, test_master_password 212 | 213 | name = test_graph_name + "_2" 214 | 215 | gp = self.newConnection() 216 | 217 | ok = gp.try_create_graph(name) 218 | self.assertFalse( ok, 219 | "should not be able to create a graph without authorizing" ) 220 | 221 | gp.authorize('password', 222 | test_master + ":" + test_master_password) 223 | ok = gp.try_create_graph(name) 224 | self.assertFalse( ok, 225 | "should not be able to create a graph without admin privileges" ) 226 | 227 | gp.authorize('password', 228 | test_admin + ":" + test_admin_password) 229 | # re-authenticate 230 | ok = gp.create_graph(name) 231 | self.assertEquals( ok, 'OK', 232 | "should be able to create graph with admin privileges; response: %s" % gp.getResponse() ) 233 | 234 | gp.try_drop_graph(name) 235 | # cleanup 236 | 237 | 238 | def test_dropGraphPrivilege(self): 239 | global test_graph_name 240 | global test_admin, test_admin_password 241 | global test_master, test_master_password 242 | 243 | name = test_graph_name 244 | 245 | gp = self.newConnection() 246 | 247 | ok = gp.try_drop_graph(name) 248 | self.assertFalse( ok, "should not be able to drop a graph without authorizing" ) 249 | 250 | gp.authorize('password', 251 | test_master + ":" + test_master_password) 252 | ok = gp.try_drop_graph(name) 253 | self.assertFalse( ok, 254 | "should not be able to drop a graph without admin privileges" ) 255 | 256 | gp.authorize('password', 257 | 
            test_admin + ":" + test_admin_password)
        # re-authenticate
        ok = gp.drop_graph(name)
        self.assertEquals( ok, 'OK',
            "should be able to drop graph with admin privileges; "
            "response: %s" % gp.getResponse() )

    def test_inputPipingPrivilege(self):
        global test_graph_name, test_graphserv_host
        global test_admin, test_admin_password
        global test_master, test_master_password

        #XXX: this uses local files, so it will always fail
        #     if the server isn't on localhost!
        if test_graphserv_host != 'localhost':
            return None

        f = os.path.dirname(os.path.abspath(__file__)) + '/gp.test.data'

        gp = self.newConnection()
        gp.use_graph(test_graph_name)
        gp.allowPipes = True

        gp.authorize('password',
            test_master + ":" + test_master_password)

        try:
            ok = gp.execute("add-arcs < " + f)
            self.fail(
                "should not be able to pipe without admin privileges!" )
        except gpProcessorException as ex:
            self.assertEquals( 'DENIED', gp.getStatus(),
                "piping should be denied, not fail. Message: "
                + str(ex))


        gp.authorize('password', test_admin + ":" + test_admin_password)
        # re-authenticate
        ok = gp.execute("add-arcs < " + f)
        self.assertEquals( ok, 'OK',
            "should be able to pipe with admin privileges; "
            "response: %s" % gp.getResponse() )


    def test_outputPipingPrivilege(self):
        global test_graph_name, test_graphserv_host
        global test_admin, test_admin_password
        global test_master, test_master_password

        #XXX: this uses local files, so it will always fail
        #     if the server isn't on localhost!
        if test_graphserv_host != 'localhost':
            return None

        f = tempfile.mktemp(suffix='gpt')

        gp = self.newConnection()
        gp.use_graph(test_graph_name)
        gp.allowPipes = True

        try:
            ok = gp.execute("list-roots > " + f)
            self.fail(
                "should not be able to pipe without admin privileges!" )
        except gpProcessorException as ex:
            self.assertEquals( 'DENIED', gp.getStatus(),
Message: " 322 | + str(ex)) 323 | 324 | gp.authorize( 325 | 'password', test_admin + ":" + test_admin_password) 326 | # re-authenticate 327 | ok = gp.execute("list-roots > " + f) 328 | self.assertEquals( 329 | ok, 'OK', "should be able to pipe with admin privileges; response: %s" % gp.getResponse() ) 330 | 331 | try: 332 | unlink(f) 333 | # cleanup 334 | except: 335 | pass 336 | 337 | def test_addArcsPrivilege(self): 338 | global test_graph_name 339 | global test_master, test_master_password 340 | 341 | gp = self.newConnection() 342 | gp.use_graph(test_graph_name) 343 | 344 | ok = gp.try_add_arcs(((1, 11 ), (1, 12 ) ) ) 345 | self.assertFalse( 346 | ok, "should not be able to add arcs without authorizing" ) 347 | self.assertEquals('DENIED', gp.getStatus(), 348 | "command should be denied, not fail" ) 349 | 350 | gp.authorize('password', 351 | test_master + ":" + test_master_password) 352 | ok = gp.try_add_arcs(((1, 11 ), (1, 12 ) ) ) 353 | self.assertEquals( 'OK', ok, 354 | "should be able to add arcs with updater privileges; response: %s" % gp.getResponse() ) 355 | 356 | def test_removeArcsPrivilege(self): 357 | global test_graph_name 358 | global test_master, test_master_password 359 | 360 | self.gp.add_arcs(((1, 11 ), (1, 12 ) ) ) 361 | # add some arcs as admin 362 | 363 | gp = self.newConnection() 364 | gp.use_graph(test_graph_name) 365 | 366 | ok = gp.try_remove_arcs(((1, 11 ), ) ) 367 | self.assertFalse( ok, 368 | "should not be able to delete arcs without authorizing" ) 369 | self.assertEquals( 'DENIED', gp.getStatus(), 370 | "command should be denied, not fail" ) 371 | 372 | gp.authorize('password', 373 | test_master + ":" + test_master_password) 374 | 375 | ok = gp.try_remove_arcs(((1, 11 ), ) ) 376 | self.assertEquals( 'OK', ok, 377 | "should be able to delete arcs with updater privileges; response: %s" % gp.getResponse() ) 378 | 379 | def test_replaceSuccessorsPrivilege(self): 380 | global test_graph_name 381 | global test_master, test_master_password 382 | 383 | self.gp.add_arcs(((1, 11 ), (1, 12 ) ) ) 384 | # add some arcs as admin 385 | 386 | gp = self.newConnection() 387 | gp.use_graph(test_graph_name) 388 | 389 | ok = gp.try_replace_successors( 1, (17, ) ) 390 | self.assertFalse( ok, 391 | "should not be able to replace arcs without authorizing" ) 392 | self.assertEquals( 'DENIED', gp.getStatus(), 393 | "command should be denied, not fail" ) 394 | 395 | gp.authorize('password', 396 | test_master + ":" + test_master_password) 397 | ok = gp.try_replace_successors( 1, (17, ) ) 398 | self.assertEquals( 'OK', ok, 399 | "should be able to replace arcs with updater privileges; response: %s" % gp.getResponse() ) 400 | 401 | def test_replacePredecessorsPrivilege(self): 402 | global test_graph_name 403 | global test_master, test_master_password 404 | 405 | self.gp.add_arcs(((1, 11 ), (1, 12 ) ) ) 406 | # add some arcs as admin 407 | 408 | gp = self.newConnection() 409 | gp.use_graph(test_graph_name) 410 | 411 | ok = gp.try_replace_predecessors( 1, (17, ) ) 412 | self.assertFalse( ok, 413 | "should not be able to replace arcs without authorizing" ) 414 | self.assertEquals( 'DENIED', gp.getStatus(), 415 | "command should be denied, not fail" ) 416 | 417 | gp.authorize('password', 418 | test_master + ":" + test_master_password) 419 | ok = gp.try_replace_predecessors( 1, (17, ) ) 420 | self.assertEquals( 'OK', ok, 421 | "should be able to replace arcs with updater privileges; response: %s" % gp.getResponse() ) 422 | 423 | def testClearPrivilege(self): 424 | global test_graph_name 425 | 
global test_admin, test_admin_password 426 | global test_master, test_master_password 427 | 428 | gp = self.newConnection() 429 | gp.use_graph(test_graph_name) 430 | 431 | ok = gp.try_clear() 432 | self.assertFalse( ok, 433 | "should not be able to clear a graph without authorizing" ) 434 | 435 | gp.authorize('password', 436 | test_master + ":" + test_master_password) 437 | ok = gp.try_clear() 438 | self.assertEquals( ok, 'OK', 439 | "should be able to clear graph with updater privileges" ) 440 | 441 | gp.authorize('password', 442 | test_admin + ":" + test_admin_password) 443 | # re-authenticate 444 | ok = gp.try_clear() 445 | self.assertEquals( ok, 'OK', 446 | "should be able to clear graph with admin privileges" ) 447 | 448 | def test_shutdownPrivilege(self): 449 | global test_graph_name 450 | global test_admin, test_admin_password 451 | global test_master, test_master_password 452 | 453 | gp = self.newConnection() 454 | gp.use_graph(test_graph_name) 455 | 456 | ok = gp.try_shutdown() 457 | self.assertFalse( ok, 458 | "should not be able to shut down a graph without authorizing" ) 459 | 460 | gp.authorize('password', 461 | test_master + ":" + test_master_password) 462 | ok = gp.try_shutdown() 463 | self.assertFalse( ok, "should not be able to shut down a graph " 464 | + "without admin privileges" ) 465 | 466 | gp.authorize('password', 467 | test_admin + ":" + test_admin_password) 468 | # re-authenticate 469 | ok = gp.try_shutdown() 470 | self.assertEquals( ok, 'OK', 471 | "should be able to shut down graph with admin privileges" ) 472 | 473 | 474 | def test_traverseSuccessorsWithout(self): 475 | self.gp.add_arcs( [ 476 | ( 1, 11 ), 477 | ( 1, 12 ), 478 | ( 11, 111 ), 479 | ( 11, 112 ), 480 | ( 111, 1111 ), 481 | ( 111, 1112 ), 482 | ( 112, 1121 ), 483 | ] ) 484 | 485 | self.assertStatsValue( 'ArcCount', 7 ) 486 | 487 | #-------------------------------------------- 488 | succ = self.gp.capture_traverse_successors_without( 11, 5, 111, 5 ) 489 | 490 | self.assertEquals( [ (11,), (112,), (1121,), ], succ ) 491 | 492 | 493 | 494 | 495 | #TODO: (optionally) start server instance here! let it die when the test script dies. 
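# A minimal sketch of one way to do the TODO above: launch a graphserv
# instance for the duration of the test run and let it die with the script.
# The binary path and the --port flag are assumptions, not the real server
# CLI -- check graphserv's actual options before using this.
#
#   import atexit, subprocess, time
#
#   def start_test_server(path = '/usr/local/bin/graphserv', port = test_graphserv_port):
#       proc = subprocess.Popen([path, '--port', str(port)])
#       atexit.register(proc.terminate) # dies when the test script dies
#       time.sleep(0.5)                 # crude: give it a moment to start listening
#       return proc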
496 |
497 | #TODO: CLI interface behaviour of server (port config, etc)
498 |
499 | if __name__ == '__main__':
500 |     unittest.main()
501 |
--------------------------------------------------------------------------------
/gp/mediawiki.py:
--------------------------------------------------------------------------------
1 | from client import *
2 | from mysql import *
3 |
4 | import re
5 |
6 | NS_MAIN = 0
7 | NS_TALK = 1
8 | NS_USER = 2
9 | NS_USER_TALK = 3
10 | NS_PROJECT = 4
11 | NS_PROJECT_TALK = 5
12 | NS_FILE = 6
13 | NS_FILE_TALK = 7
14 | NS_MEDIAWIKI = 8
15 | NS_MEDIAWIKI_TALK = 9
16 | NS_TEMPLATE = 10
17 | NS_TEMPLATE_TALK = 11
18 | NS_HELP = 12
19 | NS_HELP_TALK = 13
20 | NS_CATEGORY = 14
21 | NS_CATEGORY_TALK = 15
22 |
23 |
24 | class MediaWikiGlue (MySQLGlue) :
25 |
26 |     def __init__( self, transport, graphname = None ) :
27 |         super(MediaWikiGlue, self).__init__(transport, graphname)
28 |
29 |         self.table_prefix = ""
30 |
31 |         #h = array( self, 'gp_mediawiki_exec_handler' )
32 |         #self.addExecHandler( h )
33 |
34 |
35 |     def set_table_prefix ( self, prefix ) :
36 |         self.table_prefix = prefix
37 |
38 |
39 |     def get_db_key ( self, name ) :
40 |         if not name:
41 |             raise gpUsageException("name must not be empty!")
42 |
43 |         #TODO: use native MediaWiki method if available
44 |         name = name.strip()
45 |
46 |         if name == "":
47 |             raise gpUsageException("name must not be empty!")
48 |
49 |         name = re.sub(' ', '_', name)
50 |
51 |         result = name[0].upper() + name[1:] #FIXME: unreliable, handle unicode!
52 |
53 |         return result
54 |
55 |
56 |     def wiki_table ( self, name ) :
57 |         return self.table_prefix + name
58 |
59 |
60 |     def get_page_id ( self, ns, title ) :
61 |         sql = "select page_id from " + self.wiki_table( "page" )
62 |         sql += " where page_namespace = %i" % int(ns)
63 |         sql += " and page_title = " + self.quote_string( self.get_db_key(title) )
64 |
65 |         id = self.mysql_query_value( sql )
66 |         return id
67 |
68 |
69 |     def add_arcs_from_category_structure ( self, ) :
70 |         sql = "select C.page_id as parent, P.page_id as child"
71 |         sql += " from " + self.wiki_table( "page" ) + " as P "
72 |         sql += " join " + self.wiki_table( "categorylinks" ) + " as X "
73 |         sql += " on X.cl_from = P.page_id "
74 |         sql += " join " + self.wiki_table( "page" ) + " as C "
75 |         sql += " on C.page_namespace = %i" % NS_CATEGORY
76 |         sql += " and C.page_title = X.cl_to "
77 |         sql += " where P.page_namespace = %i" % NS_CATEGORY
78 |
79 |         src = self.make_source( MySQLSelect( sql ) )
80 |
81 |         self.add_arcs( src )
82 |         src.close()
83 |
84 |
85 |     def get_subcategories ( self, cat, depth, without = None, without_depth = None ) :
86 |         sink = ArraySink()
87 |
88 |         id = self.get_page_id( NS_CATEGORY, cat )
89 |         if ( not id ): return 'NONE'
90 |
91 |         if ( without ): without_id = self.get_page_id( NS_CATEGORY, without )
92 |         else: without_id = False
93 |
94 |         temp = self.make_temp_sink( MySQLTable('?', 'id') )
95 |
96 |         if ( without_id ) :
97 |             if ( not without_depth ): without_depth = depth
98 |             status = self.traverse_successors_without( id, depth, without_id, without_depth, temp )
99 |         else :
100 |             status = self.traverse_successors( id, depth, temp )
101 |
102 |
103 |         temp.close()
104 |
105 |         if ( status == 'OK' ) :
106 |             sql = "select page_title "
107 |             sql += " from " + self.wiki_table( "page" )
108 |             sql += " join " + temp.getTable().get_name()
109 |             sql += " on id = page_id "
110 |             sql += " where page_namespace = %i" % NS_CATEGORY # should be redundant
111 |             sql += " order by page_id "
112 |
113 |
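            # The traversal above streamed the subtree's page ids into the
            # temporary table; this join maps those ids back to category
            # titles, and select_into() streams the result rows into the sink.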
self.select_into( sql , sink) 114 | 115 | 116 | temp.drop() 117 | 118 | return sink.getData() 119 | 120 | @staticmethod 121 | def new_client_connection( graphname, host = False, port = False ) : 122 | return MediaWikiGlue( ClientTransport(host, port), graphname ) #FIXME: PORT graphname stuff to PHP! 123 | 124 | @staticmethod 125 | def new_slave_connection( command, cwd = None, env = None ) : 126 | return MediaWikiGlue( SlaveTransport(command, cwd, env), None ) 127 | 128 | 129 | 130 | 131 | class PageSet : 132 | 133 | def __init__ ( self, glue, table = "?", id_field = "page_id", namespace_field = "page_namespace", title_field = "page_title", big = True ) : 134 | self.big = big 135 | 136 | self.glue = glue 137 | self.table = table 138 | 139 | self.id_field = id_field 140 | self.namespace_field = namespace_field 141 | self.title_field = title_field 142 | 143 | self.table_obj = MySQLTable( self.table, self.id_field, self.namespace_field, self.title_field ) 144 | self.table_obj.set_field_definition( self.id_field, "INT NOT NULL") 145 | self.table_obj.set_field_definition( self.namespace_field, "INT DEFAULT NULL") 146 | self.table_obj.set_field_definition( self.title_field, "VARCHAR(255) BINARY DEFAULT NULL") 147 | self.table_obj.add_key_definition( "PRIMARY KEY (" + self.id_field + ")" ) 148 | self.table_obj.add_key_definition( "UNIQUE KEY (" + self.namespace_field + ", " + self.title_field + ")" ) 149 | 150 | self.table_id_obj = MySQLTable( self.table, self.id_field ) 151 | self.table_id_obj.add_key_definition( "PRIMARY KEY (" + self.id_field + ")" ) 152 | 153 | 154 | def set_expect_big ( self, big ) : 155 | self.big = big 156 | 157 | 158 | def get_table ( self, ) : 159 | return self.table_obj 160 | 161 | 162 | def create_table ( self, ) : 163 | table = self.table 164 | t = "" 165 | 166 | if ( not table or table == '?' ) : 167 | table = "gp_temp_%s" % self.glue.next_id() 168 | t = " TEMPORARY " 169 | 170 | 171 | sql = "CREATE " + t + " TABLE " + table 172 | sql += "(" 173 | sql += self.table_obj.get_field_definitions() 174 | sql += ")" 175 | 176 | self._update(sql) 177 | 178 | self.table = table 179 | self.table_obj.set_name( self.table ) 180 | self.table_id_obj.set_name( self.table ) 181 | 182 | return table 183 | 184 | 185 | 186 | def _query( self, sql, **kwargs ) : 187 | if not 'unbuffered' in kwargs: 188 | kwargs['unbuffered'] = self.big 189 | 190 | return self.glue.mysql_query(sql, **kwargs) #TODO: port kwargs to PHP 191 | 192 | def _update( self, sql, **kwargs ) : #TODO: port to PHP; use in PHP! 193 | return self.glue.mysql_update(sql, **kwargs) 194 | 195 | def add_from_select ( self, select, comment = None ) : 196 | sql= "REPLACE INTO " + self.table + " " 197 | sql += "( " 198 | sql += self.id_field + ", " 199 | sql += self.namespace_field + ", " 200 | sql += self.title_field + " ) " 201 | sql += select 202 | 203 | return self._update( sql, comment = comment ) 204 | 205 | 206 | def delete_where ( self, where, comment = None ) : 207 | sql= "DELETE FROM " + self.table + " " 208 | sql += where 209 | 210 | return self._update( sql, comment = comment ) 211 | 212 | 213 | def delete_using ( self, using, tableAlias = "T", comment = None ) : 214 | sql= "DELETE FROM " + tableAlias + " " 215 | sql += "USING " + self.table + " AS " + tableAlias + " " 216 | sql += using 217 | 218 | return self._update( sql, comment = comment ) 219 | 220 | 221 | def resolve_ids ( self, comment = None ) : 222 | #NOTE: MySQL can't perform self-joins on temp tables. 
so we need to copy the ids to another temp table first. 223 | t = MySQLTable("?", "page_id") 224 | t.add_key_definition("PRIMARY KEY (page_id)") 225 | 226 | tmp = self.glue.make_temp_table( t ) 227 | 228 | sql = tmp.get_insert(True) 229 | sql += "SELECT " + self.id_field 230 | sql += " FROM " + self.table 231 | sql += " WHERE page_title IS NULL" 232 | 233 | self._update( sql ); #copy page ids with no page title into temp table 234 | 235 | sql = "SELECT P.page_id, P.page_namespace, P.page_title " 236 | sql += " FROM " + self.glue.wiki_table("page") + " AS P " 237 | sql += " JOIN " + tmp.get_name() + " AS T ON T.page_id = P.page_id" 238 | 239 | self.add_from_select( sql, comment = comment ) #TODO: port comment to PHP 240 | 241 | self.glue.drop_temp_table( tmp ) 242 | return True 243 | 244 | 245 | def make_sink ( self, ) : 246 | sink = self.glue.make_sink( self.table_obj ) 247 | return sink 248 | 249 | 250 | def make_id_sink ( self, ) : 251 | sink = self.glue.make_sink( self.table_id_obj ) 252 | return sink 253 | 254 | 255 | def make_id_source ( self, ns = None ) : 256 | return self.make_source( ns, True ) 257 | 258 | 259 | def make_source ( self, ns = None, ids_only = False, auto_order = False ) : #TODO: PORT auto_order to PHP 260 | t = self.table_id_obj if ids_only else self.table_obj 261 | 262 | if ( ns is not None ) : 263 | select = t._get_select() 264 | 265 | if ( isinstance(ns, (tuple, list, set)) ): select += " where page_namespace in " + self.glue.as_list( ns ) 266 | else: select += " where page_namespace = %i" % int(ns) 267 | 268 | t = MySQLSelect(select) 269 | 270 | 271 | src = self.glue.make_source( t, big = self.big, auto_order = auto_order ) 272 | return src 273 | 274 | 275 | def capture ( self, ns = None, data = None ) : 276 | sink = ArraySink( data ) 277 | self.copy_to_sink( ns, sink ) 278 | return sink.getData() 279 | 280 | 281 | def capture_ids ( self, ns = None, data = None ) : 282 | sink = ArraySink( data ) 283 | self.copy_ids_to_sink( ns, sink ) 284 | return sink.getData() 285 | 286 | 287 | def copy_to_sink ( self, ns, sink ) : 288 | src = self.make_source(ns) 289 | c = self.glue.copy(src, sink, "~") 290 | src.close() 291 | return c 292 | 293 | 294 | def copy_ids_to_sink ( self, ns, sink ) : 295 | src = self.make_id_source(ns) 296 | c = self.glue.copy(src, sink, "~") 297 | src.close() 298 | return c 299 | 300 | 301 | def add_source ( self, src ) : 302 | sink = self.make_sink() 303 | c = self.glue.copy( src, sink, "+" ) 304 | sink.close() 305 | return c 306 | 307 | 308 | def add_page_set ( self, set ) : 309 | select = set.get_table()._get_select() 310 | return self.add_from_select( select ) 311 | 312 | 313 | def subtract_page_set ( self, set ) : 314 | t = set.get_table() 315 | return self.subtract_table( t ) 316 | 317 | 318 | def subtract_source ( self, src ): #XXX: must be a 1 column id source... 319 | t = MySQLTable("?", "page_id") 320 | sink = self.glue.make_temp_sink( t ) 321 | t = sink.getTable() 322 | 323 | self.glue.copy( src, sink, "+" ) 324 | 325 | ok = self.subtract_table(t, "page_id") 326 | 327 | self.glue.drop_temp_table(t) 328 | sink.close() 329 | 330 | return ok 331 | 332 | 333 | def retain_page_set ( self, set ) : 334 | t = set.get_table() 335 | return self.retain_table( t ) 336 | 337 | 338 | def retain_source ( self, src ) : #XXX: must be a 1 column id source... 
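        # Mirror image of subtract_source(): the ids from src are first
        # collected into a temp table, then retain_table() deletes every row
        # of this set whose id does NOT appear in that table.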
339 |         t = MySQLTable("?", "page_id")
340 |         sink = self.glue.make_temp_sink( t )
341 |         t = sink.getTable()
342 |
343 |         self.glue.copy( src, sink, "+" )
344 |
345 |         ok = self.retain_table(t, "page_id")
346 |
347 |         self.glue.drop_temp_table(t)
348 |         sink.close()
349 |
350 |         return ok
351 |
352 |
353 |     def subtract_table ( self, table, id_field = None ) :
354 |         if ( not id_field ): id_field = table.get_field1()
355 |
356 |         sql = "DELETE FROM T "
357 |         sql += " USING " + self.table + " AS T "
358 |         sql += " JOIN " + table.get_name() + " AS R "
359 |         sql += " ON T." + self.id_field + " = R." + id_field
360 |
361 |         self._update(sql)
362 |         return True
363 |
364 |
365 |     def retain_table ( self, table, id_field = None ) :
366 |         if ( not id_field ): id_field = table.get_field1()
367 |
368 |         sql = "DELETE FROM T "
369 |         sql += " USING " + self.table + " AS T "
370 |         sql += " LEFT JOIN " + table.get_name() + " AS R "
371 |         sql += " ON T." + self.id_field + " = R." + id_field
372 |         sql += " WHERE R." + id_field + " IS NULL"
373 |
374 |         self._update(sql)
375 |         return True
376 |
377 |
378 |     def remove_page ( self, ns, title ) :
379 |         sql = "DELETE FROM " + self.table
380 |         sql += " WHERE " + self.namespace_field + " = %i" % int(ns)
381 |         sql += " AND " + self.title_field + " = " + self.glue.quote_string(title)
382 |
383 |         self._update(sql)
384 |         return True
385 |
386 |
387 |     def remove_page_id ( self, id ) :
388 |         sql = "DELETE FROM " + self.table
389 |         sql += " WHERE " + self.id_field + " = %i" % int(id)
390 |
391 |         self._update(sql)
392 |         return True
393 |
394 |
395 |     def strip_namespace ( self, ns, inverse = False ) :
396 |         sql = "DELETE FROM " + self.table
397 |         sql += " WHERE " + self.namespace_field
398 |
399 |         if ( isinstance(ns, (tuple, list, set)) ): sql += ( " not in " if inverse else " in " ) + self.glue.as_list( ns )
400 |         else: sql += ( " != " if inverse else " = " ) + str(int(ns))
401 |
402 |         self._update(sql)
403 |         return True
404 |
405 |
406 |     def retain_namespace ( self, ns ) :
407 |         return self.strip_namespace( ns, True )
408 |
409 |
410 |     def add_page ( self, id, ns, title ) :
411 |         if ( not id ): id = self.glue.get_page_id( ns, title ) # look the id up by namespace and title if not given
412 |
413 |         values = (id, ns, title)
414 |
415 |         sql = self.table_obj.get_insert()
416 |         sql += " VALUES "
417 |         sql += self.glue.as_list(values)
418 |
419 |         self._update( sql )
420 |         return True
421 |
422 |
423 |     def add_page_id ( self, id ) :
424 |         values = (id,)
425 |
426 |         sql = "INSERT IGNORE INTO " + self.table
427 |         sql += " ( " + self.id_field + " ) "
428 |         sql += " VALUES "
429 |         sql += self.glue.as_list(values)
430 |
431 |         self._update( sql )
432 |         return True
433 |
434 |
435 |     def expand_categories ( self, ns = None, comment = None ) :
436 |         #NOTE: MySQL can't perform self-joins on temp tables, so we need to copy the category names to another temp table first.
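        # (Specifically, MySQL refuses to open the same TEMPORARY table twice
        # in one statement -- error 1137, "Can't reopen table" -- hence the copy.)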
437 | t = MySQLTable("?", "cat_title") 438 | t.set_field_definition("cat_title", "VARCHAR(255) BINARY NOT NULL") 439 | t.add_key_definition("PRIMARY KEY (cat_title)") 440 | 441 | tmp = self.glue.make_temp_table( t ) 442 | 443 | sql = tmp.get_insert(True) 444 | sql += " select page_title " 445 | sql += " from " + self.table + " as T " 446 | sql += " where page_namespace = %i " % NS_CATEGORY 447 | 448 | self._update( sql ) 449 | #self.glue.dump_query("select * from " +tmp.get_name()) 450 | 451 | # ---------------------------------------------------------- 452 | sql = "select P.page_id, P.page_namespace, P.page_title " 453 | sql += " from " + self.glue.wiki_table( "page" ) + " as P " 454 | sql += " join " + self.glue.wiki_table( "categorylinks" ) + " as X " 455 | sql += " on X.cl_from = P.page_id " 456 | sql += " join " + tmp.get_name() + " as T " 457 | sql += " on T.cat_title = X.cl_to " 458 | 459 | if (ns is not None) : 460 | if ( isinstance(ns, (tuple, list, set)) ): sql += " where P.page_namespace in " + self.glue.as_list( ns ) 461 | else: sql += " where P.page_namespace = %i" % int(ns) 462 | 463 | 464 | #self.glue.dump_query(sql) 465 | self.add_from_select( sql, comment = comment ) #TODO: port comment to PHP 466 | 467 | #self.glue.dump_query("select * from " +self.table) 468 | self.glue.drop_temp_table( tmp ) 469 | return True 470 | 471 | 472 | def add_subcategories ( self, cat, depth, without = None, without_depth = None ) : 473 | self._add_subcategory_ids(cat, depth, without, without_depth) 474 | self.resolve_ids() 475 | return True 476 | 477 | 478 | def _add_subcategory_ids( self, cat, depth, without = None, without_depth = None ) : 479 | id = self.glue.get_page_id( NS_CATEGORY, cat ) 480 | if ( not id ): return False 481 | 482 | if ( without ): without_id = self.glue.get_page_id( NS_CATEGORY, without ) 483 | else: without_id = False 484 | 485 | sink = self.make_id_sink() 486 | 487 | if ( without_id ) : 488 | if ( not without_depth ): without_depth = depth 489 | status = self.glue.traverse_successors_without( id, depth, without_id, without_depth, sink ) 490 | else : 491 | status = self.glue.traverse_successors( id, depth, sink ) 492 | 493 | 494 | sink.close() 495 | return True 496 | 497 | def get_size(self): 498 | res = self._query("SELECT COUNT(*) FROM " + self.table) 499 | try: 500 | row = res.fetchone() 501 | finally: 502 | res.close() 503 | 504 | return row[0] 505 | 506 | def add_pages_in ( self, cat, ns, depth, comment = None ) : 507 | self.get_size() 508 | 509 | if ( not self.add_subcategories(cat, depth) ): 510 | return False 511 | 512 | self.get_size() # ?! 
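        # (The two get_size() calls in this method discard their results;
        # they look like debugging leftovers -- hence the "?!" above.)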
513 | 514 | self.expand_categories(ns, comment = comment) 515 | return True 516 | 517 | 518 | def add_pages_transclusing ( self, tag, ns = None, comment = None ) : 519 | if ( ns is None ): ns = NS_TEMPLATE 520 | tag = self.glue.get_db_key( tag ) 521 | 522 | sql = " SELECT page_id, page_namespace, page_title " 523 | sql += " FROM " + self.glue.wiki_table( "page" ) 524 | sql += " JOIN " + self.glue.wiki_table( "templatelinks" ) 525 | sql += " ON tl_from = page_id " 526 | sql += " WHERE tl_namespace = %i" % int(ns) 527 | sql += " AND tl_title = " + self.glue.quote_string(tag) 528 | 529 | return self.add_from_select(sql, comment = comment) 530 | 531 | 532 | def clear ( self, ) : 533 | sql = "TRUNCATE " + self.table 534 | self._update(sql) 535 | return True 536 | 537 | 538 | def dispose ( self, ) : 539 | sql = "DROP TEMPORARY TABLE " + self.table 540 | self._update(sql) 541 | return True 542 | 543 | 544 | 545 | -------------------------------------------------------------------------------- /gp/tests/mediawiki_test.py: -------------------------------------------------------------------------------- 1 | from gp.mediawiki import * 2 | from gp.client import * 3 | from gp.mysql import * 4 | from test_base import * 5 | 6 | import unittest 7 | import sys 8 | 9 | class MediaWikiTest (SlaveTestBase, unittest.TestCase): 10 | 11 | def setUp(self) : 12 | self.dump = PipeSink( sys.stdout ) 13 | 14 | try : 15 | self.gp = MediaWikiGlue.new_slave_connection( test_graphcore_path ) 16 | self.gp.connect() 17 | except gpException as ex: 18 | print "Unable to launch graphcore instance from %s, please make sure graphcore is installed and check the test_graphcore_path configuration options in test_config.py.\nOriginal error: %s " % (test_graphcore_path, ex.getMessage() ) 19 | suicide(10) 20 | 21 | 22 | try : 23 | self.gp.mysql_connect( test_mysql_host, test_mysql_user, test_mysql_password, test_mysql_database ) 24 | self.gp.set_table_prefix( test_mediawiki_table_prefix ) 25 | except gpException as ex: 26 | print "Unable to connect to database %s on MySQL host %s as %s, please make sure MySQL is running and check the test_mysql_host and related configuration options in test_cofig.py.\nOriginal error: %s " % (test_mysql_database, test_mysql_host, test_mysql_user, ex.getMessage() ) 27 | suicide(10) 28 | 29 | 30 | 31 | def _makeTable( self, table, fieldSpec, temp = False ) : 32 | t = " TEMPORARY " if temp else "" 33 | sql = "CREATE " + t + " TABLE IF NOT EXISTS " + table 34 | sql += "(" 35 | sql += fieldSpec 36 | sql += ")" 37 | 38 | self.gp.mysql_query(sql) 39 | 40 | sql = "TRUNCATE TABLE " + table 41 | self.gp.mysql_query(sql) 42 | 43 | 44 | def _makeWikiTable( self, name, spec ) : 45 | name = test_mediawiki_table_prefix + name 46 | 47 | self._makeTable( name, spec ) 48 | return name 49 | 50 | 51 | def _makeWikiStructure( self ) : 52 | p = self._makeWikiTable( "page", "page_id INT NOT NULL, page_namespace INT NOT NULL, page_title VARCHAR(255) NOT NULL, PRIMARY KEY (page_id), UNIQUE KEY (page_namespace, page_title)" ) 53 | self.gp.mysql_query( "TRUNCATE " + p ) 54 | 55 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (1, " + str(NS_MAIN) + ", 'Main_Page')" ) 56 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (2, " + str(NS_PROJECT) + ", 'Help_Out')" ) 57 | 58 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (10, " + str(NS_CATEGORY) + ", 'ROOT')" ) 59 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (20, " + str(NS_CATEGORY) + ", 'Portals')" ) 60 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES 
(110, " + str(NS_CATEGORY) + ", 'Topics')" ) 61 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (1110, " + str(NS_CATEGORY) + ", 'Beer')" ) 62 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (1111, " + str(NS_MAIN) + ", 'Lager')" ) 63 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (1112, " + str(NS_MAIN) + ", 'Pils')" ) 64 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (2110, " + str(NS_CATEGORY) + ", 'Cheese')" ) 65 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (120, " + str(NS_CATEGORY) + ", 'Maintenance')" ) 66 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (1120, " + str(NS_CATEGORY) + ", 'Bad_Cheese')" ) 67 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (1122, " + str(NS_MAIN) + ", 'Toe_Cheese')" ) 68 | self.gp.mysql_query( "INSERT INTO " + p + " VALUES (333, " + str(NS_TEMPLATE) + ", 'Yuck')" ) 69 | 70 | cl = self._makeWikiTable( "categorylinks", "cl_from INT NOT NULL, cl_to VARCHAR(255) NOT NULL, PRIMARY KEY (cl_from, cl_to), INDEX cl_to (cl_to)" ) 71 | self.gp.mysql_query( "TRUNCATE " + cl ) 72 | 73 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (1, 'Portals')" ) 74 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (2, 'Portals')" ) 75 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (20, 'ROOT')" ) 76 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (120, 'ROOT')" ) 77 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (110, 'ROOT')" ) 78 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (1110, 'Topics')" ) 79 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (2110, 'Topics')" ) 80 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (1111, 'Beer')" ) 81 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (1112, 'Beer')" ) 82 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (1120, 'Maintenance')" ) 83 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (1120, 'Cheese')" ) 84 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (1120, 'Cruft')" ) 85 | self.gp.mysql_query( "INSERT INTO " + cl + " VALUES (1122, 'Bad_Cheese')" ) 86 | 87 | tl = self._makeWikiTable( "templatelinks", "tl_from INT NOT NULL, tl_namespace INT NOT NULL, tl_title VARCHAR(255) NOT NULL, PRIMARY KEY (tl_from, tl_namespace, tl_title), INDEX tl_to (tl_namespace, tl_title)" ) 88 | self.gp.mysql_query( "TRUNCATE " + tl ) 89 | 90 | self.gp.mysql_query( "INSERT INTO " + tl + " VALUES (1122, " + str(NS_TEMPLATE) + ", 'Yuck')" ) 91 | self.gp.mysql_query( "INSERT INTO " + tl + " VALUES (1111, " + str(NS_TEMPLATE) + ", 'Yuck')" ) 92 | 93 | 94 | 95 | ########################################### 96 | 97 | def test_TraverseSuccessors( self ) : 98 | self.gp.add_arcs( [ 99 | ( 1, 11 ), 100 | ( 1, 12 ), 101 | ( 11, 111 ), 102 | ( 11, 112 ), 103 | ( 111, 1111 ), 104 | ( 111, 1112 ), 105 | ( 112, 1121 ), 106 | ] ) 107 | 108 | self.assertStatsValue( 'ArcCount', 7 ) 109 | 110 | #-------------------------------------------- 111 | succ = self.gp.capture_traverse_successors( 11, 5 ) 112 | 113 | self.assertEquals( [ ( 11, ), ( 111, ), ( 112, ), ( 1111, ), ( 1112, ), ( 1121, ), ], succ ) 114 | 115 | 116 | ########################################### 117 | 118 | def test_AddArcsFromCategoryStructure( self ) : 119 | self._makeWikiStructure() 120 | 121 | #----------------------------------------------------------- 122 | self.gp.add_arcs_from_category_structure() 123 | 124 | #----------------------------------------------------------- 125 | a = self.gp.capture_list_successors( 10 ) 126 | self.assertEquals([( 20, ), ( 110, ), ( 120, )], a ) 127 | 128 | a = 
self.gp.capture_list_predecessors( 1120 ) 129 | self.assertEquals([( 120, ), ( 2110, )], a ) 130 | 131 | a = self.gp.capture_traverse_successors( 110, 5 ) 132 | self.assertEquals([( 110, ), ( 1110, ), ( 2110, ), ( 1120, )], a ) 133 | 134 | 135 | def test_GetSubcategories( self ) : 136 | self._makeWikiStructure() 137 | self.gp.add_arcs_from_category_structure() 138 | 139 | #----------------------------------------------------------- 140 | a = self.gp.get_subcategories("topics", 5) 141 | self.assertEquals([( "Topics", ), 142 | ( "Beer", ), 143 | ( "Bad_Cheese", ), 144 | ( "Cheese", )], a ) 145 | 146 | #----------------------------------------------------------- 147 | a = self.gp.get_subcategories("topics", 5, "maintenance") 148 | self.assertEquals([( "Topics", ), 149 | ( "Beer", ), 150 | ( "Cheese", )], a ) 151 | 152 | 153 | ########################################### 154 | def test_AddSubcategories( self ) : 155 | self._makeWikiStructure() 156 | self.gp.add_arcs_from_category_structure() 157 | 158 | pages = PageSet(self.gp) 159 | pages.create_table() 160 | 161 | #----------------------------------------------------------- 162 | pages.clear() 163 | ok = pages.add_subcategories("topics", 5) 164 | self.assertTrue( ok ) 165 | 166 | a = pages.capture() 167 | self.assertEquals([(110, NS_CATEGORY, "Topics"), 168 | (1110, NS_CATEGORY, "Beer"), 169 | (1120, NS_CATEGORY, "Bad_Cheese"), 170 | (2110, NS_CATEGORY, "Cheese")], a ) 171 | 172 | #----------------------------------------------------------- 173 | pages.clear() 174 | ok = pages.add_subcategories("Portals", 5) 175 | self.assertTrue( ok ) 176 | 177 | a = pages.capture() 178 | self.assertEquals([(20, NS_CATEGORY, "Portals")], a ) 179 | 180 | #----------------------------------------------------------- 181 | pages.dispose() 182 | 183 | 184 | def test_AddPagesTranscluding( self ) : 185 | self._makeWikiStructure() 186 | self.gp.add_arcs_from_category_structure() 187 | 188 | pages = PageSet(self.gp) 189 | pages.create_table() 190 | 191 | #----------------------------------------------------------- 192 | pages.clear() 193 | ok = pages.add_pages_transclusing("yuck") 194 | self.assertTrue( ok ) 195 | 196 | a = pages.capture() 197 | self.assertEquals([(1111, NS_MAIN, "Lager"), 198 | (1122, NS_MAIN, "Toe_Cheese")], a ) 199 | 200 | #----------------------------------------------------------- 201 | pages.dispose() 202 | 203 | 204 | def test_AddPagesIn( self ) : 205 | self._makeWikiStructure() 206 | self.gp.add_arcs_from_category_structure() 207 | 208 | pages = PageSet(self.gp) 209 | pages.create_table() 210 | 211 | #----------------------------------------------------------- 212 | pages.clear() 213 | ok = pages.add_pages_in("topics", None, 5) 214 | self.assertTrue( ok ) 215 | 216 | a = pages.capture() 217 | expected = [ (110, NS_CATEGORY, "Topics"), 218 | (1110, NS_CATEGORY, "Beer"), 219 | (1111, NS_MAIN, "Lager"), 220 | (1112, NS_MAIN, "Pils"), 221 | (1120, NS_CATEGORY, "Bad_Cheese"), 222 | (1122, NS_MAIN, "Toe_Cheese"), 223 | (2110, NS_CATEGORY, "Cheese") ] 224 | 225 | self.assertEquals(expected, a ) 226 | 227 | #----------------------------------------------------------- 228 | pages.clear() 229 | ok = pages.add_pages_in("topics", None, 5) 230 | self.assertTrue( ok ) 231 | 232 | a = pages.capture( NS_MAIN ) 233 | self.assertEquals([(1111, NS_MAIN, "Lager"), 234 | (1112, NS_MAIN, "Pils"), 235 | (1122, NS_MAIN, "Toe_Cheese")], a ) 236 | 237 | #----------------------------------------------------------- 238 | pages.clear() 239 | ok = 
pages.add_pages_in("Portals", NS_MAIN, 5) 240 | self.assertTrue( ok ) 241 | 242 | a = pages.capture() 243 | self.assertEquals([(1, NS_MAIN, "Main_Page"), 244 | (20, NS_CATEGORY, "Portals")], a ) 245 | 246 | #----------------------------------------------------------- 247 | pages.clear() 248 | ok = pages.add_pages_in("portals", (NS_MAIN, NS_PROJECT), 5) 249 | self.assertTrue( ok ) 250 | 251 | a = pages.capture( (NS_MAIN, NS_PROJECT) ) 252 | self.assertEquals([(1, NS_MAIN, "Main_Page"), 253 | (2, NS_PROJECT, "Help_Out")], a ) 254 | 255 | #----------------------------------------------------------- 256 | pages.dispose() 257 | 258 | 259 | def test_BufferedAddPagesIn( self ) : 260 | self._makeWikiStructure() 261 | self.gp.add_arcs_from_category_structure() 262 | 263 | pages = PageSet(self.gp) 264 | pages.set_expect_big(False) 265 | pages.create_table() 266 | 267 | #----------------------------------------------------------- 268 | pages.clear() 269 | ok = pages.add_pages_in("topics", None, 5) 270 | self.assertTrue( ok ) 271 | 272 | a = pages.capture() 273 | expected = [(110, NS_CATEGORY, "Topics"), 274 | (1110, NS_CATEGORY, "Beer"), 275 | (1111, NS_MAIN, "Lager"), 276 | (1112, NS_MAIN, "Pils"), 277 | (1120, NS_CATEGORY, "Bad_Cheese"), 278 | (1122, NS_MAIN, "Toe_Cheese"), 279 | (2110, NS_CATEGORY, "Cheese") ] 280 | 281 | self.assertEquals(expected, a ) 282 | 283 | #----------------------------------------------------------- 284 | pages.dispose() 285 | 286 | 287 | def test_SubtractPageSet( self ) : 288 | self._makeWikiStructure() 289 | self.gp.add_arcs_from_category_structure() 290 | 291 | pages = PageSet(self.gp) 292 | pages.create_table() 293 | 294 | rpages = PageSet(self.gp) 295 | rpages.create_table() 296 | 297 | #----------------------------------------------------------- 298 | ok = pages.add_pages_in("topics", None, 5) 299 | ok = rpages.add_pages_in("Maintenance", None, 5) 300 | 301 | ok = pages.subtract_page_set( rpages ) 302 | self.assertTrue( ok ) 303 | 304 | a = pages.capture() 305 | expected = [ (110, NS_CATEGORY, "Topics"), 306 | (1110, NS_CATEGORY, "Beer"), 307 | (1111, NS_MAIN, "Lager"), 308 | (1112, NS_MAIN, "Pils"), 309 | (2110, NS_CATEGORY, "Cheese") ] 310 | 311 | self.assertEquals(expected, a ) 312 | 313 | #----------------------------------------------------------- 314 | pages.dispose() 315 | rpages.dispose() 316 | 317 | 318 | def test_RetainPageSet( self ) : 319 | self._makeWikiStructure() 320 | self.gp.add_arcs_from_category_structure() 321 | 322 | pages = PageSet(self.gp) 323 | pages.create_table() 324 | 325 | rpages = PageSet(self.gp) 326 | rpages.create_table() 327 | 328 | #----------------------------------------------------------- 329 | ok = pages.add_pages_in("topics", None, 5) 330 | ok = rpages.add_pages_in("Maintenance", None, 5) 331 | 332 | ok = pages.retain_page_set( rpages ) 333 | self.assertTrue( ok ) 334 | 335 | a = pages.capture() 336 | expected = [ (1120, NS_CATEGORY, "Bad_Cheese"), 337 | (1122, NS_MAIN, "Toe_Cheese") ] 338 | 339 | self.assertEquals(expected, a ) 340 | 341 | #----------------------------------------------------------- 342 | pages.dispose() 343 | rpages.dispose() 344 | 345 | 346 | def test_AddPageSet( self ) : 347 | self._makeWikiStructure() 348 | self.gp.add_arcs_from_category_structure() 349 | 350 | beer = PageSet(self.gp) 351 | beer.create_table() 352 | 353 | cheese = PageSet(self.gp) 354 | cheese.create_table() 355 | 356 | #----------------------------------------------------------- 357 | ok = cheese.add_pages_in("Cheese", None, 5) 358 | ok = 
beer.add_pages_in("Beer", None, 5) 359 | 360 | ok = cheese.add_page_set( beer ) 361 | self.assertTrue( ok ) 362 | 363 | a = cheese.capture() 364 | expected = [ (1110, NS_CATEGORY, "Beer"), 365 | (1111, NS_MAIN, "Lager"), 366 | (1112, NS_MAIN, "Pils"), 367 | (1120, NS_CATEGORY, "Bad_Cheese"), 368 | (1122, NS_MAIN, "Toe_Cheese"), 369 | (2110, NS_CATEGORY, "Cheese") ] 370 | 371 | self.assertEquals(expected, a ) 372 | 373 | #----------------------------------------------------------- 374 | beer.dispose() 375 | cheese.dispose() 376 | 377 | 378 | def test_DeleteWhere( self ) : 379 | self._makeWikiStructure() 380 | self.gp.add_arcs_from_category_structure() 381 | 382 | pages = PageSet(self.gp) 383 | pages.create_table() 384 | 385 | pages.add_pages_in("topics", None, 5) 386 | 387 | #----------------------------------------------------------- 388 | pages.delete_where( "where page_namespace = %i" % NS_CATEGORY ) 389 | 390 | a = pages.capture() 391 | expected = [ (1111, NS_MAIN, "Lager"), 392 | (1112, NS_MAIN, "Pils"), 393 | (1122, NS_MAIN, "Toe_Cheese") ] 394 | 395 | self.assertEquals(expected, a ) 396 | 397 | #----------------------------------------------------------- 398 | pages.dispose() 399 | 400 | 401 | def test_DeleteUsing( self ) : 402 | self._makeWikiStructure() 403 | self.gp.add_arcs_from_category_structure() 404 | 405 | pages = PageSet(self.gp) 406 | pages.create_table() 407 | 408 | pages.add_pages_in("topics", None, 5) 409 | 410 | #----------------------------------------------------------- 411 | sql = " JOIN " + self.gp.wiki_table("templatelinks") + " as X " 412 | sql += " ON T.page_id = X.tl_from " 413 | sql += " WHERE X.tl_namespace = %i" % NS_TEMPLATE 414 | sql += " AND X.tl_title = " + self.gp.quote_string("Yuck") 415 | 416 | pages.delete_using( sql ) 417 | 418 | a = pages.capture(NS_MAIN) 419 | expected = [ (1112, NS_MAIN, "Pils") ] 420 | 421 | self.assertEquals(expected, a ) 422 | 423 | #----------------------------------------------------------- 424 | pages.dispose() 425 | 426 | 427 | def test_StripNamespace( self ) : 428 | self._makeWikiStructure() 429 | self.gp.add_arcs_from_category_structure() 430 | 431 | pages = PageSet(self.gp) 432 | pages.create_table() 433 | 434 | #----------------------------------------------------------- 435 | pages.clear() 436 | pages.add_pages_in("topics", None, 5) 437 | pages.strip_namespace( NS_CATEGORY ) 438 | 439 | a = pages.capture() 440 | expected = [ (1111, NS_MAIN, "Lager"), 441 | (1112, NS_MAIN, "Pils"), 442 | (1122, NS_MAIN, "Toe_Cheese") ] 443 | 444 | self.assertEquals(expected, a ) 445 | 446 | #----------------------------------------------------------- 447 | pages.clear() 448 | pages.add_pages_in("Portals", None, 5) 449 | pages.strip_namespace( (NS_CATEGORY, NS_PROJECT) ) 450 | 451 | a = pages.capture() 452 | expected = [ (1, NS_MAIN, "Main_Page") ] 453 | 454 | self.assertEquals(expected, a ) 455 | 456 | #----------------------------------------------------------- 457 | pages.dispose() 458 | 459 | 460 | def test_RetainNamespace( self ) : 461 | self._makeWikiStructure() 462 | self.gp.add_arcs_from_category_structure() 463 | 464 | pages = PageSet(self.gp) 465 | pages.create_table() 466 | 467 | #----------------------------------------------------------- 468 | pages.clear() 469 | pages.add_pages_in("topics", None, 5) 470 | pages.retain_namespace( (NS_MAIN,) ) 471 | 472 | a = pages.capture() 473 | expected = [ (1111, NS_MAIN, "Lager"), 474 | (1112, NS_MAIN, "Pils"), 475 | (1122, NS_MAIN, "Toe_Cheese") ] 476 | 477 | 
self.assertEquals(expected, a )
478 |
479 |         #-----------------------------------------------------------
480 |         pages.clear()
481 |         pages.add_pages_in("Portals", None, 5)
482 |         pages.retain_namespace( NS_MAIN )
483 |
484 |         a = pages.capture()
485 |         expected = [ (1, NS_MAIN, "Main_Page") ]
486 |
487 |         self.assertEquals(expected, a )
488 |
489 |
490 |
491 | if __name__ == '__main__':
492 |     unittest.main()
493 |
494 |
495 |
--------------------------------------------------------------------------------
/gp/mysql.py:
--------------------------------------------------------------------------------
1 | from client import *
2 | from client import __function__
3 |
4 | import types
5 | import re
6 | import MySQLdb, MySQLdb.cursors
7 | import warnings
8 |
9 | class MySQLSource (DataSource):
10 |
11 |     def __init__(self, result, table):
12 |         self.result = result
13 |         self.table = table
14 |
15 |
16 |     def next(self):
17 |         # XXX: if we knew that the order of fields in the result set is the same
18 |         # as the order given in self.table, we could just use result.fetchone()
19 |
20 |         raw = _fetch_dict( self.result )
21 |
22 |         if not raw:
23 |             raise StopIteration()
24 |
25 |         row = ( raw.get( f ) for f in self.table.get_fields() )
26 |
27 |         return tuple( row )
28 |
29 |
30 |     def close (self):
31 |         self.result.close()
32 |
33 |
34 | def strip_qualifier( n ): # module-level helper: strips a "table." qualifier from a field name
35 |     return re.sub(r'^.*\.', '', n)
36 |
37 | class MySQLTable (object):
38 |
39 |     def __init__(self, name, *fields):
40 |         self.name = name
41 |
42 |         self.field_definitions = {}
43 |         self.key_definitions = []
44 |
45 |         if ( isinstance(fields[0], (tuple, list) ) ): self.fields = fields[0]
46 |         else: self.fields = fields
47 |
48 |         for f in self.fields:
49 |             if ( not f ): raise gpUsageException( "empty field name!" )
50 |
51 |         #(leftover from the PHP port: trimming of trailing empty field names)
52 |         #for ( i = count(self.fields) -1; i >= 0; i-- ):
53 |         #if ( self.fields[i] ) break
54 |
55 |
56 |         #if i+1 < count(self.fields) :
57 |         #self.fields = array_slice(self.fields, 0, i+1)
58 |
59 |
60 |     def set_name( self, name ):
61 |         self.name = name
62 |
63 |
64 |     def set_fields(self, fields ):
65 |         self.fields = fields
66 |
67 |
68 |     def set_field_definition(self, field, decl ):
69 |         self.field_definitions[field] = decl
70 |
71 |
72 |     def add_key_definition(self, keyDef ):
73 |         self.key_definitions.append( keyDef )
74 |
75 |
76 |     def get_name(self,):
77 |         return self.name
78 |
79 |
80 |
81 |     def get_field(self, n ):
82 |         return self.fields[ n-1 ]
83 |
84 |
85 |     def get_field1(self, basename_only = False ):
86 |         if ( basename_only ): return strip_qualifier( self.get_field(1) )
87 |         else: return self.get_field(1)
88 |
89 |
90 |     def get_field2(self, basename_only = False ):
91 |         if ( basename_only ): return strip_qualifier( self.get_field(2) )
92 |         else: return self.get_field(2)
93 |
94 |
95 |     def get_fields(self,):
96 |         return self.fields
97 |
98 |
99 |     def get_field_list(self,):
100 |         return ", ".join( self.fields )
101 |
102 |
103 |     def get_field_definitions(self,):
104 |         s = ""
105 |
106 |         for f in self.fields:
107 |             if ( not f ): continue #XXX: should not happen!
108 |             if ( len(s) > 0 ) : s+= ", "
109 |
110 |             if ( f in self.field_definitions ) :
111 |                 s += " %s %s " % (f, self.field_definitions[f])
112 |             else:
113 |                 s += f + " INT NOT NULL "
114 |
115 |
116 |         for k in self.key_definitions:
117 |             if ( len(s)>0 ): s+= ", "
118 |             s += k
119 |
120 |
121 |         return s
122 |
123 |
124 |     def _get_select(self,):
125 |         return "SELECT " + self.get_field_list() + " FROM " + self.get_name()
126 |
127 |
128 |     def get_insert(self, ignore = False ):
129 |         ig = "IGNORE" if ignore else ""
130 |         return "INSERT " + ig + " INTO " + self.get_name() + " ( " + self.get_field_list() + " ) "
131 |
132 |
133 |     def get_order_by(self,):
134 |         return "ORDER BY %s" % self.get_field_list()
135 |
136 |
137 |
138 |
139 | class MySQLSelect (MySQLTable):
140 |
141 |     def __init__(self, select):
142 |         m = re.search(r'^\s*select\s+(.*?)\s+from\s+([^ ]+)(?:\s+(.*))?', select, flags = re.IGNORECASE + re.DOTALL)
143 |
144 |         if m:
145 |             self.select = select
146 |
147 |             n = m.group(2)
148 |             ff = re.split(r'\s*,\s*', m.group(1) )
149 |
150 |             for i in range(len(ff)):
151 |                 f = ff[i]
152 |                 f = re.sub(r'^.*\s+AS\s+', '', f, flags = re.IGNORECASE) # use alias if defined
153 |                 ff[i] = f
154 |
155 |             super(MySQLSelect,self).__init__(n, ff)
156 |         else:
157 |             raise gpUsageException("can't parse statement: %s" % select)
158 |
159 |
160 |
161 |     def _get_select(self,):
162 |         return self.select
163 |
164 |
165 |     def get_insert(self, ignore = False ):
166 |         raise gpUsageException("can't create insert statement for: %s" % self.select)
167 |
168 |
169 |
170 | class MySQLInserter (object):
171 |     def __init__ ( self, glue, table ):
172 |         self.glue = glue
173 |         self.table = table
174 |         self.fields = None
175 |
176 |     def insert(self, values ):
177 |         raise NotImplementedError( "`insert()' not implemented by %s" % self.__class__ )
178 |
179 |     def flush (self):
180 |         pass
181 |
182 |     def close (self):
183 |         self.flush()
184 |
185 |
186 |
187 | class MySQLSimpleInserter (MySQLInserter):
188 |
189 |     def as_list (self, values ):
190 |         return self.glue.as_list( values )
191 |
192 |
193 |     def _insert_command(self):
194 |         return self.table.get_insert()
195 |
196 |
197 |     def insert (self, values ):
198 |         sql = self._insert_command()
199 |         sql += " VALUES "
200 |         sql += self.as_list(values)
201 |
202 |         self.glue.mysql_update( sql )
203 |
204 |
205 |
206 |
207 | class MySQLBufferedInserter (MySQLSimpleInserter):
208 |
209 |     def __init__(self, glue, table ):
210 |         super(MySQLBufferedInserter,self).__init__(glue, table)
211 |         self.buffer = ""
212 |
213 |
214 |     def insert (self, values ):
215 |         vlist = self.as_list(values)
216 |         max_packet = self.glue.get_max_allowed_packet()
217 |
218 |         if len(self.buffer)>0 and ( len(self.buffer) + len(vlist) + 2 ) >= max_packet :
219 |             self.flush()
220 |
221 |
222 |         if len(self.buffer) == 0:
223 |             self.buffer = self._insert_command()
224 |             self.buffer += " VALUES "
225 |         else:
226 |             self.buffer += ", "
227 |
228 |         self.buffer += vlist
229 |
230 |         if len(self.buffer) >= max_packet :
231 |             self.flush()
232 |
233 |
234 |
235 |     def flush (self):
236 |         if len(self.buffer)>0:
237 |             self.glue.mysql_update( self.buffer )
238 |             self.buffer = ""
239 |
240 |
241 | class MySQLSink (DataSink):
242 |
243 |     def __init__(self, inserter ):
244 |         self.inserter = inserter
245 |
246 |
247 |     def putRow (self, row ):
248 |         self.inserter.insert( row )
249 |
250 |
251 |     def flush (self):
252 |         self.inserter.flush()
253 |
254 |
255 |     def close (self):
256 |         super(MySQLSink, self).close()
257 |         self.inserter.close()
258 |
259 |
260 |     def drop (self):
261 |         raise gpUsageException("only temporary sinks can be dropped")
262 |
263 |
264 |
265 | class MySQLTempSink (MySQLSink):
266 |     def __init__( self, inserter, glue, table ):
267 |         super(MySQLTempSink, self).__init__(inserter)
268 |
269 |         self.glue = glue
270 |         self.table = table
271 |
272 |
273 |     def drop (self):
274 |         sql = "DROP TEMPORARY TABLE IF EXISTS %s" % self.table.get_name()
275 |
276 |         ok = self.glue.mysql_update( sql )
277 |         return ok
278 |
279 |
280 |     def getTable (self):
281 |         return self.table
282 |
283 |
284 |     def getTableName (self):
285 |         return self.table.get_name()
286 |
287 | def _fetch_dict( cursor ):
288 |     try:
289 |         row = cursor.fetch_dict( )
290 |         return row
291 |     except AttributeError:
292 |         pass
293 |
294 |     r = cursor.fetchone()
295 |     if r is None: return None
296 |
297 |     if hasattr(r, "has_key"):
298 |         return r # it's a dict!
299 |
300 |     row = {}
301 |
302 |     for i in range(len(cursor.description)):
303 |         d = cursor.description[ i ]
304 |         row[ d[0] ] = r[ i ]
305 |
306 |     return row
307 |
308 | class MySQLGlue (Connection):
309 |
310 |     def __init__(self, transport, graphname = None ):
311 |         super(MySQLGlue, self).__init__(transport, graphname)
312 |
313 |         self.connection = None
314 |
315 |         self.unbuffered = False
316 |         self._update_cursor = None
317 |
318 |         self.temp_table_prefix = "gp_temp_"
319 |         self.temp_table_db = None
320 |
321 |         self.addCallHandler( self.gp_mysql_call_handler )
322 |
323 |         self.max_allowed_packet = None
324 |
325 |     def set_unbuffered(self, unbuffered ):
326 |         self.unbuffered = unbuffered
327 |
328 |
329 |     def mysql_connect( self, server, username, password, db, port = 3306 ):
330 |         #FIXME: connection charset, etc!
331 |
332 |         #try:
333 |         self.connection = MySQLdb.connect(host=server, user=username, passwd=password, db = db, port = port)
334 |
335 |         #XXX: would be nice to wrap the exception and provide additional info.
336 |         # but without exception chaining, we lose the traceback. which is bad.
337 |         #except MySQLdb.Error, e:
338 |         #    try:
339 |         #        raise gpClientException( "Failed to connect! MySQL Error %s: %s" % (e.args[0], e.args[1]) )
340 |         #    except IndexError:
341 |         #        raise gpClientException( "Failed to connect! MySQL Error: %s" % e )
342 |
343 |         if not self.connection :
344 |             raise gpClientException( "Failed to connect! (unknown error)" )
345 |
346 |         # autocommit is the default. It's even needed when reading, if we want to
347 |         # see changes during a persistent connection.
348 |         self.mysql_autocommit(True)
349 |
350 |         return True
351 |
352 |     def mysql_unbuffered_query( self, sql, **kwargs ): #TODO: port kwargs to PHP
353 |         return self.mysql_query( sql, unbuffered = True, **kwargs )
354 |
355 |     def mysql_update( self, sql, **kwargs ): #TODO: port to PHP; use in PHP!
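        # Runs a write statement on a shared unbuffered cursor (created lazily
        # below) and reports connection.affected_rows(), so callers get a row
        # count instead of a result set.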
356 |         if 'cursor' not in kwargs or not kwargs['cursor']:
357 |             if not self._update_cursor:
358 |                 self._update_cursor = MySQLdb.cursors.SSCursor(self.connection)
359 |
360 |             kwargs['cursor'] = self._update_cursor
361 |
362 |         self.mysql_query( sql, unbuffered = True, dict_rows = False, **kwargs )
363 |
364 |         return self.connection.affected_rows()
365 |
366 |     def inject_query_markers( self, sql, *markers ): #TODO: port markers to PHP
367 |         if markers:
368 |             for m in markers:
369 |                 if not m: #handle explicit None, etc
370 |                     continue
371 |
372 |                 sql = re.sub( '^\s*(select|update|replace|insert|delete)\s+', '\\1 /* '+m+' */ ', sql, flags = re.IGNORECASE | re.DOTALL )
373 |
374 |         return sql
375 |
376 |     def mysql_query( self, sql, unbuffered = None, dict_rows = False, cursor = None, comment = None ): #TODO: port markers to PHP
377 |         if unbuffered is None:
378 |             unbuffered = self.unbuffered
379 |
380 |         sql = self.inject_query_markers(sql, comment)
381 |
382 |         if cursor:
383 |             using_new_cursor = False
384 |         else:
385 |             using_new_cursor = True
386 |             # create a cursor only if the caller did not supply one
387 |             if unbuffered:
388 |                 if dict_rows:
389 |                     # no buffering, returns dicts
390 |                     cursor = MySQLdb.cursors.SSDictCursor(self.connection) # TESTME
391 |                 else:
392 |                     # no buffering, returns tuples
393 |                     cursor = MySQLdb.cursors.SSCursor(self.connection) # TESTME
394 |             else:
395 |                 if dict_rows:
396 |                     # buffers result, returns dicts
397 |                     cursor = MySQLdb.cursors.DictCursor(self.connection) # TESTME
398 |                 else:
399 |                     # default: buffered tuples
400 |                     cursor = MySQLdb.cursors.Cursor(self.connection)
401 |
402 |         with warnings.catch_warnings():
403 |             #ignore MySQL warnings. use connection.info() / cursor.messages to get them.
404 |             warnings.simplefilter("ignore")
405 |
406 |             try:
407 |                 cursor.execute( sql )
408 |             except:
409 |                 if using_new_cursor:
410 |                     cursor.close() #NOTE: *always* close the cursor if an exception occurred.
411 |                 raise
412 |
413 |         if not dict_rows:
414 |             # HACK: glue a fetch_dict method to a cursor that natively returns sequences from fetchone()
415 |             # FIXME: if we do this, we for some reason retain a reference to the cursor forever!
416 |             #
417 |             #m = types.MethodType(_fetch_dict, cursor, cursor.__class__)
418 |             #setattr(cursor, "fetch_dict", m)
419 |             pass
420 |         else:
421 |             # make fetch_dict an alias for fetchone
422 |             cursor.fetch_dict = cursor.fetchone # TESTME
423 |
424 |         return cursor
425 |
426 |         #XXX: would be nice to wrap the exception and provide additional info.
427 |         # but without exception chaining, we lose the traceback. which is bad.
428 |         #except MySQLdb.Error as e:
429 |         #    q = sql.replace('/\s+/', ' ')
430 |         #    if ( len(q) > 255 ): q = q[:252] + '...'
431 |
432 |         #    try:
433 |         #        raise gpClientException( "Query failed! MySQL Error %s: %s\nQuery was: %s" % (e.args[0], e.args[1], q) )
434 |         #    except IndexError:
435 |         #        raise gpClientException( "Query failed! MySQL Error: %s\nQuery was: %s" % (e, q) )
436 |
437 |
438 |     def set_mysql_connection(self, connection ):
439 |         self.connection = connection
440 |
441 |
442 |     def gp_mysql_call_handler( self, gp, params ):
443 |         # params: cmd, args, source, sink, capture, result
444 |
445 |         cmd = params['command']
446 |         args = params['arguments']
447 |         source = params['source']
448 |         sink = params['sink']
449 |         capture = params['capture']
450 |         result = params['result']
451 |
452 |         m = re.search( r'-(from|into)$', cmd )
453 |
454 |         if m:
455 |             cmd = re.sub(r'-(from|into)?$', '', cmd)
456 |             action = m.group(1)
457 |
458 |             c = len(args)
459 |             if not c :
460 |                 raise gpUsageException("expected last argument to be a table spec; args: %s" % (args, ))
461 |
462 |
463 |             t = args[c-1]
464 |             args = args[0:c-1]
465 |
466 |             if isinstance(t, (str, unicode)) :
467 |                 if ( re.search( r'^.*select\s+', t, flags = re.IGNORECASE) ):
468 |                     t = MySQLSelect(t)
469 |                 else:
470 |                     t = re.split( r'\s+|\s*,\s*', t )
471 |
472 |
473 |             if ( isinstance(t, (list, tuple)) ): t = MySQLTable( t[0], t[1:] )
474 |             if ( not isinstance(t, MySQLTable) ): raise gpUsageException("expected last argument to be a table spec; found %s" % type(t))
475 |
476 |             if action == 'into' :
477 |                 if ( not t.get_name() or t.get_name() == "?" ): sink = self.make_temp_sink( t )
478 |                 else: sink = self.make_sink( t )
479 |
480 |                 result = sink #XXX: quite useless, but consistent with -from
481 |             else:
482 |                 source = self.make_source( t )
483 |
484 |                 result = source #XXX: a bit confusing, and only useful for temp sinks
485 |
486 |         params['command'] = cmd
487 |         params['arguments'] = args
488 |         params['source'] = source
489 |         params['sink'] = sink
490 |         params['capture'] = capture
491 |         params['result'] = result
492 |
493 |         return True
494 |
495 |
496 |     def __make_mysql_closure( self, name ):
497 |         rc = False #XXX: unused
498 |
499 |         def call_mysql( *args ):
500 |             if not self.connection:
501 |                 raise gpUsageException( "not connected to mysql, can't run mysql function %s" % (name,) )
502 |
503 |             if not hasattr(self.connection, name):
504 |                 raise gpUsageException( "unknown mysql function: %s, not in %s" % (name, self.connection.__class__.__name__) )
505 |
506 |             f = getattr(self.connection, name)
507 |
508 |             #try:
509 |             res = f( *args ) # note: f is bound to self.connection
510 |             return res
511 |
512 |             #XXX: would be nice to wrap the exception and provide additional info.
513 |             # but without exception chaining, we lose the traceback. which is bad.
514 |             #except MySQLdb.Error, e:
515 |             #    try:
516 |             #        raise gpClientException( "MySQL %s failed! Error %s: %s" % (name, e.args[0], e.args[1]) )
517 |             #    except IndexError:
518 |             #        raise gpClientException( "MySQL %s failed! Error: %s" % (name, e) )
519 |
520 |         return call_mysql
521 |
522 |     def __getattr__( self, name ):
523 |         if name.startswith('mysql_'):
524 |             f = self.__make_mysql_closure(name[6:])
525 |
526 |             setattr(self, name, f) #re-use closure!
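            # (Any attribute that starts with 'mysql_' is synthesized on first
            # access and forwards to the method of the same name on the raw
            # MySQLdb connection, e.g. mysql_autocommit -> connection.autocommit.)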
527 |
528 |             return f
529 |         else:
530 |             return super(MySQLGlue, self).__getattr__(name)
531 |
532 |     def quote_string (self, s ): #TODO: charset
533 |         if type(s) not in (str, unicode):
534 |             s = "%s" % s
535 |
536 |         return "'" + self.connection.escape_string( s ) + "'"
537 |
538 |     def as_list (self, values ):
539 |         sql = "("
540 |
541 |         first = True
542 |         for v in values:
543 |             if ( not first ): sql += ","
544 |             else: first = False
545 |
546 |             t = type(v)
547 |             if ( v is None ): sql+= "NULL" # SQL NULL literal
548 |             elif ( t == int ): sql+= "%i" % v
549 |             elif ( t == float ): sql+= repr(v) # keeps full float precision
550 |             elif ( t == str or t == unicode ): sql+= self.quote_string(v) #TODO: charset...
551 |             else: raise gpUsageException("bad value type: %s" % type(v))
552 |
553 |
554 |         sql += ")"
555 |
556 |         return sql
557 |
558 |     id = 1
559 |
560 |     def next_id (self):
561 |         MySQLGlue.id += 1
562 |         return MySQLGlue.id
563 |
564 |     def drop_temp_table (self, spec ):
565 |         sql = "DROP TEMPORARY TABLE %s" % spec.get_name()
566 |         self.mysql_update(sql)
567 |
568 |
569 |     def make_temp_table (self, spec ):
570 |         table = spec.get_name()
571 |
572 |         if ( not table or table == '?' ):
573 |             table = "%s%d" % (self.temp_table_prefix, self.next_id())
574 |
575 |         if self.temp_table_db:
576 |             table = "%s.%s" % (self.temp_table_db, table)
577 |
578 |         sql = "CREATE TEMPORARY TABLE %s" % table
579 |         sql += "("
580 |         sql += spec.get_field_definitions()
581 |         sql += ")"
582 |
583 |         self.mysql_update(sql)
584 |
585 |         return MySQLTable(table, spec.get_fields())
586 |
587 |     def mysql_select_db ( self, db ):
588 |         #NOTE: the native select_db "sometimes" triggers an InterfaceError.
589 |         # This is a strange issue with MySQLdb
590 |
591 |         sql = "USE %s" % re.sub('[^\w]', '', db) #TODO: apply real identifier quoting!
592 |
593 |         self.mysql_update( sql )
594 |
595 |     def mysql_query_value (self, sql, **kwargs ):
596 |         r = self.mysql_query_record( sql, **kwargs ) #TODO: port kwargs to PHP
597 |
598 |         if not r: return None
599 |         else: return r[0]
600 |
601 |     def mysql_query_record (self, sql, **kwargs ):
602 |         cursor = self.mysql_query( sql, unbuffered = True, dict_rows = False, **kwargs ) #TODO: port kwargs to PHP
603 |
604 |         try:
605 |             a = cursor.fetchone()
606 |         finally:
607 |             cursor.close()
608 |
609 |         if ( not a ): return None
610 |         else: return a
611 |
612 |     def set_max_allowed_packet (self, size ):
613 |         self.max_allowed_packet = size
614 |
615 |     def get_max_allowed_packet (self):
616 |         if self.max_allowed_packet is None:
617 |             self.max_allowed_packet = self.mysql_query_value("select @@max_allowed_packet")
618 |
619 |         if self.max_allowed_packet is None:
620 |             self.max_allowed_packet = 16 * 1024 * 1024 #fall back to MySQL's default of 16MB
621 |
622 |         return self.max_allowed_packet
623 |
624 |
625 |     def select_into (self, query, sink, **kwargs ): #TODO: port kwargs to PHP
626 |         if isinstance(query, (str, unicode)) :
627 |             table = MySQLSelect( query )
628 |             sql = query
629 |         else:
630 |             table = query
631 |             sql = table._get_select()
632 |
633 |
634 |         res = self.mysql_query( sql, **kwargs )
635 |         src = MySQLSource( res, table )
636 |
637 |         c = self.copy( src, sink, '+' )
638 |         src.close()
639 |
640 |         return c
641 |
642 |
643 |     def _new_inserter(self, table ):
644 |         return MySQLBufferedInserter( self, table )
645 |
646 |
647 |     def make_temp_sink (self, table ):
648 |         table = self.make_temp_table(table)
649 |
650 |         ins = self._new_inserter(table)
651 |         sink = MySQLTempSink( ins, self, table )
652 |
653 |         return sink
654 |
655 |
656 |     def make_sink (self, table ):
657 |         inserter = self._new_inserter(table)
658 |         sink = MySQLSink( inserter )
659 | 
660 |         return sink
661 | 
662 | 
663 |     def make_source (self, table, big = False, auto_order = False, **kwargs ): #TODO: PORT auto_order to PHP
664 |         sql = table._get_select()
665 | 
666 |         if auto_order and not re.search(r'\s+ORDER\s+BY\s+', sql, flags = re.IGNORECASE | re.DOTALL ) : #TODO: PORT auto_order to PHP
667 |             sql += ' ' + table.get_order_by()
668 | 
669 |         if not 'unbuffered' in kwargs:
670 |             kwargs['unbuffered'] = big
671 | 
672 |         res = self.mysql_query(sql, **kwargs) #TODO: port kwargs to PHP
673 | 
674 |         src = MySQLSource( res, table )
675 |         return src
676 | 
677 | 
678 |     def query_to_file (self, query, file, remote = False, **kwargs ):
679 |         #NOTE: SELECT ... INTO OUTFILE always writes the file on the server host; #TESTME
680 |         #      MySQL has no LOCAL variant of OUTFILE.
681 |         query += " INTO OUTFILE "
682 |         query += self.quote_string(file)
683 | 
684 |         cursor = self.mysql_query(query, **kwargs) #TODO: port kwargs to PHP
685 |         cursor.close()
686 | 
687 |         return self.connection.affected_rows()
688 | 
689 | 
690 |     def insert_from_file (self, table, file, remote = False, **kwargs ):
691 |         r = "" if remote else "LOCAL" #TESTME
692 | 
693 |         query = ""
694 |         query += " LOAD DATA %s INFILE " % r #keyword order is LOAD DATA [LOCAL] INFILE
695 |         query += self.quote_string(file)
696 |         query += " INTO TABLE %s " % table
697 | 
698 |         cursor = self.mysql_query(query, **kwargs) #TODO: port kwargs to PHP
699 |         cursor.close()
700 | 
701 |         return self.connection.affected_rows()
702 | 
703 | 
704 |     def close(self):
705 |         if self._update_cursor:
706 |             try:
707 |                 self._update_cursor.close()
708 |             except Exception as e:
709 |                 self._trace(__function__(), "failed to close mysql cursor: %s" % e)
710 |                 #XXX: do we really not care? can we go on? could there have been a commit pending?
711 | 
712 |         if self.connection:
713 |             try:
714 |                 self._trace(__function__(), "closing mysql connection")
715 |                 self.mysql_close()
716 |             except Exception as e:
717 |                 self._trace(__function__(), "failed to close mysql connection: %s" % e)
718 |                 #XXX: do we really not care? can we go on? could there have been a commit pending?
719 | 
720 |         return super(MySQLGlue, self).close()
721 | 
722 | 
723 |     @staticmethod
724 |     def new_client_connection(graphname, host = False, port = False ):
725 |         return MySQLGlue( ClientTransport(host, port), graphname ) #FIXME: PORT graphname stuff to PHP!
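    # A sketch of how the factory is meant to be used; host, port and credentials
    # here are illustrative (cf. test_config.py and the gp-port default of 6666):
    #
    #   gp = MySQLGlue.new_client_connection( None, "localhost", 6666 )
    #   gp.connect()
    #   gp.mysql_connect( server = "localhost", db = "gptest",
    #                     username = "gptest", password = "gptest" )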
726 | 
727 | 
728 |     @staticmethod
729 |     def new_slave_connection(command, cwd = None, env = None ):
730 |         return MySQLGlue( SlaveTransport(command, cwd, env), None )
731 | 
732 | 
733 |     def dump_query (self, sql ):
734 |         print "*** %s ***" % sql
735 | 
736 |         res = self.mysql_query( sql )
737 |         if ( not res ): return False
738 | 
739 |         c = self.dump_result( res )
740 |         res.close()
741 | 
742 |         return c
743 | 
744 | 
745 |     def dump_result (self, res ):
746 |         keys = None
747 |         c = 0
748 | 
749 |         print ""
750 |         while True:
751 |             row = _fetch_dict(res)
752 |             if not row: break
753 | 
754 |             if keys is None :
755 |                 keys = row.keys() #remember the column order; print the header only once
756 |                 s = ""
757 |                 for k in keys:
758 |                     s += k
759 |                     s += "\t"
760 | 
761 |                 print s
762 | 
763 |             s = ""
764 |             for k in keys:
765 |                 s += "%s" % row[k] #values need not be strings
766 |                 s += "\t"
767 | 
768 |             print s
769 |             c += 1
770 | 
771 | 
772 |         print "-----------------------------"
773 |         print "%i rows" % c
774 | 
775 |         return c
776 | 
777 | 
778 | 
--------------------------------------------------------------------------------
/gpfeeder.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import sys, os, os.path, pprint, traceback
4 | import ConfigParser, optparse
5 | import datetime, time
6 | import pickle
7 | import pprint
8 | import threading
9 | import traceback
10 | import socket
11 | import resource, struct
12 | import code
13 | import gc
14 | import _mysql_exceptions
15 | 
16 | from gp.client import *
17 | from gp.mysql import *
18 | from gp.mediawiki import *
19 | #from wikitools import wiki, api
20 | 
21 | class CommandStream(object):
22 |     def __init__(self, f):
23 |         if f is None:
24 |             pass #no file: read interactively via raw_input in next()
25 |         if isinstance(f, (str, unicode)):
26 |             f = file(f, "r")
27 |             self.close_file = True
28 |         else:
29 |             self.close_file = False
30 | 
31 |         if isinstance(f, socket.socket):
32 |             f = f.makefile("r")
33 | 
34 |         #NOTE: f could be a file or a fifo or a unix socket or a tcp socket...
35 | 
36 |         self.file = f
37 |         self._closed = False
38 | 
39 |     def close(self):
40 |         self._closed = True
41 | 
42 |         if self.close_file and self.file:
43 |             self.file.close()
44 |             self.file = None
45 | 
46 |         #FIXME: if file is None, we are using raw_input. close stdin to interrupt?!
47 | 
48 |     def __iter__(self):
49 |         return self
50 | 
51 |     def next(self):
52 |         while not self._closed:
53 |             try:
54 |                 if self.file:
55 |                     s = self.file.readline()
56 | 
57 |                     if s is None or s == '': #EOF
58 |                         break
59 |                 else:
60 |                     s = raw_input("> ")
61 | 
62 |             except socket.timeout:
63 |                 #use timeouts to check self._closed
64 |                 continue
65 | 
66 |             except EOFError:
67 |                 break #EOF
68 | 
69 |             except KeyboardInterrupt:
70 |                 #self.feeder.warning("KeyboardInterrupt, terminating")
71 |                 s = "shutdown"
72 | 
73 |             s = s.strip()
74 | 
75 |             if s == '': # blank line
76 |                 continue
77 | 
78 |             if s.startswith('#') or s.startswith(';'): # comment line
79 |                 continue
80 | 
81 |             return s.split()
82 | 
83 |         self.close()
84 |         raise StopIteration()
85 | 
86 | class Script(threading.Thread):
87 |     def __init__(self, feeder, stream):
88 |         super(Script, self).__init__()
89 | 
90 |         self.feeder = feeder
91 | 
92 |         if not isinstance(stream, CommandStream):
93 |             stream = CommandStream(stream) # wrap handle or file path
94 | 
95 |         self._cmd_stream = stream
96 | 
97 |         self._stopped = False
98 | 
99 |     def stop(self):
100 |         #XXX: this thread is probably currently blocking on I/O.
101 |         #     how to interrupt it?
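        #     (CommandStream mitigates this for sockets by relying on socket
        #     timeouts: next() catches socket.timeout and re-checks
        #     self._closed on every iteration.)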
102 | 
103 |         self.feeder.trace("closing script loop")
104 |         self._cmd_stream.close()
105 |         self._stopped = True
106 | 
107 |     def run(self):
108 |         self.feeder.trace("entering script loop")
109 | 
110 |         for cmd in self._cmd_stream:
111 |             if self._stopped:
112 |                 self._cmd_stream.close()
113 |                 break
114 | 
115 |             try:
116 |                 self.feeder.trace("running command %s" % cmd)
117 |                 self.run_command(cmd)
118 |             except Exception as e:
119 |                 self.feeder.error("failed to run command %s: %s" % (cmd, e), e)
120 | 
121 |         self.feeder.trace("exiting script loop")
122 | 
123 |     def run_command(self, cmd):
124 |         name = "cmd_%s" % cmd[0].replace('-', '_')
125 | 
126 |         args = []
127 |         opt = {}
128 | 
129 |         if cmd[0] == "bye" or cmd[0] == "exit":
130 |             self.stop()
131 |             return
132 | 
133 |         if cmd[0] == "shutdown":
134 |             self.stop()
135 | 
136 |         for c in cmd[1:]:
137 |             (k, sep, v) = c.partition("=")
138 | 
139 |             if sep == '':
140 |                 args.append(c)
141 |             else:
142 |                 opt[k] = v
143 | 
144 |         f = getattr(self.feeder, name) # let exception rip
145 |         f(*args, **opt)
146 | 
147 | 
148 | class Job(object):
149 |     def __init__(self, feeder, wiki):
150 |         self.feeder = feeder
151 |         self.wiki = wiki
152 | 
153 |         self.done = False
154 |         self.success = None
155 |         self.followup = None
156 |         self.time = None
157 | 
158 |         self.try_create = False
159 | 
160 |     def trace(self, msg):
161 |         self.feeder.trace(msg)
162 | 
163 |     def log(self, msg):
164 |         self.feeder.log(msg)
165 | 
166 |     def warning(self, msg):
167 |         self.feeder.warning(msg)
168 | 
169 |     def error(self, msg, e = None):
170 |         self.feeder.error(msg, e)
171 | 
172 |     def activate(self):
173 |         # switch to the desired wiki
174 |         self.feeder.connect( self.wiki, create = self.try_create )
175 | 
176 |     def reset(self):
177 |         self.done = False
178 | 
179 |     def execute(self):
180 |         if self.done:
181 |             raise Exception("can't re-run a job! %s" % self)
182 | 
183 |         self.start = datetime.datetime.now()
184 | 
185 |         try:
186 |             self.activate()
187 | 
188 |             self.followup = self.run()
189 | 
190 |             self.success = True
191 |             self.time = datetime.datetime.now() - self.start
192 | 
193 |             self.trace("finished %s after %s" % (self, self.time))
194 |             return self.followup
195 |         finally:
196 |             self.done = True
197 | 
198 |     def run(self):
199 |         raise NotImplementedError("must implement run()")
200 | 
201 |     def __str__(self):
202 |         return "%s ( wiki = %s )" % (self.__class__.__name__, self.wiki)
203 | 
204 | class LoadChunkJob(Job):
205 |     def __init__(self, feeder, wiki, namespaces, from_page, offset, limit = None, update_followup = None):
206 |         super(LoadChunkJob, self).__init__(feeder, wiki)
207 | 
208 |         assert type(offset) == int
209 | 
210 |         self.update_followup = update_followup
211 | 
212 |         self.namespaces = namespaces
213 |         self.chunk_size = limit
214 |         self.from_page = from_page
215 |         self.offset = offset
216 | 
217 |         if not self.namespaces:
218 |             self.cl_types = None
219 |         else:
220 |             self.cl_types = set()
221 | 
222 |             if NS_CATEGORY in self.namespaces:
223 |                 self.cl_types.add( "subcat" )
224 | 
225 |             if NS_FILE in self.namespaces:
226 |                 self.cl_types.add( "file" )
227 | 
228 |             if len(self.cl_types) < len(self.namespaces): # if there's more in self.namespaces...
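                # cl_type 'page' covers members that are neither subcategories nor files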
229 |                 self.cl_types.add( "page" )
230 | 
231 |     def __str__(self):
232 |         return "%s ( wiki = %s, namespaces = %s, from_page = %d, offset = %d )" % (self.__class__.__name__, self.wiki, self.namespaces, self.from_page, self.offset)
233 | 
234 |     def run(self):
235 |         wiki_config = self.feeder.get_wiki_config(self.wiki)
236 | 
237 |         if ( 'load-query-limit' in wiki_config
238 |              and wiki_config['load-query-limit']
239 |              and int(wiki_config['load-query-limit']) > 0 ):
240 | 
241 |             limit = int(wiki_config['load-query-limit'])
242 |         else:
243 |             if self.offset:
244 |                 limit = 1000000000
245 |             else:
246 |                 limit = None
247 | 
248 |         self.log( 'loading cat structure from page %d, offset %d ' % (self.from_page, self.offset) )
249 | 
250 |         sql = "SELECT P.page_id as parent, C.page_id as child, C.page_title as name "
251 |         sql += " FROM " + self.feeder.gp.wiki_table( "categorylinks" )
252 |         sql += " JOIN " + self.feeder.gp.wiki_table( "page" ) + " AS P "
253 |         sql += " ON P.page_title = cl_to AND P.page_namespace = %d " % NS_CATEGORY
254 |         sql += " JOIN " + self.feeder.gp.wiki_table( "page" ) + " AS C "
255 |         sql += " ON C.page_id = cl_from "
256 | 
257 |         where = []
258 | 
259 |         if self.namespaces:
260 |             where.append( " C.page_namespace IN %s " % self.feeder.gp.as_list( self.namespaces ) )
261 |             where.append( " cl_type IN %s " % self.feeder.gp.as_list( self.cl_types ) )
262 | 
263 |         if self.from_page:
264 |             where.append( " C.page_id >= %d " % self.from_page )
265 | 
266 |         if where:
267 |             sql += " WHERE ( %s ) " % " ) AND ( ".join ( where )
268 | 
269 |         if limit: # if there is no limit, we don't need to sort!
270 |             sql += " ORDER BY C.page_id, P.page_id "
271 | 
272 |         if limit:
273 |             sql += " LIMIT %d " % limit
274 | 
275 |         if self.offset:
276 |             sql += " OFFSET %d " % self.offset
277 | 
278 |         self.log( 'executing query for category arcs: %s' % sql )
279 |         src = self.feeder.gp.make_source( MySQLSelect( sql ), comment = self.feeder.slow_query_comment, big = True )
280 | 
281 |         self.trace( 'fetching category arcs' )
282 | 
283 |         cont = ChunkTracer()
284 |         cont.offset = self.offset
285 | 
286 |         if ( 'load-chunk-size' in wiki_config
287 |              and wiki_config['load-chunk-size']
288 |              and int( wiki_config['load-chunk-size'] ) > 0 ):
289 | 
290 |             chunk_size = int(wiki_config['load-chunk-size'])
291 |         else:
292 |             chunk_size = None
293 | 
294 |         self.feeder.add_arcs( src, cont, chunk_size = chunk_size )
295 | 
296 |         src.close()
297 | 
298 |         self.log( 'loaded %d arcs' % cont.count )
299 | 
300 |         if not limit or cont.count == 0: # done loading
301 |             self.feeder.set_meta(status = "loaded")
302 | 
303 |             return self.update_followup
304 | 
305 |         assert cont.id, "Don't know continuation ID even though query returned results"
306 | 
307 |         if cont.count < limit:
308 |             self.feeder.set_meta(status = "loaded", load_pos = cont.id, load_offset = cont.offset)
309 | 
310 |             return self.update_followup
311 |         else:
312 |             self.feeder.set_meta( status = "loading", load_pos = cont.id, load_offset = cont.offset )
313 | 
314 |             return LoadChunkJob( self.feeder, self.wiki, self.namespaces, from_page = cont.id, offset = cont.offset, update_followup = self.update_followup )
315 | 
316 | class StartLoadJob(Job):
317 |     def __init__(self, feeder, wiki, namespaces, start_polling = False):
318 |         super(StartLoadJob, self).__init__(feeder, wiki)
319 | 
320 |         self.namespaces = namespaces
321 | 
322 |         self.start_polling = start_polling
323 |         self.try_create = True
324 | 
325 |     def __str__(self):
326 |         return "%s ( wiki = %s, start_polling = %s )" % (self.__class__.__name__, self.wiki, self.start_polling)
327 | 
328 |     def run(self):
329 |         wiki_config = self.feeder.get_wiki_config(self.wiki)
330 |         self.log("loading category structure of %s" % self.wiki)
331 | 
332 |         up_current = self.feeder.get_latest_category_timestamp()
333 |         dl_current = self.feeder.get_latest_log_timestamp()
334 | 
335 |         if not up_current:
336 |             up_current = "00000000000000"
337 | 
338 |         if not dl_current:
339 |             dl_current = "00000000000000"
340 | 
341 |         if self.start_polling:
342 |             # get jobs that will launch polling, starting with the current state (before loading the structure)
343 | 
344 |             update_job = []
345 |             update_job.append( UpdateModifiedJob(self.feeder, self.wiki, self.namespaces, since = up_current, keep_polling = self.start_polling) )
346 |             update_job.append( UpdateDeletedJob(self.feeder, self.wiki, self.namespaces, since = dl_current, keep_polling = self.start_polling) )
347 |         else:
348 |             update_job = None
349 | 
350 |         feeder_state = self.feeder.get_meta()
351 | 
352 |         #reset state for incremental processing
353 |         feeder_state["mods_offset"] = 0
354 |         feeder_state["dels_offset"] = 0
355 |         feeder_state["mods_until"] = up_current
356 |         feeder_state["dels_until"] = dl_current
357 |         feeder_state["mods_state"] = "init"
358 |         feeder_state["dels_state"] = "init"
359 |         feeder_state["status"] = "loading"
360 |         feeder_state["load_offset"] = 0
361 |         feeder_state["load_pos"] = 0
362 | 
363 |         if not self.namespaces:
364 |             feeder_state["namespaces"] = "*"
365 |         else:
366 |             feeder_state["namespaces"] = "+".join( [ str(n) for n in self.namespaces ] )
367 | 
368 |         if self.namespaces and NS_CATEGORY in self.namespaces and len(self.namespaces) == 1:
369 |             feeder_state["graph_type"] = "no-leafs"
370 |         else:
371 |             feeder_state["graph_type"] = "with-leafs"
372 | 
373 |         self.feeder.set_meta(**feeder_state)
374 | 
375 |         g = wiki_config['gp-graph']
376 | 
377 |         #TODO: create with temp name, rename when import done
378 |         #TODO: we want graphserv to support rename-graph and replace-graph
379 |         #      note: existing connections should switch to the new graph seamlessly.
380 |         #      note: rename blocks until there's no active command on the graph
381 | 
382 |         created = self.feeder.gp.try_create_graph(g)
383 |         if created:
384 |             self.log( "created graph %s" % g)
385 |         else:
386 |             self.log( 'clearing graph %s' % g )
387 |             self.feeder.gp.clear()
388 | 
389 |         next_job = LoadChunkJob(self.feeder, self.wiki, self.namespaces, 0, 0, update_followup = update_job)
390 |         return next_job
391 | 
392 | class UpdateModifiedJob(Job):
393 |     def __init__(self, feeder, wiki, namespaces, since = None, offset = 0, keep_polling = False):
394 |         super(UpdateModifiedJob, self).__init__(feeder, wiki)
395 | 
396 |         assert type(offset) == int
397 | 
398 |         self.keep_polling = keep_polling
399 |         self.namespaces = namespaces
400 | 
401 |         self.since = since
402 |         self.offset = offset
403 | 
404 |         self.state_name = "mods"
405 | 
406 |     def __str__(self):
407 |         return "%s ( wiki = %s, state_name = %s, since = %s, offset = %d )" % (self.__class__.__name__, self.wiki, self.state_name, self.since, self.offset)
408 | 
409 |     def make_followup(self, since = None, offset = 0, keep_polling = None):
410 |         assert type(offset) == int
411 | 
412 |         if keep_polling is None:
413 |             keep_polling = self.keep_polling
414 | 
415 |         return self.__class__(self.feeder, self.wiki, namespaces = self.namespaces, since = since, offset = offset, keep_polling = keep_polling)
416 | 
417 |     def run(self):
418 |         wiki_config = self.feeder.get_wiki_config(self.wiki)
419 |         limit = int(wiki_config['update-max-cats'])
420 | 
421 |         feeder_state = self.feeder.get_meta()
422 | 
423 |         if not "status" in feeder_state or not feeder_state["status"]:
424 |             raise gpException( "not yet loaded (no feeder status)" )
425 |         elif feeder_state["status"] == "loading":
426 |             raise gpException( "loading not yet complete (feeder: %s)" % feeder_state["status"] )
427 | 
428 |         #FIXME: verify that our namespace set is the same as the original in feeder_state
429 |         #XXX: Or just use the namespaces set in feeder_state??
430 | 
431 |         if self.since is None:
432 |             offset = feeder_state.get(self.state_name+'_offset')
433 |             since = feeder_state.get(self.state_name+'_until')
434 |         else:
435 |             since = self.since
436 |             offset = self.offset
437 | 
438 |         if not since:
439 |             self.error("can't apply an update without a baseline. If the graph contains no meta-vars describing the update state, you must provide the --update-since option.")
440 |             return False
441 | 
442 |         if offset is None:
443 |             offset = 0
444 | 
445 |         if self.keep_polling:
446 |             feeder_state['status'] = "polling"
447 |         else:
448 |             feeder_state['status'] = "updating"
449 | 
450 |         seen = set()
451 |         items = 0
452 |         max_timestamp = since
453 | 
454 |         self.trace( 'updating %s since %s, offset %s ' % (self.state_name, since, offset) )
455 | 
456 |         cats = self.get_modified_pages(since, offset, limit)
457 | 
458 |         if not cats:
459 |             #update done
460 | 
461 |             if self.keep_polling:
462 |                 # keep polling for updates
463 | 
464 |                 feeder_state[self.state_name+'_state'] = "up_to_date"
465 |                 self.feeder.set_meta(**feeder_state)
466 | 
467 |                 # rely on stored state to pass since/offset to the next update job
468 |                 return self.make_followup(keep_polling = True)
469 |             else:
470 |                 self.log( 'update complete' )
471 |                 self.feeder.set_meta(status = "updated")
472 | 
473 |                 return None
474 | 
475 |         pg = {} # reusable dict for page entries
476 | 
477 |         for pg_row in cats:
478 |             # assign fields of reusable page dict
479 |             # ...the meaning of the columns is arcane knowledge...
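            # (inferred from get_modified_pages: a row is
            #  (page id, title, namespace, timestamp, running counter i);
            #  see get_touched_categories / get_deleted_pages in Feeder)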
480 | pg['id'] = pg_row[0] 481 | pg['title'] = pg_row[1] 482 | pg['namespace'] = pg_row[2] 483 | pg['timestamp'] = pg_row[3] 484 | pg['i'] = pg_row[4] 485 | 486 | page_id = pg['id'] 487 | 488 | if pg['timestamp'] > max_timestamp: 489 | offset = 0 490 | max_timestamp = pg['timestamp'] 491 | 492 | offset += 1 493 | 494 | if page_id in seen: 495 | continue 496 | 497 | seen.add(page_id) 498 | items += 1 499 | 500 | self.log( 'updating %s: category %s ' % (self.state_name, pg['title']) ) 501 | self.update_categorization( pg ) 502 | 503 | self.log( 'updated %d %s since %s until %s' % (items, self.state_name, since, max_timestamp) ) 504 | 505 | # store update state 506 | feeder_state[self.state_name+'_offset'] = offset 507 | feeder_state[self.state_name+'_until'] = max_timestamp 508 | feeder_state[self.state_name+'_state'] = "catching_up" 509 | 510 | self.feeder.set_meta(**feeder_state) 511 | 512 | # not done yet, keep going. 513 | # rely on stored state to pass since/offset to the next update job 514 | 515 | followup = self.make_followup() 516 | return followup 517 | 518 | def get_modified_pages(self, since, offset, limit): 519 | mods = self.feeder.get_touched_categories( since = since, offset = offset, limit = limit ) 520 | return mods 521 | 522 | def update_categorization(self, pg ): 523 | n = self.feeder.update_category_children( pg, namespaces = self.namespaces ) 524 | return n 525 | 526 | class UpdateDeletedJob(UpdateModifiedJob): 527 | def __init__(self, feeder, wiki, namespaces, since = None, offset = 0, keep_polling = False): 528 | super(UpdateDeletedJob, self).__init__(feeder, wiki, namespaces = namespaces, since = since, offset = offset, keep_polling = keep_polling) 529 | self.state_name = "dels" 530 | 531 | def get_modified_pages(self, since, offset, limit): 532 | mods = self.feeder.get_deleted_pages( since = since, namespaces = self.namespaces, offset = offset, limit = limit ) 533 | return mods 534 | 535 | def update_categorization(self, pg ): 536 | n = self.feeder.remove_node( pg ) 537 | return n 538 | 539 | PROC_STAT_FIELDS = ( 540 | "pid", 541 | "comm", 542 | "state", 543 | "ppid", 544 | "pgrp", 545 | "session", 546 | "tty_nr", 547 | "tpgid", 548 | "flags", 549 | "minflt", 550 | "cminflt", 551 | "majflt", 552 | "cmajflt", 553 | "utime", 554 | "stime", 555 | "cutime", 556 | "cstime", 557 | "priority", 558 | "nice", 559 | "dummy", 560 | "itrealvalue", 561 | "starttime", 562 | "vsize", 563 | "rss", 564 | ) 565 | 566 | PROC_PSINFO_STRUCT = ( 567 | # Perl: iiii iiii iiIi iiiS Sa8a8a8 Z16Z80ii IIaa3 iiia 568 | # Python: iiii iiii iiIi iiiH H8s8s8s 16s80sii IIc3s iiic 569 | 570 | ("flag", "i", 4), 571 | ("nlwp", "i", 4), 572 | ("pid", "i", 4), 573 | ("ppid", "i", 4), 574 | 575 | ("pgid", "i", 4), 576 | ("sid", "i", 4), 577 | ("uid", "i", 4), 578 | ("euid", "i", 4), 579 | 580 | ("gid", "i", 4), 581 | ("egid", "i", 4), 582 | ("addr", "i", 4), 583 | ("vsize", "i", 4), 584 | 585 | ("rss", "i", 4), 586 | ("pad1", "i", 4), 587 | ("ttydev", "i", 4), 588 | ("pctcpu", "H", 2), 589 | 590 | ("pctmem", "H", 2), 591 | ("start", "8s", 8), 592 | ("time", "8s", 8), 593 | ("ctime", "8s", 8), 594 | 595 | ("fname", "16s", 16), 596 | ("psargs", "80s", 80), 597 | ("wstat", "i", 4), 598 | ("argc", "i", 4), 599 | 600 | ("argv", "i", 4), 601 | ("envp", "i", 4), 602 | ("dmodel", "c", 1), 603 | ("taskid", "3s", 3), 604 | 605 | ("projid", "i", 4), 606 | ("nzomb", "i", 4), 607 | ("filler_1", "i", 4), 608 | ("filler_2", "c", 1), 609 | ) 610 | 611 | PROC_PSINFO_FIELDS = [ r[0] for r in PROC_PSINFO_STRUCT ] 612 | 
PROC_PSINFO_PATTERN = " ".join( [ r[1] for r in PROC_PSINFO_STRUCT ] )
613 | PROC_PSINFO_SIZE = sum( [ r[2] for r in PROC_PSINFO_STRUCT ] )
614 | 
615 | def proc_stat( pid = None ):
616 |     if not pid:
617 |         pid = os.getpid()
618 | 
619 |     try:
620 |         f = file("/proc/%d/stat" % pid)
621 |         s = f.read()
622 |         f.close()
623 | 
624 |         r = s.split(" ")
625 | 
626 |         return dict(zip(PROC_STAT_FIELDS, r))
627 | 
628 |     except IOError:
629 |         pass #not supported, /proc/.../stat is a Linuxism!
630 | 
631 |     try:
632 |         f = file("/proc/%d/psinfo" % pid)
633 |         s = f.read()
634 |         f.close()
635 | 
636 |         pattern = PROC_PSINFO_PATTERN
637 | 
638 |         if len(s) > PROC_PSINFO_SIZE: #ignore extra data
639 |             pattern += " %ds" % ( len(s) - PROC_PSINFO_SIZE, )
640 | 
641 |         r = struct.unpack(pattern, s)
642 | 
643 |         stats = dict(zip(PROC_PSINFO_FIELDS, r))
644 | 
645 |         stats['vsize'] *= 1024 #solaris provides KB here, apparently
646 | 
647 |         return stats
648 | 
649 |     except IOError:
650 |         pass #not supported, /proc/.../psinfo is a Solarisism!
651 | 
652 |     return False
653 | 
654 | class ChunkTracer(object):
655 |     """callback for tracing and munging items while loading the category structure"""
656 | 
657 |     def __init__(self):
658 |         self.id = None
659 |         self.name = None
660 | 
661 |         self.offset = 0
662 |         self.count = 0
663 | 
664 |     def __call__( self, *args ):
665 |         row = args[0]
666 |         id = row[1]
667 | 
668 |         if self.id is None or self.id != id:
669 |             self.id = id
670 |             self.name = row[2]
671 |             self.offset = 1
672 |         else:
673 |             self.offset += 1
674 | 
675 |         self.count += 1
676 | 
677 |         return row[0:2]
678 | 
679 |     def __str__(self):
680 |         if not self.id:
681 |             return "none"
682 |         else:
683 |             return "%d rows, up to %s, offset %d" % (self.count, self.name, self.offset)
684 | 
685 | class Feeder(object):
686 |     def __init__(self, options):
687 |         self.wiki_states = {}
688 |         self.wiki_config = None
689 |         self.current_wiki = None
690 | 
691 |         self.gp = None
692 |         self.slow_query_comment = None
693 | 
694 |         self.config = None #XXX: automatically load options here?
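        # self.config will hold a ConfigParser instance once load_config() ran;
        # per-wiki views of it are created via ConfigDict in get_wiki_config()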
695 |         self.options = options
696 | 
697 |         self.jobs = [] # job queue
698 |         self.job_lock = threading.Lock()
699 | 
700 |         self.verbose = options.verbose
701 |         self.debug = options.debug
702 | 
703 |         self._stopped = False
704 |         self._frozen = False
705 | 
706 |         self.terminate_when_empty = False
707 | 
708 |         # profilers, etc
709 |         self.heapy = None
710 |         self.dowser = None
711 | 
712 |         if options.heapy:
713 |             self.log("preparing heapy memory profiler.")
714 |             self.get_heapy()
715 | 
716 |         if options.dowser:
717 |             self.log("preparing dowser memory profiler.")
718 | 
719 |             import dowser
720 |             self.dowser = dowser.Root()
721 | 
722 |         if options.cherrypy:
723 |             self.log("preparing cherrypy browser")
724 |             import cherrypy
725 | 
726 |             if self.dowser:
727 |                 self.trace("mounting dowser into cherrypy")
728 |                 cherrypy.tree.mount(self.dowser)
729 |             elif self.heapy:
730 |                 self.trace("mounting heapy into cherrypy")
731 |                 cherrypy.tree.mount(self.heapy)
732 | 
733 |             cherrypy.config.update({
734 |                 'environment': 'embedded',
735 |                 'server.socket_port': int(options.cherrypy_port)
736 |             })
737 | 
738 |             self.log("starting cherrypy browser on port %s" % options.cherrypy_port)
739 |             cherrypy.server.quickstart()
740 |             cherrypy.engine.start()
741 | 
742 |     def log_prefix(self):
743 |         return time.strftime("%Y-%m-%d %H:%M:%S")
744 | 
745 |     def trace(self, msg):
746 |         if self.verbose:
747 |             print self.log_prefix(), " ", msg
748 | 
749 |     def log(self, msg):
750 |         print self.log_prefix(), msg
751 | 
752 |     def warning(self, msg):
753 |         print self.log_prefix(), "WARNING:", msg
754 | 
755 |     def error(self, msg, e = None):
756 |         print self.log_prefix(), "ERROR:", msg
757 | 
758 |         if e and self.verbose:
759 |             traceback.print_exc()
760 | 
761 |     def get_meta(self, use_cache = True):
762 |         state = None
763 | 
764 |         if self.gp.supportsProtocolVersion(4): # meta-vars supported since prot version 4
765 |             meta = self.gp.capture_list_meta_map()
766 |             state = {}
767 | 
768 |             for (k, v) in meta.items():
769 |                 if k.startswith( "gpfeeder_" ):
770 |                     k = k[9:]
771 |                     state[k] = v
772 | 
773 |         elif 'state-file' in self.wiki_config:
774 |             if not use_cache or self.current_wiki not in self.wiki_states:
775 |                 p = self.wiki_config['state-file']
776 | 
777 |                 if os.path.exists( p ):
778 |                     try:
779 |                         f = file( p, 'r' )
780 |                         self.wiki_states = pickle.load( f )
781 |                         f.close()
782 |                     except IOError as (errno, strerror):
783 |                         self.warning( "Couldn't load state: I/O error({0}): {1}".format(errno, strerror) )
784 |                     except EOFError as e:
785 |                         self.warning( "Couldn't load state: %s" % sys.exc_info()[0] )
786 | 
787 |         if state is None:
788 |             if self.current_wiki in self.wiki_states:
789 |                 state = self.wiki_states[self.current_wiki]
790 |             else:
791 |                 state = {}
792 | 
793 |         self.trace("get_meta: %s => %s" % (self.current_wiki, state))
794 |         return state
795 | 
796 |     def set_meta(self, **state_map):
797 |         state_map["timestamp"] = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
798 | 
799 |         self.trace("set_meta: %s => %s" % (self.current_wiki, state_map))
800 | 
801 |         if self.gp.supportsProtocolVersion(4): # meta-vars supported since prot version 4
802 |             for (k, v) in state_map.items():
803 |                 if v is None or v == False or v == "":
804 |                     self.gp.try_remove_meta("gpfeeder_" + k)
805 |                 else:
806 |                     self.gp.set_meta("gpfeeder_" + k, v)
807 | 
808 |         else:
809 |             if self.current_wiki in self.wiki_states:
810 |                 self.wiki_states[self.current_wiki].update( state_map )
811 |             else:
812 |                 self.wiki_states[self.current_wiki] = state_map
813 | 
814 |             if 'state-file' in self.wiki_config:
815 |                 p = self.wiki_config['state-file']
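                # self.wiki_states maps wiki name -> state dict; the whole map
                # is pickled into one file (get_meta above does the load side)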
816 | 
817 |                 f = file( p, 'w' )
818 |                 pickle.dump( self.wiki_states, f )
819 |                 f.close()
820 | 
821 |     def get_latest_rcid( self, before = None ):
822 |         sql = "SELECT MAX(rc_id) "
823 |         sql += " FROM " + self.gp.wiki_table( "recentchanges" )
824 | 
825 |         if before:
826 |             sql += " WHERE rc_timestamp < %s " % self.gp.quote_string(before)
827 | 
828 |         self.trace("get_latest_rcid(%s)" % before)
829 |         rcid = self.gp.mysql_query_value( sql )
830 |         return rcid
831 | 
832 |     def get_latest_rc_timestamp( self ):
833 |         sql = "SELECT MAX(rc_timestamp) "
834 |         sql += " FROM " + self.gp.wiki_table( "recentchanges" )
835 | 
836 |         self.trace("get_latest_rc_timestamp")
837 |         t = self.gp.mysql_query_value( sql )
838 |         return t
839 | 
840 |     def get_latest_log_timestamp( self ):
841 |         sql = "SELECT MAX(log_timestamp) "
842 |         sql += " FROM " + self.gp.wiki_table( "logging" )
843 | 
844 |         self.trace("get_latest_log_timestamp")
845 |         t = self.gp.mysql_query_value( sql )
846 |         return t
847 | 
848 |     def get_latest_category_timestamp( self ):
849 |         sql = "SELECT MAX(page_touched) "
850 |         sql += " FROM " + self.gp.wiki_table( "page" )
851 |         sql += " WHERE page_namespace = %d " % NS_CATEGORY
852 | 
853 |         self.trace("get_latest_category_timestamp")
854 |         t = self.gp.mysql_query_value( sql, comment = self.slow_query_comment )
855 |         return t
856 | 
857 |     def get_deleted_pages( self, since, namespaces, limit = None, offset = None ):
858 |         if offset is None:
859 |             offset = 0
860 | 
861 |         # deleted category pages
862 |         # note: we need to join against archive to get the page id, since log_page_id is 0 for deletions.
863 | 
864 |         self.gp.mysql_query( "SET @counter = 0" ) #use counter instead of native offset
865 | 
866 |         sql = "SELECT ar_page_id as id, log_title as title, log_namespace as namespace, max(log_timestamp) as log_timestamp, @counter:=@counter+1 as i "
867 |         sql += " FROM " + self.gp.wiki_table( "logging" )
868 |         sql += " JOIN " + self.gp.wiki_table( "archive" )
869 |         sql += " ON ar_namespace = log_namespace AND ar_title = log_title "
870 | 
871 |         sql += " WHERE log_type = 'delete' AND log_action = 'delete' "
872 |         sql += " AND log_timestamp >= %s " % self.gp.quote_string( since )
873 | 
874 |         if namespaces:
875 |             sql += " AND log_namespace IN %s " % self.gp.as_list( namespaces )
876 | 
877 |         sql += " GROUP BY ar_page_id " #XXX: this could make the limit expensive, if since is far in the past.
878 |         sql += " HAVING log_timestamp > %s " % self.gp.quote_string( since )
879 |         sql += " OR i > %d " % offset
880 |         sql += " ORDER BY log_timestamp, log_id " # NOTE: must be monotonically increasing over time
881 | 
882 |         if limit:
883 |             sql += " LIMIT %d " % limit
884 | 
885 |         #XXX: This doesn't actually work! entries in the deletion log have log_page=0 always!
886 |         #     There's apparently no way to get the id of the deleted page.
887 |         src = self.gp.make_source( MySQLSelect( sql ) )
888 |         dels = src.drain()
889 |         src.close()
890 | 
891 |         return dels
892 | 
893 |     def get_touched_categories( self, since, limit = None, offset = None ):
894 |         if offset is None:
895 |             offset = 0
896 | 
897 |         # touched category pages
898 | 
899 |         self.gp.mysql_query( "SET @counter = 0" ) #use counter instead of native offset
900 | 
901 |         sql = "SELECT page_id as id, page_title as title, page_namespace as namespace, page_touched as touched, @counter:=@counter+1 as i "
902 |         sql += " FROM " + self.gp.wiki_table( "page" )
903 |         sql += " WHERE page_namespace = %d " % NS_CATEGORY
904 |         sql += " AND page_touched >= %s " % self.gp.quote_string( since )
905 |         sql += " HAVING page_touched > %s " % self.gp.quote_string( since )
906 |         sql += " OR i > %d " % offset
907 |         sql += " ORDER BY page_touched, page_latest " # NOTE: hopefully monotonically increasing over time
908 | 
909 |         if limit:
910 |             sql += " LIMIT %d " % limit
911 | 
912 |         src = self.gp.make_source( MySQLSelect( sql ) )
913 |         mods = src.drain()
914 |         src.close()
915 | 
916 |         return mods
917 | 
918 |     def update_categorization( self, pg ):
919 |         sql = "SELECT P.page_id as parent "
920 |         sql += " FROM " + self.gp.wiki_table( "categorylinks" )
921 |         sql += " JOIN " + self.gp.wiki_table( "page" ) + " AS P "
922 |         sql += " ON P.page_title = cl_to AND P.page_namespace = %d " % NS_CATEGORY
923 |         sql += " WHERE cl_from = %d " % pg['id']
924 | 
925 |         src = self.gp.make_source( MySQLSelect( sql ) )
926 |         n = src.result.rowcount # XXX: this is kind of hackish and should perhaps be encapsulated in the Source object
927 | 
928 |         self.gp.replace_predecessors( pg['id'], src )
929 |         src.close()
930 | 
931 |         return n
932 | 
933 |     def update_category_children( self, pg, namespaces = None ):
934 |         sql = "SELECT cl_from "
935 |         sql += " FROM " + self.gp.wiki_table( "categorylinks" )
936 | 
937 |         if namespaces:
938 |             sql += " JOIN " + self.gp.wiki_table( "page" ) + " AS P "
939 |             sql += " ON P.page_id = cl_from "
940 | 
941 |         sql += " WHERE cl_to = %s " % self.gp.quote_string( pg['title'] )
942 | 
943 |         if namespaces:
944 |             sql += " AND P.page_namespace IN %s " % self.gp.as_list( namespaces )
945 | 
946 |         src = self.gp.make_source( MySQLSelect( sql ) )
947 |         n = src.result.rowcount # XXX: this is kind of hackish and should perhaps be encapsulated in the Source object
948 | 
949 |         self.gp.replace_successors( pg['id'], src )
950 |         src.close()
951 | 
952 |         return n
953 | 
954 |     def remove_node( self, pg ):
955 |         #TODO: we *really* want graphcore to support remove-node!
956 | 
957 |         if pg['id'] <= 0:
958 |             return
959 | 
960 |         pre = self.gp.capture_list_predecessors( pg['id'] )
961 |         succ = self.gp.capture_list_successors( pg['id'] )
962 | 
963 |         arcs = [ (p[0], pg['id']) for p in pre ] + [ (pg['id'], s[0]) for s in succ ]
964 | 
965 |         self.gp.remove_arcs( arcs )
966 | 
967 |     def add_arcs( self, src, tracer = None, chunk_size = None ):
968 |         if ( chunk_size ):
969 |             #XXX: the caller may already know the number of rows in src.
970 |             #     we could shorten out if we knew that it was less than chunk_size!
971 | 
972 |             while True: # loop through chunks
973 |                 chunk_src = LimitedSource( src, chunk_size )
974 |                 self.gp.add_arcs( chunk_src, tracer )
975 | 
976 |                 self.trace( "added a chunk of %d arcs " % chunk_src.index )
977 | 
978 |                 if not chunk_src.limit_reached():
979 |                     break # if the limit wasn't reached, there's no more data in src.
980 |         else:
981 |             # if there's no chunk limit
982 |             self.gp.add_arcs( src, tracer )
983 | 
984 |     def drop_wiki_graph( self, wiki ):
985 |         wiki_config = self.get_wiki_config(wiki)
986 |         self.gp.drop_graph( wiki_config['gp-graph'] )
987 | 
988 |     def disconnect( self ):
989 |         if self.gp:
990 |             self.gp.close() #FIXME: graphserv bug: connection stays open
991 |             self.gp = None
992 | 
993 |     def connect( self, wiki, create = False ):
994 |         self.trace("connect to wiki %s" % wiki)
995 | 
996 |         wiki_config = self.get_wiki_config(wiki)
997 | 
998 |         old_wiki_config = self.wiki_config
999 |         self.wiki_config = None
1000 |         self.current_wiki = None
1001 | 
1002 |         if ( old_wiki_config is not None and self.gp
1003 |              and old_wiki_config['gp-host'] == wiki_config['gp-host']
1004 |              and old_wiki_config['gp-port'] == wiki_config['gp-port'] ):
1005 | 
1006 |             ok = False
1007 | 
1008 |             try:
1009 |                 if old_wiki_config['gp-graph'] != wiki_config['gp-graph']:
1010 |                     self.trace("switching to graph %s" % wiki_config['gp-graph'])
1011 |                     self.gp.use_graph(wiki_config['gp-graph'])
1012 |                 else:
1013 |                     self.gp.ping() # check alive #TODO: make this optional
1014 | 
1015 |                 ok = True
1016 |             except gpException as e:
1017 |                 self.warning( "%s" % e )
1018 | 
1019 |             if ok:
1020 |                 self.trace("re-using gp connection to %s" % wiki_config['gp-host'])
1021 |             else:
1022 |                 self.trace("closing broken gp connection")
1023 |                 self.gp.close() #FIXME: graphserv bug: connection stays open
1024 |                 self.gp = None
1025 | 
1026 |         elif self.gp:
1027 |             self.trace("closing old gp connection")
1028 |             self.gp.close()
1029 |             self.gp = None
1030 | 
1031 |         if not self.gp:
1032 |             self.log("creating fresh gp connection to %s" % wiki_config['gp-host'])
1033 | 
1034 |             self.gp = MediaWikiGlue.new_client_connection(None, wiki_config['gp-host'], wiki_config.getint('gp-port') )
1035 |             self.gp.setDebug( self.debug )
1036 |             self.gp.connect()
1037 | 
1038 |             if wiki_config['gp-auth'] and wiki_config['gp-credentials']:
1039 |                 self.gp.authorize(wiki_config['gp-auth'], wiki_config['gp-credentials'])
1040 | 
1041 |             if wiki_config['gp-graph']:
1042 |                 g = wiki_config['gp-graph']
1043 | 
1044 |                 if create:
1045 |                     created = self.gp.try_create_graph(g)
1046 |                     if created: self.log("created graph %s" % g)
1047 |                     #else: self.log("could not create graph %s" % g)
1048 | 
1049 |                 self.gp.use_graph(wiki_config['gp-graph'])
1050 | 
1051 |         #TODO: set debug if desired
1052 | 
1053 |         if ( old_wiki_config is not None and self.gp.connection
1054 |              and old_wiki_config['mysql-host'] == wiki_config['mysql-host']
1055 |              and old_wiki_config['mysql-port'] == wiki_config['mysql-port'] ):
1056 | 
1057 |             ok = False
1058 | 
1059 |             try:
1060 |                 self.trace("switching to database %s on %s" % (wiki_config['mysql-database'], wiki_config['mysql-host']))
1061 |                 self.gp.mysql_select_db( wiki_config['mysql-database'] )
1062 |                 ok = True
1063 |             except _mysql_exceptions.OperationalError as e:
1064 |                 self.warning( "MySQL error: %s" % e )
1065 |             except IOError as (errno, strerror):
1066 |                 self.warning( "I/O error({0}): {1}".format(errno, strerror) )
1067 | 
1068 |             if ok:
1069 |                 self.trace("re-using db connection to %s" % wiki_config['mysql-host'])
1070 |             else:
1071 |                 self.trace("closing broken db connection")
1072 |                 self.gp.connection.close() #TODO: encapsulate!
1073 |                 self.gp.connection = None
1074 | 
1075 |         elif self.gp.connection:
1076 |             self.trace("closing old db connection")
1077 |             self.gp.connection.close() #TODO: encapsulate!
1078 |             self.gp.connection = None
1079 | 
1080 |         if not self.gp.connection:
1081 |             h = wiki_config['mysql-host']
1082 |             if 'mysql-port' in wiki_config and wiki_config['mysql-port']:
1083 |                 h += ":" + wiki_config['mysql-port']
1084 | 
1085 |             self.log("creating fresh db connection to %s" % h)
1086 | 
1087 |             self.gp.mysql_connect( server = wiki_config['mysql-host'],
1088 |                                    port = int(wiki_config['mysql-port']),
1089 |                                    db = wiki_config['mysql-database'],
1090 |                                    username = wiki_config['mysql-user'],
1091 |                                    password = wiki_config['mysql-password'] )
1092 | 
1093 |             self.gp.mysql_autocommit(True) #NOTE: needed for reads too, so we see changes at all
1094 | 
1095 |         self.gp.table_prefix = wiki_config['mysql-prefix']
1096 | 
1097 |         if 'mysql-temp-db' in wiki_config:
1098 |             self.gp.temp_table_db = wiki_config['mysql-temp-db']
1099 | 
1100 |         if 'slow-query-comment' in wiki_config:
1101 |             self.slow_query_comment = wiki_config['slow-query-comment']
1102 | 
1103 |         self.wiki_config = wiki_config
1104 |         self.current_wiki = wiki
1105 | 
1106 |     def add_job(self, job):
1107 |         with self.job_lock:
1108 |             self.trace("add_job: %s" % job)
1109 | 
1110 |             try: #try whether job is actually a sequence
1111 |                 self.jobs.extend( job )
1112 |             except TypeError: #not a sequence, just one job
1113 |                 self.jobs.append( job )
1114 | 
1115 |             self.trace("jobs now: %s" % self.jobs)
1116 | 
1117 |     def remove_jobs(self, wiki):
1118 |         self.trace("remove_jobs: %s" % (wiki,) )
1119 | 
1120 |         c = 0
1121 |         with self.job_lock:
1122 |             i = 0
1123 |             while i < len(self.jobs):
1124 |                 j = self.jobs[i]
1125 | 
1126 |                 if j.wiki == wiki:
1127 |                     self.trace("remove_jobs: removing %s" % j)
1128 |                     del self.jobs[i]
1129 |                     c += 1
1130 |                 else:
1131 |                     i += 1
1132 | 
1133 |         self.trace("jobs now: %s" % self.jobs)
1134 |         return c
1135 | 
1136 |     def run_next_job(self):
1137 |         if not self.jobs:
1138 |             return None
1139 | 
1140 |         with self.job_lock:
1141 |             job = self.jobs[0]
1142 |             del self.jobs[0]
1143 | 
1144 |             self.trace("popped job %s " % job)
1145 |             self.trace("jobs now: %s" % self.jobs)
1146 | 
1147 |         done = False
1148 | 
1149 |         try:
1150 |             followup = job.execute()
1151 |             done = True
1152 | 
1153 |             if followup:
1154 |                 self.add_job( followup )
1155 | 
1156 |         except IOError as (errno, strerror):
1157 |             self.error( "I/O error({0}): {1}\n".format(errno, strerror), (errno, strerror) )
1158 |         except gpException as e:
1159 |             self.error( "GraphServ error: %s\n" % e, e )
1160 |         except ConfigParser.NoSectionError as e:
1161 |             self.error( "Configuration error: %s\n" % e, e )
1162 |         except ConfigParser.NoOptionError as e:
1163 |             self.error( "Configuration error: %s\n" % e, e )
1164 |         except _mysql_exceptions.OperationalError as e:
1165 |             self.error( "MySQL error: %s\n" % e, e )
1166 | 
1167 |         if not done:
1168 |             job.reset()
1169 | 
1170 |             with self.job_lock:
1171 |                 self.trace("queueing job after error: %s " % job)
1172 |                 self.jobs.append(job)
1173 | 
1174 |                 self.trace("jobs now: %s" % self.jobs)
1175 | 
1176 |         return job
1177 | 
1178 |     def stop(self):
1179 |         self.trace("stopping feeder's job loop")
1180 |         self._stopped = True
1181 | 
1182 |         with self.job_lock:
1183 |             self.jobs[:] = [] #clear
1184 | 
1185 |         self.disconnect()
1186 | 
1187 |     def run_jobs( self, delay = None ):
1188 |         try:
1189 |             while ( not self._stopped
1190 |                     and ( self.jobs or not self.terminate_when_empty ) ):
1191 | 
1192 |                 if self._frozen:
1193 |                     self.log("job processing loop frozen")
1194 | 
1195 |                     while self._frozen and not self._stopped:
1196 |                         time.sleep(1) # polling is not elegant, but simple and reliable
1197 | 
1198 |                     if self._stopped:
1199 |                         break
1200 | 
1201 |                     self.log("job processing loop unfrozen")
1202 | 
1203 |                 self.run_next_job()
1204 | 
1205 |                 if delay:
1206 |                     time.sleep(delay)
1207 | 
1208 |         except KeyboardInterrupt:
1209 |             self.warning("KeyboardInterrupt, terminating")
1210 |             self.cmd_shutdown()
1211 | 
1212 |             if threading.current_thread().name != "MainThread": #XXX: is there a better way?
1213 |                 threading.interrupt_main()
1214 | 
1215 |     def schedule_load(self, wikis, start_polling = False):
1216 |         for wiki in wikis:
1217 |             wiki_config = self.get_wiki_config(wiki)
1218 |             namespaces = self.get_namespaces( wiki_config )
1219 | 
1220 |             job = StartLoadJob(self, wiki, namespaces, start_polling = start_polling)
1221 |             self.add_job( job )
1222 | 
1223 |     def schedule_update(self, wikis, since = None, keep_polling = False):
1224 |         for wiki in wikis:
1225 |             wiki_config = self.get_wiki_config(wiki)
1226 |             namespaces = self.get_namespaces( wiki_config )
1227 | 
1228 |             job = UpdateModifiedJob(self, wiki, namespaces = namespaces, since = since, keep_polling = keep_polling)
1229 |             self.add_job( job )
1230 | 
1231 |             job = UpdateDeletedJob(self, wiki, namespaces = namespaces, since = since, keep_polling = keep_polling)
1232 |             self.add_job( job )
1233 | 
1234 |     def get_namespaces(self, wiki_config):
1235 |         if not 'namespaces' in wiki_config:
1236 |             return None
1237 | 
1238 |         nn = wiki_config['namespaces']
1239 | 
1240 |         if type(nn) in (str, unicode):
1241 |             nn = nn.split(",")
1242 | 
1243 |         namespaces = set()
1244 | 
1245 |         for n in nn:
1246 |             namespaces.add( int(n) )
1247 |             #TODO: catch ValueError, try namespace names!
1248 | 
1249 |         return namespaces
1250 | 
1251 | 
1252 |     def get_wiki_config(self, w):
1253 |         if w == self.current_wiki and self.wiki_config is not None:
1254 |             return self.wiki_config
1255 | 
1256 |         wiki_config = ConfigDict(self.config, w)
1257 | 
1258 |         #inject derivative defaults
1259 |         if 'gp-graph' not in wiki_config or wiki_config['gp-graph'] is None:
1260 |             wiki_config['gp-graph'] = wiki_config['mysql-database']
1261 | 
1262 |         if 'gp-credentials' not in wiki_config or wiki_config['gp-credentials'] is None:
1263 |             cred = wiki_config['mysql-user'] + ":" + wiki_config['mysql-password']
1264 |             wiki_config['gp-credentials'] = cred
1265 | 
1266 |         return wiki_config
1267 | 
1268 | 
1269 |     def cmd_help(self, *wikis, **opt):
1270 |         print_script_help()
1271 | 
1272 |     def cmd_verbose(self, *wikis, **opt):
1273 |         self.verbose = True
1274 |         self.debug = False
1275 | 
1276 |         if self.gp:
1277 |             self.gp.setDebug(False)
1278 | 
1279 |     def cmd_debug(self, *wikis, **opt):
1280 |         self.verbose = True
1281 |         self.debug = True
1282 | 
1283 |         if self.gp:
1284 |             self.gp.setDebug(True)
1285 | 
1286 |     def cmd_hush(self, *wikis, **opt):
1287 |         self.verbose = False
1288 |         self.debug = False
1289 | 
1290 |         if self.gp:
1291 |             self.gp.setDebug(False)
1292 | 
1293 |     def cmd_freeze(self, *wikis, **opt):
1294 |         self._frozen = True
1295 | 
1296 |     def cmd_unfreeze(self, *wikis, **opt):
1297 |         self._frozen = False
1298 | 
1299 |     def cmd_jobs(self, *wikis, **opt):
1300 |         if not self.jobs:
1301 |             print "no jobs"
1302 |             return
1303 | 
1304 |         for j in self.jobs:
1305 |             print j
1306 | 
1307 |     def cmd_shutdown(self, *wikis, **opt):
1308 |         self.stop()
1309 | 
1310 |     def cmd_die(self, *wikis, **opt):
1311 |         self.warning("dying")
1312 |         os._exit(11) #FIXME: leaves tty dirty. but sys.exit doesn't work!
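        # (sys.exit only raises SystemExit in the calling thread; when invoked
        #  from the script thread it would not take down the whole process.
        #  os._exit terminates the process immediately, skipping any cleanup.)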
1313 | 
1314 |     def cmd_reload_config(self, *wikis, **opt):
1315 |         self.load_config()
1316 | 
1317 |     def cmd_update(self, *wikis, **opt):
1318 |         opt['keep_polling'] = False
1319 |         self.schedule_update(wikis, **opt)
1320 | 
1321 |     def cmd_start(self, *wikis, **opt):
1322 |         opt['keep_polling'] = True
1323 |         self.schedule_update(wikis, **opt)
1324 | 
1325 |     def cmd_stop(self, *wikis, **opt):
1326 |         for w in wikis:
1327 |             self.remove_jobs(w)
1328 | 
1329 |     def cmd_drop(self, *wikis, **opt):
1330 |         for w in wikis:
1331 |             self.remove_jobs(w)
1332 |             self.drop_wiki_graph(w)
1333 | 
1334 |     def cmd_launch(self, *wikis, **opt):
1335 |         self.cmd_stop(*wikis, **opt)
1336 | 
1337 |         self.load_config()
1338 | 
1339 |         opt['start_polling'] = True
1340 |         self.schedule_load(wikis, **opt)
1341 | 
1342 |     def cmd_load(self, *wikis, **opt):
1343 |         self.cmd_stop(*wikis, **opt)
1344 | 
1345 |         self.load_config()
1346 | 
1347 |         opt['start_polling'] = False
1348 |         self.schedule_load(wikis, **opt)
1349 | 
1350 |     def cmd_list_jobs(self, *wikis, **opt):
1351 |         for j in self.jobs:
1352 |             print j
1353 | 
1354 |     def cmd_gc(self, *wikis, **opt):
1355 |         gc.collect(2) # run full garbage collection of all generations
1356 | 
1357 |     def get_heapy(self):
1358 |         if not self.heapy:
1359 |             try:
1360 |                 import guppy
1361 |                 self.heapy = guppy.hpy()
1362 | 
1363 |                 self.log("loaded guppy library.")
1364 |             except ImportError as ex:
1365 |                 self.warning("guppy library not installed: %s" % ex)
1366 | 
1367 |         return self.heapy
1368 | 
1369 |     def cmd_heapy(self, *wikis, **opt):
1370 |         hpy = self.get_heapy()
1371 |         if not hpy: return #guppy is not installed; get_heapy already warned
1372 |         print hpy.heap()
1373 |         print ""
1374 |         print hpy.iso(self)
1375 | 
1376 |     def cmd_heapy_browser(self, *wikis, **opt):
1377 |         hpy = self.get_heapy()
1378 |         if not hpy: return #guppy is not installed; get_heapy already warned
1379 |         print "launching profile browser"
1380 |         hpy.pb()
1381 | 
1382 |     def cmd_prstat(self, *wikis, **opt):
1383 |         usage = resource.getrusage(resource.RUSAGE_SELF)
1384 |         pgsize = resource.getpagesize()
1385 | 
1386 |         print "utime: %d sec" % (usage.ru_utime,)
1387 |         print "stime: %d sec" % (usage.ru_stime,)
1388 |         if usage.ru_maxrss: print "peak physical memory (maxrss): %d KB" % (usage.ru_maxrss * pgsize / 1024,)
1389 |         if usage.ru_idrss: print "current physical memory (idrss): %d KB" % (usage.ru_idrss * pgsize / 1024,)
1390 |         if usage.ru_ixrss: print "current shared memory (ixrss): %d KB" % (usage.ru_ixrss * pgsize / 1024,)
1391 | 
1392 |         st = proc_stat()
1393 |         if st:
1394 |             if "rss" in st: print "current physical memory (rss): %d KB" % (int(st["rss"]), )
1395 |             if "vsize" in st: print "current virtual memory (vsize): %d KB" % (int(st["vsize"]) / 1024, )
1396 | 
1397 |     def cmd_heapy_baseline(self, *wikis, **opt):
1398 |         if not self.heapy:
1399 |             raise Exception("heapy not loaded")
1400 | 
1401 |         self.heapy.setref()
1402 |         print "new baseline set for heapy"
1403 | 
1404 |     def cmd_py(self, *wikis, **opt):
1405 |         if not self.options.interactive:
1406 |             print "python shell is only allowed in interactive mode (use --interactive)"
1407 |             return
1408 | 
1409 |         print "launching python shell. use EOF (Ctrl-D) to return."
1410 |         code.interact( banner="++++++++++++++++ Python Shell ++++++++++++++++",
1411 |                        local = { "feeder" : self,
1412 |                                  "gpfeeder": globals(),
1413 |                                  "options": self.options,
1414 |                                  "jobs": self.jobs, } )
1415 |         print "python shell terminated, welcome back."
1416 | 
1417 | 
1418 |     def start_job_worker(self, delay = 1, daemon = False):
1419 |         worker = threading.Thread( name="Job Worker", target = self.run_jobs, kwargs = { "delay": delay } )
1420 |         worker.daemon = daemon
1421 |         worker.start()
1422 |         return worker
1423 | 
1424 |     def run_script(self, script):
1425 |         loop = Script(self, script)
1426 |         loop.run()
1427 | 
1428 |     def start_script(self, script, daemon = False):
1429 |         loop = Script(self, script)
1430 |         loop.daemon = daemon
1431 | 
1432 |         loop.start()
1433 |         return loop
1434 | 
1435 |     def load_config(self):
1436 |         # find config file........
1437 |         bindir = os.path.dirname( os.path.realpath( sys.argv[0] ) )
1438 | 
1439 |         if self.options.config_file:
1440 |             cfg = self.options.config_file #take it from --config
1441 |         else:
1442 |             cfg = bindir + "/gpfeeder.ini" #installation root
1443 | 
1444 |         config_defaults = {}
1445 | 
1446 |         # read .my.cnf........
1447 |         dbcnf_path = os.path.expanduser( "~/.my.cnf" )
1448 | 
1449 |         config_defaults['mysql-host'] = 'localhost'
1450 |         config_defaults['mysql-port'] = "3306"
1451 |         config_defaults['mysql-database'] = 'wiki'
1452 |         config_defaults['mysql-user'] = 'gpfeeder'
1453 |         config_defaults['mysql-password'] = 'gpfeeder'
1454 | 
1455 |         if os.path.exists(dbcnf_path):
1456 |             self.log( "reading database config from %s" % dbcnf_path )
1457 |             dbcnf = ConfigParser.SafeConfigParser(allow_no_value=True)
1458 |             dbcnf.add_section( 'client' )
1459 | 
1460 |             try:
1461 |                 if dbcnf.read( dbcnf_path ):
1462 |                     for n in ('host', 'port', 'database', 'user', 'password'):
1463 |                         if dbcnf.has_option("client", n):
1464 |                             config_defaults['mysql-' + n] = dbcnf.get("client", n)
1465 |                         elif dbcnf.has_option("DEFAULT", n):
1466 |                             config_defaults['mysql-' + n] = dbcnf.get("DEFAULT", n)
1467 |             except ConfigParser.Error:
1468 |                 self.warning( "failed to read mysql client config from %s." % dbcnf_path )
1469 | 
1470 | 
1471 |         # define more config defaults........
1472 |         config_defaults['mysql-prefix'] = ''
1473 | 
1474 |         config_defaults['gp-host'] = 'localhost'
1475 |         config_defaults['gp-port'] = str(PORT) #ConfigParser defaults should be strings
1476 |         config_defaults['gp-auth'] = 'password'
1477 |         #config_defaults['gp-graph'] = ''
1478 |         config_defaults['gp-credentials'] = None
1479 |         config_defaults['gp-graph'] = None
1480 |         config_defaults['load-chunk-size'] = '32000'
1481 |         config_defaults['update-max-cats'] = '100'
1482 |         config_defaults['load-query-limit'] = ''
1483 | 
1484 |         config_defaults['state-file'] = bindir + '/gpfeeder-state.p'
1485 |         #config_defaults['state-table'] = 'gpfeeder_state'
1486 | 
1487 |         config_defaults['poll-delay'] = "1"
1488 | 
1489 |         # load config........
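        # A sketch of a minimal gpfeeder.ini (the wiki name "mywiki" and its
        # values are illustrative; see --config-help for all settings):
        #
        #   [DEFAULT]
        #   gp-host = localhost
        #
        #   [mywiki]
        #   mysql-database = mywiki
        #   mysql-user = gpfeeder
        #   mysql-password = gpfeeder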
1490 |         config = ConfigParser.SafeConfigParser(defaults = config_defaults)
1491 | 
1492 |         if os.path.exists(cfg):
1493 |             self.log( "reading config file %s" % cfg )
1494 | 
1495 |             if not config.read( cfg ):
1496 |                 self.error( "failed to read config from %s" % cfg )
1497 |                 return False
1498 | 
1499 |         self.config = config
1500 |         return True
1501 | 
1502 | 
1503 | class ConfigDict(object):
1504 |     def __init__(self, config, section):
1505 |         self.config = config
1506 |         self.section = section
1507 | 
1508 |     def __getitem__(self, k):
1509 |         return self.config.get(self.section, k)
1510 | 
1511 |     def __delitem__(self, k):
1512 |         return self.config.remove_option(self.section, k)
1513 | 
1514 |     def __setitem__(self, k, v):
1515 |         return self.config.set(self.section, k, str(v))
1516 | 
1517 |     def __contains__(self, k):
1518 |         return self.config.has_option(self.section, k)
1519 | 
1520 |     def __iter__(self):
1521 |         return self.iterkeys()
1522 | 
1523 |     def iterkeys(self):
1524 |         for key, value in self.items():
1525 |             yield key
1526 | 
1527 |     def items(self):
1528 |         return self.config.items(self.section)
1529 | 
1530 |     def initialize(self, k, v):
1531 |         if not k in self:
1532 |             self[k] = str(v)
1533 | 
1534 |     def __len__(self):
1535 |         return len( self.items() )
1536 | 
1537 |     def get(self, k):
1538 |         if k in self:
1539 |             return self[k]
1540 |         else:
1541 |             return None
1542 | 
1543 |     def getint(self, k):
1544 |         v = self.get(k)
1545 |         if v is None: return None
1546 | 
1547 |         return int(v)
1548 | 
1549 | 
1550 | ##################################################################################
1551 | 
1552 | def print_script_help():
1553 |     print "gpfeeder can process very simple scripts, in the form of lists of "
1554 |     print "commands. Commands are processed one per line of input. Available "
1555 |     print "commands are: "
1556 |     print " "
1557 |     print "  help            show this help text"
1558 |     print "  load ...        loads structure of the given wikis"
1559 |     print "  launch ...      loads wikis and starts polling for updates"
1560 |     print "  update ...      updates the structure of the given wikis"
1561 |     print "  start ...       starts polling for changes on the wikis"
1562 |     print "  stop ...        stops polling for the given wikis"
1563 |     print "  drop ...        stops polling and removes structures from graphserv"
1564 |     print "  reload-config   reloads the configuration file"
1565 |     print " "
1566 |     print "  freeze          freeze job queue processing"
1567 |     print "  unfreeze        unfreeze job queue processing"
1568 |     print "  exit            terminate script (for interactive mode)"
1569 |     print "  shutdown        shuts down the gpfeeder"
1570 |     print " "
1571 |     print "  verbose         enables verbose output."
1572 |     print "  debug           enables debug (and verbose) output."
1573 |     print "  hush            disables debug and verbose output."
1574 |     print " "
1575 |     print "  jobs            list current job queue."
1576 |     print "  prstat          output process statistics (system memory, etc)."
1577 |     print "  heapy           uses guppy's heapy to show heap summary."
1578 |     print "  gc              run full garbage collection."
1579 |     print "  py              starts interactive python shell."
1580 |     print "                  NOTE: works in interactive mode only."
1581 | 
1582 | 
1583 | def print_config_help():
1584 |     bindir = os.path.dirname( os.path.realpath( sys.argv[0] ) )
1585 | 
1586 |     print "gpfeeder requires a configuration file to operate. Per default, the "
1587 |     print "configuration is loaded from gpfeeder.ini in the directory where "
1588 |     print "gpfeeder.py is located, that is, from: "
1589 |     print "    " + bindir + "/gpfeeder.ini"
1590 |     print "A different configuration file may be specified using the --config"
1591 |     print "option."
1592 |     print " "
1593 |     print "The configuration file uses the INI-file syntax (as specified by python's "
1594 |     print "ConfigParser class). Note that no settings may occur outside sections."
1595 |     print "Each section in the config file provides settings for a specific wiki."
1596 |     print "A special section called DEFAULT may be used to provide default settings"
1597 |     print "that apply to all wikis."
1598 |     print " "
1599 |     print "The following settings control the connection to the MySQL database: "
1600 |     print "  mysql-host       the host to connect to. Default is 'localhost'"
1601 |     print "  mysql-port       the port to connect to. Default is 3306"
1602 |     print "  mysql-user       the user for authentication to MySQL. Default is 'gpfeeder'"
1603 |     print "  mysql-password   the password for authentication to MySQL. Default: 'gpfeeder'"
1604 |     print "  mysql-database   the database containing the wiki. Default is 'wiki'"
1605 |     print "  mysql-prefix     the table prefix used by the wiki. Default is none ('')"
1606 |     print "  mysql-temp-db    database to use for temp tables (default: use the wiki's database)"
1607 |     print " "
1608 |     print "If not given in the config file, the settings for mysql-host, mysql-port, mysql-user, "
1609 |     print "mysql-password and mysql-database are taken from the respective settings in the"
1610 |     print "standard MySQL client configuration file (.my.cnf) in the user's home directory."
1611 |     print " "
1612 |     print "The following settings control the connection to the graphserv instance: "
1613 |     print "  gp-host          the host to connect to. Default is 'localhost'."
1614 |     print "  gp-port          the port to connect to. Default is 6666"
1615 |     print "  gp-auth          the authentication method. Default is 'password'"
1616 |     print "  gp-credentials   the authentication credentials. For password auth, use"
1617 |     print "                   user:password syntax. The default is derived from the "
1618 |     print "                   mysql-user and mysql-password settings."
1619 |     print "  gp-graph         the graph name for the wiki. Default is the "
1620 |     print "                   mysql-database setting."
1621 |     print " "
1622 |     print "The following settings control the operation of gpfeeder: "
1623 |     print "  load-chunk-size  the maximum number of arcs to push into graphserv in a"
1624 |     print "                   single command when initially loading a category structure."
1625 |     print "                   The default is 32000 arcs."
1626 |     print "  load-query-limit the maximum number of arcs to fetch from the database"
1627 |     print "                   in one go while loading the category structure."
1628 |     print "  update-max-cats  the maximum number of categories to process for a given wiki"
1629 |     print "                   during an update pass. The default is 100 categories."
1630 |     print "                   Note that there are two types of updates (modifications"
1631 |     print "                   and deletions), and this limit is applied to each kind. "
1632 |     print "  state-file       the file to store the update state in. With the help of this"
1633 |     print "                   file, gpfeeder can start the next update pass exactly where"
1634 |     print "                   the last one (or the original import) left off. Per default, "
1635 |     print "                   gpfeeder-state.p in the same directory as gpfeeder.py is used."
1636 |     print "                   Note that this file will only be used if it is not possible to"
1637 |     print "                   maintain the state in graphserv itself (graphserv protocol v4)."
1638 |     print "  poll-delay       the number of seconds to wait between polls. This is intended "
1639 |     print "                   to avoid busy waiting. The default value is 1 second. This "
1640 |     print "                   setting may be overridden using the --poll-delay option."
1641 |     print "  slow-query-comment a comment to be injected into SQL queries that are expected "
1642 |     print "                   to be slow. "
1643 | 
1644 | 
1645 | def get_options():
1646 |     bindir = os.path.dirname( os.path.realpath( sys.argv[0] ) )
1647 | 
1648 |     option_parser = optparse.OptionParser()
1649 |     option_parser.set_usage("gpfeeder.py [options] [wiki...]")
1650 |     option_parser.add_option("--config", dest="config_file",
1651 |                              help="config file", metavar="FILE")
1652 |     option_parser.add_option("--config-help", dest="config_help", action="store_true",
1653 |                              help="show configuration file help and exit" )
1654 |     option_parser.add_option("--script-help", dest="script_help", action="store_true",
1655 |                              help="show script syntax help and exit" )
1656 |     option_parser.add_option("--verbose", dest="verbose", action="store_true",
1657 |                              help="enable verbose output (fairly noisy)")
1658 |     option_parser.add_option("--debug", dest="debug", action="store_true",
1659 |                              help="enable debug output (extremely noisy)")
1660 |     option_parser.add_option("--load", dest="load", action="store_true",
1661 |                              help="load category structure")
1662 |     option_parser.add_option("--update", dest="update", action="store_true",
1663 |                              help="update category structure once and exit")
1664 |     option_parser.add_option("--update-since", dest="update_since",
1665 |                              help="start updates from date/time RC_TIMESTAMP (single wiki only)", metavar="RC_TIMESTAMP")
1666 |     option_parser.add_option("--poll", dest="poll", action="store_true",
1667 |                              help="poll for updates periodically, do not terminate")
1668 |     option_parser.add_option("--poll-delay", dest="poll_delay", default = 1, type = int,
1669 |                              help="sleep for SEC seconds between polls", metavar="SEC")
1670 |     option_parser.add_option("--all", dest="all", action="store_true",
1671 |                              help="process all wikis defined in the config file")
1672 |     option_parser.add_option("--interactive", dest="interactive", action="store_true",
1673 |                              help="accept interactive commands from stdin")
1674 |     option_parser.add_option("--script", dest="script",
1675 |                              help="read commands from script (or fifo)")
1676 | 
1677 |     option_parser.add_option("--shell", dest="shell", action="store_true",
1678 |                              help="starts a python shell after all scripts are complete.")
1679 |     option_parser.add_option("--heapy", dest="heapy", action="store_true",
1680 |                              help="use guppy's heapy library for memory profiling.")
1681 |     option_parser.add_option("--dowser", dest="dowser", action="store_true",
1682 |                              help="use dowser library for memory profiling (experimental).")
1683 |     option_parser.add_option("--cherrypy", dest="cherrypy", action="store_true",
1684 |                              help="use cherrypy to provide web interface for debugging. Currently only works with --dowser.")
1645 | def get_options():
1646 |     bindir = os.path.dirname( os.path.realpath( sys.argv[0] ) )
1647 | 
1648 |     option_parser = optparse.OptionParser()
1649 |     option_parser.set_usage("gpfeeder.py [options] [wiki...]")
1650 |     option_parser.add_option("--config", dest="config_file",
1651 |                   help="config file", metavar="FILE")
1652 |     option_parser.add_option("--config-help", dest="config_help", action="store_true",
1653 |                   help="show configuration file help and exit" )
1654 |     option_parser.add_option("--script-help", dest="script_help", action="store_true",
1655 |                   help="show script syntax help and exit" )
1656 |     option_parser.add_option("--verbose", dest="verbose", action="store_true",
1657 |                   help="enable verbose output (fairly noisy)")
1658 |     option_parser.add_option("--debug", dest="debug", action="store_true",
1659 |                   help="enable debug output (extremely noisy)")
1660 |     option_parser.add_option("--load", dest="load", action="store_true",
1661 |                   help="load category structure")
1662 |     option_parser.add_option("--update", dest="update", action="store_true",
1663 |                   help="update category structure once and exit")
1664 |     option_parser.add_option("--update-since", dest="update_since",
1665 |                   help="start updates from date/time RC_TIMESTAMP (single wiki only)", metavar="RC_TIMESTAMP")
1666 |     option_parser.add_option("--poll", dest="poll", action="store_true",
1667 |                   help="poll for updates periodically, do not terminate")
1668 |     option_parser.add_option("--poll-delay", dest="poll_delay", default = 1, type = int,
1669 |                   help="sleep for SEC seconds between polls", metavar="SEC")
1670 |     option_parser.add_option("--all", dest="all", action="store_true",
1671 |                   help="process all wikis defined in the config file")
1672 |     option_parser.add_option("--interactive", dest="interactive", action="store_true",
1673 |                   help="accept interactive commands from stdin")
1674 |     option_parser.add_option("--script", dest="script",
1675 |                   help="read commands from script (or fifo)")
1676 | 
1677 |     option_parser.add_option("--shell", dest="shell", action="store_true",
1678 |                   help="start a Python shell after all scripts are complete.")
1679 |     option_parser.add_option("--heapy", dest="heapy", action="store_true",
1680 |                   help="use guppy's heapy library for memory profiling.")
1681 |     option_parser.add_option("--dowser", dest="dowser", action="store_true",
1682 |                   help="use dowser library for memory profiling (experimental).")
1683 |     option_parser.add_option("--cherrypy", dest="cherrypy", action="store_true",
1684 |                   help="use cherrypy to provide a web interface for debugging. Currently only works with --dowser.")
1685 |     option_parser.add_option("--cherrypy-port", dest="cherrypy_port", default=8008, metavar="P",
1686 |                   help="port to use for cherrypy web server (default: 8008).")
1687 | 
1688 |     (options, args) = option_parser.parse_args()
1689 | 
1690 |     options.script_on_stdin = ( not os.isatty(sys.stdin.fileno()) )
1691 | 
1692 |     if options.script_on_stdin:
1693 |         options.interactive = False
1694 | 
1695 |     if options.config_help:
1696 |         print_config_help()
1697 |         sys.exit(0)
1698 | 
1699 |     if options.script_help:
1700 |         print_script_help()
1701 |         sys.exit(0)
1702 | 
1703 |     if ( not options.load and not options.update and not options.poll
1704 |          and not options.script and not options.interactive and not options.script_on_stdin ):
1705 |         sys.stderr.write( "Nothing to do. Use at least one of --load, --update, --poll, --script, or --interactive\n" )
1706 |         sys.exit(1)
1707 | 
1708 |     if options.all and args:
1709 |         sys.stderr.write( "Conflicting arguments: do not list wikis if --all is provided.\n" )
1710 |         sys.exit(1)
1711 | 
1712 |     if options.shell and options.interactive:
1713 |         sys.stderr.write( "Conflicting arguments: do not use --shell and --interactive together.\n" )
1714 |         sys.exit(1)
1715 | 
1716 |     if (options.all or options.load) and options.update_since:
1717 |         sys.stderr.write( "Conflicting options: do not use --update-since together with --load or --all.\n" )
1718 |         sys.exit(1)
1719 | 
1720 |     if ( not options.all and not args and not options.script_on_stdin
1721 |          and not options.script and not options.interactive ):
1722 | 
1723 |         args = ( "DEFAULT", )
1724 | 
1725 |     options.verbose = ( options.verbose or options.debug )
1726 | 
1727 |     return (args, options)
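# Typical invocations accepted by get_options() above (a sketch only; the wiki
# name "dewiki" and the config path are hypothetical examples, not defaults):
#
#   gpfeeder.py --load dewiki                             # initial import of one wiki
#   gpfeeder.py --update --all                            # one update pass over all configured wikis
#   gpfeeder.py --poll --all                              # keep polling for updates until interrupted
#   gpfeeder.py --config /etc/gpfeeder.ini --interactive  # custom config, commands from stdin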
1728 | 
1729 | if __name__ == '__main__':
1730 |     (wikis, options) = get_options()
1731 | 
1732 |     if options.poll_delay:
1733 |         poll_delay = options.poll_delay
1734 |     else:
1735 |         poll_delay = config.getint("DEFAULT", 'poll-delay')
1736 | 
1737 |     gc.enable()
1738 | 
1739 |     try:
1740 |         feeder = Feeder(options)
1741 | 
1742 |         if not feeder.load_config(): #XXX: do that implicitly somewhere?
1743 |             sys.exit(1)
1744 | 
1745 |         if options.all:
1746 |             wikis = feeder.config.sections() # go through all explicitly defined wikis
1747 | 
1748 |             if not wikis:
1749 |                 feeder.error( "No sections found in %s\n" % cfg )
1750 |                 feeder.error( "If --all is given, the config file must define a section for each wiki to process." )
1751 |                 sys.exit(1)
1752 | 
1753 |         if options.script:
1754 |             static_script = os.path.isfile(options.script) # reading from a fifo?
1755 |         else:
1756 |             static_script = True
1757 | 
1758 |         if options.load:
1759 |             feeder.schedule_load(wikis, start_polling = options.poll)
1760 |         else:
1761 |             feeder.schedule_update(wikis, since = options.update_since, keep_polling = options.poll)
1762 | 
1763 |         worker = feeder.start_job_worker( delay = poll_delay, daemon = options.interactive )
1764 | 
1765 |         if options.script:
1766 | 
1767 |             if static_script:
1768 |                 feeder.log("reading script from %s" % options.script)
1769 |             else:
1770 |                 if options.script_on_stdin or options.interactive:
1771 |                     feeder.error("Conflicting options: cannot read a script from a fifo and also handle stdin.")
1772 |                     sys.exit(1)
1773 | 
1774 |                 feeder.log("reading commands from pipe at %s" % options.script)
1775 | 
1776 |             feeder.run_script(options.script, daemon = not static_script)
1777 | 
1778 |         if options.script_on_stdin or options.interactive:
1779 |             if options.script_on_stdin:
1780 |                 feeder.log("reading script from stdin")
1781 |                 loop = feeder.run_script(sys.stdin)
1782 |             else:
1783 |                 #interactive
1784 |                 feeder.log("++++++++++++++++++ reading interactive commands from stdin ++++++++++++++++++")
1785 | 
1786 |                 import readline
1787 |                 loop = feeder.run_script(None) #use raw_input, which uses readline
1788 |             feeder.log("input closed, stopping")
1789 |             feeder.stop()
1790 | 
1791 |         if options.shell:
1792 |             feeder.log("launching Python shell.")
1793 |             code.interact( banner="++++++++++++++++ Python Shell ++++++++++++++++",
1794 |                            local = { "feeder" : feeder,
1795 |                                      "worker": worker,
1796 |                                      "gpfeeder": globals(),
1797 |                                      "wikis": wikis,
1798 |                                      "options": options } )
1799 | 
1800 |         feeder.terminate_when_empty = True
1801 | 
1802 |         while worker.isAlive():
1803 |             #XXX: ugly hack: lazy polling spin lock, because Thread.join ignores KeyboardInterrupt (see the sketch after this file)
1804 |             time.sleep(0.200)
1805 | 
1806 |     except IOError as (errno, strerror):
1807 |         sys.stderr.write( "FAILED: I/O error({0}): {1}\n".format(errno, strerror) )
1808 |     except gpProtocolException as e:
1809 |         sys.stderr.write( "FAILED: GraphServ error: %s\n" % e )
1810 |     except gpProcessorException as e:
1811 |         sys.stderr.write( "FAILED: GraphServ error: %s\n" % e )
1812 |     except ConfigParser.NoSectionError as e:
1813 |         sys.stderr.write( "FAILED: Configuration error: %s\n" % e )
1814 |     except ConfigParser.NoOptionError as e:
1815 |         sys.stderr.write( "FAILED: Configuration error: %s\n" % e )
1816 |     except _mysql_exceptions.OperationalError as e:
1817 |         sys.stderr.write( "FAILED: MySQL error: %s\n" % e )
1818 |     except (KeyboardInterrupt, SystemExit):
1819 |         sys.stderr.write( "INTERRUPTED: doing cleanup and shutting down\n" )
1820 |         feeder.stop()
1821 | 
--------------------------------------------------------------------------------
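A standalone sketch of the join workaround used at the end of gpfeeder.py: in
CPython 2, Thread.join() without a timeout blocks in a way that ignores
KeyboardInterrupt on many platforms, so the main thread polls isAlive() with a
short sleep instead. The work() function below is a hypothetical stand-in for
the real job worker.

    import threading
    import time

    def work():
        time.sleep(10)  # hypothetical stand-in for the real job worker

    worker = threading.Thread(target = work)
    worker.setDaemon(True)  # let the process exit even if the worker is still running
    worker.start()

    try:
        # interruptible wait: a plain worker.join() would swallow Ctrl-C here
        while worker.isAlive():
            time.sleep(0.200)
    except KeyboardInterrupt:
        print "interrupted, shutting down"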