├── __init__.py ├── .gitignore ├── logprovider.py ├── cfg └── config.json ├── config.py ├── client_async.py ├── result.py ├── server.py ├── message.py ├── listener.py ├── server_mp.py ├── client.py ├── modulewatcher.py ├── dbbase.py ├── testing.py ├── pages.py ├── test_downloader.py ├── mysql.py ├── pgsql.py ├── sqlite.py ├── downloader.py └── utils.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | -------------------------------------------------------------------------------- /logprovider.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class LogProvider(object): 5 | """make loggers""" 6 | def __init__(self): 7 | super(LogProvider, self).__init__() 8 | 9 | def get(self, config, suffix): 10 | """get logger with name 11 | 12 | :config: config class 13 | :suffix: additional text 14 | :returns: @todo 15 | 16 | """ 17 | txt = '{}.{}'.format(config.g('logger.base'), suffix) 18 | return logging.getLogger(txt) 19 | 20 | -------------------------------------------------------------------------------- /cfg/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "caching": "no", 3 | "proxies": "no", 4 | "timeout": 60, 5 | "logger": { 6 | "template" : "%(asctime)s - %(levelname)-6s - %(name)s - %(lineno)d - %(message)s", 7 | "path": "../logs/testing.log", 8 | "base": "app_name", 9 | "level": "DEBUG", 10 | "console": { "off": "yes", "level": "INFO" }, 11 | "file": { "level": "DEBUG" }, 12 | "backupsize": 100000000 13 | }, 14 | "db": { 15 | "sqlite": { 16 | "file": "database_file_name", 17 | "timeout": 10, 18 | "same_thread": 1 19 | } 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class Config(object): 5 | """manage configurations""" 6 | def __init__(self, cfgfile='cfg/config.json'): 7 | super(Config, self).__init__() 8 | self.cfgfile = cfgfile 9 | with open(self.cfgfile) as jsonfile: 10 | self.cfg = json.load(jsonfile) 11 | 12 | def get(self, *args, **kwargs): 13 | k = self.cfg 14 | for kw in args: 15 | k = k[kw] 16 | return k 17 | 18 | def g(self, ky, default=False): 19 | keys = ky.split('.') 20 | k = self.cfg 21 | for kwrd in keys: 22 | if kwrd not in k: 23 | return default 24 | k = k[kwrd] 25 | return k 26 | -------------------------------------------------------------------------------- /client_async.py: -------------------------------------------------------------------------------- 1 | from libs.config import Config 2 | import logging 3 | from libs.message import Message 4 | import asyncore 5 | 6 | 7 | class AsyncHandler(asyncore.dispatcher_with_send): 8 | 9 | def __init__(self, sock, request_provider): 10 | self.request_provider = request_provider 11 | cfg = Config() 12 | lcfg = '{}.client'.format(cfg.g('logger.base')) 13 | self.logger = logging.getLogger(lcfg) 14 | asyncore.dispatcher_with_send.__init__(self, sock) 15 | 16 | def handle_read(self): 17 | """reads data from socket""" 18 | data = self.recv(2048) 19 | data = data.strip().decode('utf-8') 20 | self.logger.debug('received: %s', data) 21 | self.request = self.request_provider(data, Message(self)) 22 | 
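        # (Config.g above resolves dotted keys against cfg/config.json,
        #  e.g. Config().g('logger.console.level') -> 'INFO' and
        #  Config().g('no.such.key', 'fallback') -> 'fallback' -- a usage
        #  sketch; get('logger', 'console', 'level') is the *args variant.)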
self.request.check_request()
--------------------------------------------------------------------------------
/result.py:
--------------------------------------------------------------------------------

def _rst(result, k='s', v=None):
    if v is None:
        return result[k]
    result[k] = v

def s(result, v=None):
    return _rst(result, 's', v)

def m(result, v=None):
    return _rst(result, 'match', v)

def o(result, v=None):
    return _rst(result, 'original', v)

def b(result, v=None):
    return _rst(result, 'by', v)

def i(result, v=None):
    return _rst(result, 'info', v)

def gt(m, o, i):
    return { "s": True, "match": m, "original": o, "info": i, "by": 'title' }

def gc(m, o, i):
    return { "s": True, "match": m, "original": o, "info": i, "by": 'content' }

def gl(m, o, i):
    return { "s": True, "match": m, "original": o, "info": i, "by": 'link' }

def gf(o):
    return { "s": False, "original": o }
--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
from libs import listener
from libs import utils
from libs.mysql import MySQL

logger = utils.setup_logger()


class Server(object):
    """docstring for Server"""
    def __init__(self, host, port):
        super(Server, self).__init__()
        self.host = host
        self.port = int(port)
        self.db = MySQL()
        self.db.query('delete from runlogs')

    def start(self):
        conn = None
        try:
            conn = listener.ConnectionThread(self.host, self.port, self.db)
            utils.save_to_file('port', '{}'.format(self.port))
            conn.start()
        except Exception:
            logger.exception("FAILED")
            try:
                if conn:
                    conn.s.close()
            except Exception:
                pass
--------------------------------------------------------------------------------
/message.py:
--------------------------------------------------------------------------------
import json


# {{{ Messages
class Message(object):
    """sends messages over the socket connection"""
    def __init__(self, conn):
        super(Message, self).__init__()
        self.conn = conn

    def send_msg(self, msg, close=True):
        """
        send a length-prefixed message through the socket
        """
        try:
            msg = json.dumps(msg)
        except Exception:
            pass
        replylen = len(msg)
        msg = "%s\n%s" % (replylen, msg)
        self.conn.send(bytearray(msg, 'utf8'))
        if close:
            self.conn.close()

    def send_good_msg(self, msg):
        """
        send success message
        """
        self.send_msg({'s': 1, 'm': msg})

    def send_fail_msg(self, msg):
        """
        send failure message
        """
        self.send_msg({'s': 0, 'm': msg})

    def send_good_result(self, data):
        """
        send result on success
        """
        data['s'] = 1
        self.send_msg(data)

    def send_fail_result(self, data):
        """
        send result on failure
        """
        data['s'] = 0
        self.send_msg(data)

# }}}
--------------------------------------------------------------------------------
/listener.py:
--------------------------------------------------------------------------------
import logging
import socket
import threading
import sys
import os
from libs.config import Config
from inc.client import Client

g_config = Config()
l = '{}.server'.format(g_config.g('logger.base'))
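# (Message.send_msg in message.py above frames every reply as
#  "<length>\n<json>"; send_good_msg('ok'), for example, writes
#  b'19\n{"s": 1, "m": "ok"}' and the peer splits on the first
#  newline before json.loads() -- a sketch of the framing only.)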
logger = logging.getLogger(l)


class ConnectionThread(threading.Thread):

    def __init__(self, host, port, db):
        super(ConnectionThread, self).__init__()
        self.db = db
        self.db.query('delete from runlogs')
        try:
            self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self.s.bind((host, port))
            logger.info('listening to %s:%s', host, port)
            self.s.listen(3)
        except socket.error:
            self.s.close()
            sys.exit()
        self.clients = []

    def main_loop(self):
        conn = None
        try:
            while True:
                if os.path.exists('cache/stop'):
                    break
                conn, address = self.s.accept()
                logger.info('[+] Client connected: %s', address[0])
                c = Client(conn, self.db)
                c.start()
                self.clients.append(c)
        except Exception:
            logger.exception("Error!")
        finally:
            logger.info("[-] Closing connection")
            if conn is not None:
                conn.close()
            if os.path.exists('cache/stop'):
                os.remove('cache/stop')
            sys.exit()

    def run(self):
        self.main_loop()
--------------------------------------------------------------------------------
/server_mp.py:
--------------------------------------------------------------------------------
"""
multiprocessing server
"""
from libs import utils
from time import sleep

LOGGER = utils.setup_logger()


class Server(object):
    """docstring for ServerMultiProcess"""

    def __init__(self):
        super(Server, self).__init__()
        self.client_provider = None
        self.provider = None

    def set_client_provider(self, client_provider):
        """request handler"""
        self.client_provider = client_provider
        return self

    def set_provider(self, provider):
        """data provider

        :provider: object exposing get_queued_data()
        :returns: self, for chaining

        """
        self.provider = provider
        return self

    def start(self):
        """everything starts here"""
        process = None
        try:
            while True:
                try:
                    providerdata = self.provider.get_queued_data()
                    if providerdata is not None:
                        process = self.client_provider()\
                            .set_provider_data(providerdata)
                        process.start()
                    sleep(3)
                except KeyboardInterrupt:
                    break
                except Exception:
                    LOGGER.exception("FAILED")
                    break
        except Exception:
            LOGGER.exception("server out ...")
        finally:
            LOGGER.info('cleaning up')
            # set running scrapers to be paused
            if process is not None:
                process.cleanup()
--------------------------------------------------------------------------------
/client.py:
--------------------------------------------------------------------------------
from multiprocessing.dummy import Process
from libs.config import Config
import logging


class Client(Process):
    """Request handler

    Receives a request from the scraper server and starts a new threaded
    process"""
    def __init__(self):
        super(Client, self).__init__()
        self.request_provider = None
        self.request_cleanup = None
        self.request = None
        self.providerdata = None
        cfg = Config()
        lcfg = '{}.client'.format(cfg.g('logger.base'))
        self.logger = logging.getLogger(lcfg)

    def set_request_provider(self, request_provider):
        """request handler,

        request handlers process the request and start scraping"""
        self.request_provider = request_provider
        return self

    def set_request_cleanup(self, request_cleanup):
        """request_cleanup function"""
        self.request_cleanup = request_cleanup
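        # (Typical wiring, sketched with hypothetical names:
        #      server_mp.Server()
        #          .set_client_provider(lambda: Client()
        #              .set_request_provider(MyRequest))
        #          .set_provider(my_queue).start()
        #  -- Server then calls set_provider_data() and start() on each
        #  Client it creates for a queued item.)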
return self 31 | 32 | def set_provider_data(self, providerdata): 33 | """set data 34 | 35 | :providerdata: @todo 36 | :returns: @todo 37 | 38 | """ 39 | self.providerdata = providerdata 40 | return self 41 | 42 | def set_daemon(self, mode): 43 | """should be daemon or not""" 44 | self.daemon = mode 45 | return self 46 | 47 | def run(self): 48 | request = self.request_provider(self.providerdata) 49 | request.setup_repo() 50 | request.start() 51 | return self 52 | 53 | def cleanup(self): 54 | """cleans up when exiting 55 | 56 | :returns: @todo 57 | """ 58 | self.logger.info("cleaning up") 59 | self.request_cleanup() 60 | -------------------------------------------------------------------------------- /modulewatcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Author: Chris Eberle 3 | # Watch for any changes in a module or package, and reload it automatically 4 | 5 | import pyinotify 6 | import imp 7 | import os 8 | 9 | class ModuleWatcher(pyinotify.ProcessEvent): 10 | """ 11 | Automatically reload any modules or packages as they change 12 | """ 13 | 14 | def __init__(self): 15 | "El constructor" 16 | 17 | self.wm = pyinotify.WatchManager() 18 | self.notifier = None 19 | self.mod_map = {} 20 | 21 | def _watch_file(self, file_name, module): 22 | "Add a watch for a specific file, and map said file to a module name" 23 | 24 | file_name = os.path.realpath(file_name) 25 | self.mod_map[file_name] = module 26 | self.wm.add_watch(file_name, pyinotify.IN_MODIFY) 27 | #print 'Watching', file_name 28 | 29 | def watch_module(self, name): 30 | "Load a module, determine which files it uses, and watch them" 31 | 32 | if imp.is_builtin(name) != 0: 33 | # Pretty pointless to watch built-in modules 34 | return 35 | 36 | (fd, pathname, description) = imp.find_module(name) 37 | 38 | try: 39 | mod = imp.load_module(name, fd, pathname, description) 40 | if fd: 41 | self._watch_file(fd.name, name) 42 | else: 43 | for root, dirs, files in os.walk(pathname): 44 | for filename in files: 45 | fpath = os.path.join(root, filename) 46 | if fpath.endswith('.py'): 47 | self._watch_file(fpath, name) 48 | finally: 49 | if fd: 50 | fd.close() 51 | 52 | def start_watching(self): 53 | "Start the pyinotify watch thread" 54 | 55 | if self.notifier is None: 56 | self.notifier = pyinotify.ThreadedNotifier(self.wm, self) 57 | self.notifier.start() 58 | 59 | def stop_watching(self): 60 | "Stop the pyinotify watch thread" 61 | 62 | if self.notifier is not None: 63 | self.notifier.stop() 64 | 65 | def process_IN_MODIFY(self, event): 66 | "A file of interest has changed" 67 | 68 | # Is it a file I know about? 
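        # (imp is deprecated and removed on modern Python; an equivalent
        #  reload step, sketched for a module that was already imported:
        #      import importlib, sys
        #      importlib.reload(sys.modules[modname])
        #  )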
69 | if event.path not in self.mod_map: 70 | return 71 | 72 | # Find out which module is using that file 73 | modname = self.mod_map[event.path] 74 | 75 | # Reload the module 76 | (fd, pathname, description) = imp.find_module(modname) 77 | try: 78 | print ('reloading module') 79 | imp.load_module(modname, fd, pathname, description) 80 | finally: 81 | if fd: 82 | fd.close() 83 | 84 | #print 'Reload', modname 85 | 86 | if __name__ == '__main__': 87 | # Test everything 88 | 89 | import sys 90 | 91 | mw = ModuleWatcher() 92 | mw.watch_module('module1') 93 | mw.watch_module('module2') 94 | mw.start_watching() 95 | 96 | try: 97 | raw_input('Press ENTER to exit') 98 | finally: 99 | mw.stop_watching() 100 | sys.exit(0) 101 | -------------------------------------------------------------------------------- /dbbase.py: -------------------------------------------------------------------------------- 1 | """ 2 | base database stuffs 3 | """ 4 | import logging 5 | from libs.config import Config 6 | 7 | 8 | class DBBase(object): 9 | """base database object""" 10 | 11 | cfg = None 12 | logger = None 13 | 14 | def __init__(self): 15 | """ 16 | initiate common requirements 17 | 18 | """ 19 | self.dbc = None 20 | DBBase.cfg = Config() 21 | txt = '{}.dbbase'.format(DBBase.cfg.g('logger.base')) 22 | DBBase.logger = logging.getLogger(txt) 23 | 24 | def requires_commit(self, _query): 25 | """check if query is either insert/update/delete/truncate 26 | 27 | """ 28 | query = _query.lower().strip() 29 | insert = query.startswith('insert') 30 | update = query.startswith('update') 31 | delete = query.startswith('delete') 32 | truncate = query.startswith('truncate') 33 | return insert or update or delete or truncate 34 | 35 | def should_commit(self, _query, conn=None): 36 | """ 37 | determine if the query needs to be committed 38 | """ 39 | if self.requires_commit(_query): 40 | if conn != None: 41 | conn.commit() 42 | else: 43 | self.dbc.commit() 44 | 45 | def do_query(self, qtpl, data, conn=None): 46 | """execute query 47 | 48 | :qtpl: @todo 49 | :data: @todo 50 | :returns: @todo 51 | 52 | """ 53 | if conn != None: 54 | cur = conn.cursor() 55 | else: 56 | cur = self.dbc.cursor() 57 | cur.execute(qtpl, data) 58 | self.should_commit(qtpl, conn=conn) 59 | return cur 60 | 61 | def make_condition(self, cond, col, col_name): 62 | """method signature 63 | 64 | :cond: @todo 65 | :col: @todo 66 | :col_name: @todo 67 | :returns: @todo 68 | 69 | """ 70 | raise NotImplementedError() 71 | 72 | def safe_query(self, querytpl, data, conn=None, retries=0): 73 | """method signature 74 | 75 | :querytpl: @todo 76 | :data: @todo 77 | :returns: @todo 78 | 79 | """ 80 | raise NotImplementedError() 81 | 82 | def query(self, query): 83 | """method signature 84 | 85 | :querytpl: @todo 86 | :returns: @todo 87 | 88 | """ 89 | raise NotImplementedError() 90 | 91 | def select(self, table, data=None, cols='*', at_end=''): 92 | """Executes simple select query 93 | 94 | :table: name of the table 95 | :data: [col|cond|val, ...] 
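               e.g. ['name||gmail.com', 'si|or|2'] builds the conditions
               "name=<name_0> or si=<si_1>" (the first item's cond is left
               empty; the placeholder syntax comes from the driver's
               make_condition)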
96 | :cols: name of the columns 97 | :at_end: if we want order/limit/group 98 | :returns: cursor 99 | 100 | """ 101 | if data == None: 102 | querytpl = 'select %s from %s %s' % (cols, table, at_end) 103 | return self.safe_query(querytpl, data) 104 | conds = [] 105 | fdata = {} 106 | for k, item in enumerate(data): 107 | try: 108 | col, cond, val = item.split('|', 3) 109 | except ValueError: 110 | breaks = item.split('|') 111 | col = breaks[0] 112 | cond = breaks[1] 113 | val = '|'.join(breaks[2:]) 114 | col_name = '%s_%s' % (col, k) 115 | fdata[col_name] = val 116 | conds.append(self.make_condition(cond, col, col_name)) 117 | querytpl = 'select %s from %s where %s %s' % (cols, table, 118 | ' '.join(conds), at_end) 119 | return self.safe_query(querytpl, fdata) 120 | 121 | def _query(self, query, conn=None): 122 | """runs query 123 | 124 | :query: @todo 125 | :returns: @todo 126 | 127 | """ 128 | if conn != None: 129 | cur = conn.cursor() 130 | else: 131 | cur = self.dbc.cursor() 132 | cur.execute(query) 133 | self.should_commit(query, conn=conn) 134 | return cur 135 | 136 | def count_rows(self, query): 137 | """ 138 | counts row using given query 139 | """ 140 | res = self.query(query) 141 | result = res.fetchone() 142 | return result[0] 143 | -------------------------------------------------------------------------------- /testing.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import libs.downloader as downloader 3 | import libs.utils as utils 4 | 5 | 6 | CODES = { 7 | '200': [True, 'OK'], 8 | '201': [True, 'Created'], 9 | '202': [True, 'Accepted'], 10 | '203': [True, 'Non-Authoritative Information'], 11 | '204': [True, 'No Content'], 12 | '205': [True, 'Reset Content'], 13 | '206': [True, 'Partial Content'], 14 | '300': [True, 'Multiple Choices'], 15 | '301': [True, 'Moved Permanently'], 16 | '302': [True, 'Found'], 17 | '303': [True, 'See Other'], 18 | '304': [True, 'Not Modified'], 19 | '305': [True, 'Use Proxy'], 20 | '306': [True, 'Unused'], 21 | '307': [True, 'Temporary Redirect'], 22 | '308': [True, 'Permanent Redirect'], 23 | '400': [False, 'Bad Request'], 24 | '401': [False, 'Unauthorized'], 25 | '402': [False, 'Payment Required'], 26 | '403': [False, 'Forbidden'], 27 | '404': [False, 'Not Found'], 28 | '405': [False, 'Method Not Allowed'], 29 | '406': [False, 'Not Acceptable'], 30 | '407': [False, 'Proxy Authentication Required'], 31 | '408': [False, 'Request Timeout'], 32 | '409': [False, 'Conflict'], 33 | '410': [False, 'Gone'], 34 | '411': [False, 'Length Required'], 35 | '412': [False, 'Precondition Required'], 36 | '413': [False, 'Request Entry Too Large'], 37 | '414': [False, 'Request-URI Too Long'], 38 | '415': [False, 'Unsupported Media Type'], 39 | '416': [False, 'Requested Range Not Satisfiable'], 40 | '417': [False, 'Expectation Failed'], 41 | '418': [False, "I'm a teapot"], 42 | '422': [False, 'Unprocessable Entity'], 43 | '428': [False, 'Precondition Required'], 44 | '429': [False, 'Too Many Requests'], 45 | '431': [False, 'Request Header Fields Too Large'], 46 | '451': [False, 'Unavailable For Legal Reasons'], 47 | '500': [False, 'Internal Server Error'], 48 | '501': [False, 'Not Implemented'], 49 | '502': [False, 'Bad Gateway'], 50 | '503': [False, 'Service Unavailable'], 51 | '504': [False, 'Gateway Timeout'], 52 | '505': [False, 'HTTP Version Not Supported'], 53 | '511': [False, 'Network Authentication Required'], 54 | '520': [False, 'Web server is returning an unknown error'], 55 | '522': [False, 
'Connection timed out'], 56 | '524': [False, 'A timeout occurred'], 57 | } 58 | 59 | 60 | class TestDownloaderBasics(unittest.TestCase): 61 | """docstring for TestDownloaderBasics""" 62 | def test_200(self): 63 | """test 200 status code""" 64 | dlm = downloader.BaseDownloader() 65 | self.assertTrue(dlm.download('http://httpstat.us/200')) 66 | self.assertEqual(dlm.status_code, 200) 67 | 68 | def test_301(self): 69 | """redirection 70 | :returns: @todo 71 | 72 | """ 73 | dlm = downloader.BaseDownloader() 74 | self.assertTrue(dlm.download('http://httpstat.us/301')) 75 | self.assertEqual(200, dlm.status_code) 76 | self.assertEqual(dlm.last_url, 'http://httpstat.us') 77 | 78 | def test_404(self): 79 | """handling errors""" 80 | dlm = downloader.BaseDownloader() 81 | self.assertFalse(dlm.download('http://httpstat.us/404')) 82 | self.assertEqual(404, dlm.status_code) 83 | 84 | def test_404_web(self): 85 | """handling errors""" 86 | dlm = downloader.BaseDownloader() 87 | self.assertFalse(dlm.download('http://192.155.84.35/scraper/sd')) 88 | self.assertEqual(404, dlm.status_code) 89 | 90 | def test_timeout_fail(self): 91 | """handling errors""" 92 | dlm = downloader.BaseDownloader() 93 | dlm.timeout = 1 94 | self.assertFalse(dlm.download('http://httpstat.us/524')) 95 | self.assertEqual(524, dlm.status_code) 96 | 97 | def test_all_codes(self): 98 | """test with all possible status codes""" 99 | dlm = downloader.BaseDownloader() 100 | for code in CODES: 101 | info = CODES[code] 102 | url = 'http://httpstat.us/%s' % code 103 | self.assertEqual(info[0], dlm.download(url)) 104 | if int(code) >= 400: 105 | self.assertEqual(int(code), dlm.status_code) 106 | 107 | def test_cached_downloader(self): 108 | """@todo: Docstring for test_cached_downloader. 109 | :returns: @todo 110 | 111 | """ 112 | url = 'http://example.com/' 113 | filename = utils.hash(url) 114 | fullpath = utils.file_cached_path(filename, url) 115 | dlm = downloader.CachedDownloader() 116 | dlm.download(url) 117 | import os 118 | self.assertTrue(os.path.exists(fullpath), "cache path exists") 119 | 120 | 121 | def main(): 122 | unittest.main() 123 | -------------------------------------------------------------------------------- /pages.py: -------------------------------------------------------------------------------- 1 | from lxml.etree import XMLSyntaxError 2 | from lxml.html.clean import Cleaner 3 | from lxml import html 4 | try: 5 | import libs.utils as utils 6 | except ImportError: 7 | import utils 8 | 9 | 10 | def clean_dom(dom): 11 | """get rids of script, style and comments""" 12 | cleaner = Cleaner() 13 | cleaner.script = True 14 | cleaner.style = True 15 | cleaner.comments = True 16 | return cleaner.clean_html(dom) 17 | 18 | 19 | def load_dom(content, remove_br): 20 | """loads the content 21 | 22 | :content: html 23 | :remove_br: should remove
<br>
tags? 24 | :returns: dom 25 | 26 | """ 27 | if remove_br: 28 | content = utils.remove_br(content) 29 | dom = html.fromstring(content) 30 | return Dom(dom) 31 | 32 | 33 | class BasePage(object): 34 | """result of downloads are stored here""" 35 | def __init__(self): 36 | super(BasePage, self).__init__() 37 | self.url = None 38 | self.post = None 39 | self.state = False 40 | self.load_time = None 41 | 42 | def set_url(self, url): 43 | """set url 44 | 45 | :url: @todo 46 | :returns: @todo 47 | 48 | """ 49 | self.url = url 50 | return self 51 | 52 | def set_post(self, post): 53 | """set post 54 | 55 | :url: @todo 56 | :returns: @todo 57 | 58 | """ 59 | self.post = post 60 | return self 61 | 62 | def set_load_time(self, load_time): 63 | """sets time took to load the page""" 64 | self.load_time = load_time 65 | return self 66 | 67 | 68 | class DownloadedPage(BasePage): 69 | """store page Information""" 70 | def __init__(self): 71 | super(DownloadedPage, self).__init__() 72 | self.url = None 73 | self.post = None 74 | self.redirected_to = None 75 | self.status_code = None 76 | self.text = None 77 | self.raw_text = None 78 | self.dom = None 79 | 80 | def get_dom(self, remove_br=False): 81 | """returns dom""" 82 | content = self.text 83 | tried_non_unicode = False 84 | while True: 85 | try: 86 | return load_dom(content, remove_br) 87 | except ValueError: 88 | if tried_non_unicode is True: 89 | break 90 | tried_non_unicode = True 91 | content = self.raw_text 92 | except XMLSyntaxError: 93 | break 94 | return None 95 | 96 | def set_redirected_to_url(self, redirected_to): 97 | """set last url set in response, is useful for redirected webpages""" 98 | self.redirected_to = redirected_to 99 | return self 100 | 101 | def set_status_code(self, status_code): 102 | """sets status code 103 | 104 | :status_code: @todo 105 | :returns: @todo 106 | 107 | """ 108 | self.status_code = status_code 109 | self.state = self.status_code < 400 110 | return self 111 | 112 | def set_state(self, state): 113 | """set state 114 | 115 | :state: @todo 116 | :returns: @todo 117 | 118 | """ 119 | self.state = state 120 | return self 121 | 122 | def set_text(self, text, raw_text=None): 123 | """set text values 124 | 125 | :text: @todo 126 | :returns: @todo 127 | 128 | """ 129 | self.text = text 130 | self.raw_text = raw_text 131 | return self 132 | 133 | 134 | class Dom(object): 135 | """dom helper, 136 | 137 | incase we have to switch to beautifulsoup parser 138 | """ 139 | 140 | def __init__(self, dom): 141 | super(Dom, self).__init__() 142 | self.dom = dom 143 | 144 | def first(self, xpath): 145 | """gets the first element from the result""" 146 | elist = self.xpath(xpath) 147 | try: 148 | return elist[0] 149 | except IndexError: 150 | return None 151 | 152 | def attr(self, xpath, attr): 153 | """get [attr] of element at [index] from the result""" 154 | elm = self.first(xpath) 155 | try: 156 | return elm.attrib[attr] 157 | except (KeyError, IndexError, AttributeError): 158 | return None 159 | 160 | def text(self, xpath, index=0): 161 | """get text of element at [index] from the result""" 162 | elist = self.xpath(xpath) 163 | try: 164 | return elist[index].text_content() 165 | except IndexError: 166 | return None 167 | 168 | def xpath(self, xpath): 169 | """use xpath 170 | 171 | :xpath: @todo 172 | :returns: @todo 173 | 174 | """ 175 | return self.dom.xpath(xpath) 176 | 177 | def make_links_absolute(self, link): 178 | """calls make_links_absolute 179 | :returns: @todo 180 | 181 | """ 182 | self.dom.make_links_absolute(link) 183 
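# A minimal usage sketch of the helpers above (the sample HTML and the
# expected values are illustrative):
if __name__ == '__main__':
    page = DownloadedPage().set_text(
        '<html><body><h1>Hi</h1><p><a href="/x">go</a></p></body></html>')
    dom = page.get_dom()
    print(dom.text('//h1'))         # 'Hi'
    print(dom.attr('//a', 'href'))  # '/x'
    dom.make_links_absolute('http://example.com/')
    print(dom.attr('//a', 'href'))  # 'http://example.com/x'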
| -------------------------------------------------------------------------------- /test_downloader.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from downloader import BaseDownloader, CachedDownloader 3 | from downloader import curl_factory 4 | try: 5 | import libs.utils as utils 6 | import libs.pages as pages 7 | except ImportError: 8 | import utils 9 | import pages 10 | 11 | 12 | class TestDownloaderBasics(unittest.TestCase): 13 | """docstring for TestDownloaderBasics""" 14 | def setUp(self): 15 | """clear cache folder 16 | """ 17 | try: 18 | utils.delete_folder_content('cache/example.com') 19 | except Exception: 20 | pass 21 | 22 | def test_200(self): 23 | """test 200 status code""" 24 | dlm = BaseDownloader() 25 | dlm.download_with = USE_DOWNLOADER 26 | page = pages.DownloadedPage().set_url('http://httpstat.us/200') 27 | dlm.download(page) 28 | self.assertTrue(page.state) 29 | self.assertEqual(page.status_code, 200) 30 | 31 | def test_301(self): 32 | """redirection 33 | :returns: @todo 34 | 35 | """ 36 | dlm = BaseDownloader() 37 | dlm.download_with = USE_DOWNLOADER 38 | page = pages.DownloadedPage().set_url('http://httpstat.us/301') 39 | dlm.download(page) 40 | self.assertTrue(page.state) 41 | self.assertEqual(200, page.status_code) 42 | self.assertEqual(page.last_url, 'http://httpstat.us') 43 | 44 | def test_404(self): 45 | """handling errors""" 46 | dlm = BaseDownloader() 47 | dlm.download_with = USE_DOWNLOADER 48 | page = pages.DownloadedPage().set_url('http://httpstat.us/404') 49 | dlm.download(page) 50 | self.assertFalse(page.state) 51 | self.assertEqual(404, page.status_code) 52 | 53 | def test_404_web(self): 54 | """handling errors""" 55 | dlm = BaseDownloader() 56 | dlm.download_with = USE_DOWNLOADER 57 | page = pages.DownloadedPage().set_url('http://192.155.84.35/scraper/sd') 58 | dlm.download(page) 59 | self.assertFalse(page.state) 60 | self.assertEqual(404, page.status_code) 61 | 62 | def test_timeout_fail(self): 63 | """handling errors""" 64 | dlm = BaseDownloader() 65 | dlm.download_with = USE_DOWNLOADER 66 | dlm.timeout = 1 67 | page = pages.DownloadedPage().set_url('http://httpstat.us/524') 68 | dlm.download(page) 69 | self.assertFalse(page.state) 70 | self.assertEqual(524, page.status_code) 71 | 72 | def test_all_codes(self): 73 | """test with all possible status codes""" 74 | dlm = BaseDownloader() 75 | dlm.download_with = USE_DOWNLOADER 76 | for code in CODES: 77 | info = CODES[code] 78 | url = 'http://httpstat.us/%s' % code 79 | page = pages.DownloadedPage().set_url(url) 80 | dlm.download(page) 81 | self.assertEqual(info[0], page.state) 82 | if int(code) >= 400: 83 | self.assertEqual(int(code), page.status_code) 84 | 85 | def test_dom(self): 86 | """test dom parsing and querying 87 | :returns: @todo 88 | 89 | """ 90 | dlm = BaseDownloader() 91 | dlm.download_with = USE_DOWNLOADER 92 | page = pages.DownloadedPage().set_url('http://example.com') 93 | dlm.download(page) 94 | dom = page.get_dom() 95 | result = dom.xpath('//h1') 96 | self.assertEqual(1, len(result)) 97 | self.assertEqual('Example Domain', result[0].text_content().strip()) 98 | self.assertEqual('More information...', dom.text('//a')) 99 | self.assertEqual('Example Domain', dom.first('//h1').text_content()) 100 | self.assertEqual('More information...', dom.text('//p', 1)) 101 | self.assertEqual("http://www.iana.org/domains/example", 102 | dom.attr('//a', 'href')) 103 | 104 | def test_cached_page(self): 105 | """test run cached page class""" 
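        # (First call downloads and writes the cache file derived from the
        #  url via utils.get_cache_full_path(); reruns are then served from
        #  cache -- setUp() clears cache/example.com to keep this test fresh.)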
106 | dlm = CachedDownloader() 107 | dlm.download_with = USE_DOWNLOADER 108 | page = pages.DownloadedPage().set_url('http://example.com') 109 | dlm.download(page) 110 | dom = page.get_dom() 111 | result = dom.xpath('//h1') 112 | self.assertEqual(1, len(result)) 113 | self.assertEqual('Example Domain', result[0].text_content().strip()) 114 | self.assertEqual('More information...', dom.text('//a')) 115 | self.assertEqual('Example Domain', dom.first('//h1').text_content()) 116 | self.assertEqual('More information...', dom.text('//p', 1)) 117 | self.assertEqual("http://www.iana.org/domains/example", 118 | dom.attr('//a', 'href')) 119 | 120 | def test_broken_html(self): 121 | """test on how to handle broken html files""" 122 | broken_html = """Hello</head><body onload=crash()> 123 | Hi all<p><a href="google.com">google</a>""" 124 | page = pages.DownloadedPage().set_text(broken_html) 125 | dom = page.get_dom() 126 | self.assertEqual(dom.first('//title').text_content(), 'Hello') 127 | self.assertEqual(dom.attr('//a', 'href'), 'google.com') 128 | self.assertEqual(dom.text('//a'), 'google') 129 | 130 | # 131 | # {{{ 132 | CODES = { 133 | '200': [True, 'OK'], 134 | '201': [True, 'Created'], 135 | '202': [True, 'Accepted'], 136 | '203': [True, 'Non-Authoritative Information'], 137 | '204': [True, 'No Content'], 138 | '205': [True, 'Reset Content'], 139 | '206': [True, 'Partial Content'], 140 | '300': [True, 'Multiple Choices'], 141 | '301': [True, 'Moved Permanently'], 142 | '302': [True, 'Found'], 143 | '303': [True, 'See Other'], 144 | '304': [True, 'Not Modified'], 145 | '305': [True, 'Use Proxy'], 146 | '306': [True, 'Unused'], 147 | '307': [True, 'Temporary Redirect'], 148 | '308': [True, 'Permanent Redirect'], 149 | '400': [False, 'Bad Request'], 150 | '401': [False, 'Unauthorized'], 151 | '402': [False, 'Payment Required'], 152 | '403': [False, 'Forbidden'], 153 | '404': [False, 'Not Found'], 154 | '405': [False, 'Method Not Allowed'], 155 | '406': [False, 'Not Acceptable'], 156 | '407': [False, 'Proxy Authentication Required'], 157 | '408': [False, 'Request Timeout'], 158 | '409': [False, 'Conflict'], 159 | '410': [False, 'Gone'], 160 | '411': [False, 'Length Required'], 161 | '412': [False, 'Precondition Required'], 162 | '413': [False, 'Request Entry Too Large'], 163 | '414': [False, 'Request-URI Too Long'], 164 | '415': [False, 'Unsupported Media Type'], 165 | '416': [False, 'Requested Range Not Satisfiable'], 166 | '417': [False, 'Expectation Failed'], 167 | '418': [False, "I'm a teapot"], 168 | '422': [False, 'Unprocessable Entity'], 169 | '428': [False, 'Precondition Required'], 170 | '429': [False, 'Too Many Requests'], 171 | '431': [False, 'Request Header Fields Too Large'], 172 | '451': [False, 'Unavailable For Legal Reasons'], 173 | '500': [False, 'Internal Server Error'], 174 | '501': [False, 'Not Implemented'], 175 | '502': [False, 'Bad Gateway'], 176 | '503': [False, 'Service Unavailable'], 177 | '504': [False, 'Gateway Timeout'], 178 | '505': [False, 'HTTP Version Not Supported'], 179 | '511': [False, 'Network Authentication Required'], 180 | '520': [False, 'Web server is returning an unknown error'], 181 | '522': [False, 'Connection timed out'], 182 | '524': [False, 'A timeout occurred'], 183 | } 184 | # }}} 185 | # 186 | 187 | 188 | def main(): 189 | """entry point""" 190 | logger = utils.setup_logger() 191 | logger.info('### start testing ###') 192 | unittest.main() 193 | 194 | 195 | if __name__ == '__main__': 196 | USE_DOWNLOADER = curl_factory 197 | # USE_DOWNLOADER = 
request_factory 198 | main() 199 | -------------------------------------------------------------------------------- /mysql.py: -------------------------------------------------------------------------------- 1 | try: 2 | from libs.config import Config 3 | from libs.dbbase import DBBase 4 | except ImportError: 5 | # pylint: disable=relative-import 6 | from config import Config 7 | from dbbase import DBBase 8 | import MySQLdb 9 | import logging 10 | import unittest 11 | 12 | 13 | def make_columns(data): 14 | """make columns for data 15 | 16 | :data: @todo 17 | :returns: @todo 18 | 19 | """ 20 | return ', '.join(['%%(%s)s' % key for key in data.keys()]) 21 | 22 | 23 | def dict_factory(cursor, row): 24 | """ 25 | dict factory for mysql row 26 | """ 27 | dest = {} 28 | for idx, col in enumerate(cursor.description): 29 | dest[col[0]] = row[idx] 30 | return dest 31 | 32 | 33 | class MySQL(DBBase): 34 | """ 35 | MySQL driver 36 | """ 37 | 38 | cfg = None 39 | logger = None 40 | 41 | """ stores data in a MySQL table """ 42 | def __init__(self): 43 | super(MySQL, self).__init__() 44 | MySQL.cfg = Config() 45 | txt = '{}.mysql'.format(MySQL.cfg.g('logger.base')) 46 | MySQL.logger = logging.getLogger(txt) 47 | self.prep_char = '?' 48 | self.dbc = None 49 | self.lastid = None 50 | self.dbhost = MySQL.cfg.g('db.mysql.host') 51 | self.user = MySQL.cfg.g('db.mysql.user') 52 | self.pswd = MySQL.cfg.g('db.mysql.pass') 53 | self.dbname = MySQL.cfg.g('db.mysql.database') 54 | self.connect() 55 | 56 | def connect(self): 57 | """ 58 | connects to database 59 | """ 60 | try: 61 | self.dbc.close() 62 | except AttributeError: 63 | pass 64 | self.dbc = MySQLdb.connect(self.dbhost, self.user, 65 | self.pswd, self.dbname, charset='utf8', 66 | use_unicode=True) 67 | self.dbc.set_character_set('utf8') 68 | dbc = self.dbc.cursor() 69 | dbc.execute('SET NAMES utf8;') 70 | dbc.execute('SET CHARACTER SET utf8;') 71 | dbc.execute('SET character_set_connection=utf8;') 72 | 73 | def close(self): 74 | """ 75 | closes the database, don't use it, 76 | close database directly by self.dbc.close() 77 | """ 78 | self.dbc.close() 79 | 80 | def clear_database(self, table): 81 | """ 82 | clears given table 83 | """ 84 | self.query("delete from %s" % table) 85 | 86 | def safe_query(self, qtpl, data): 87 | """Executed binding query 88 | ex: select * from table where q=:s, d=:k 89 | 90 | :query: @todo 91 | :data: @todo 92 | :returns: @todo 93 | 94 | """ 95 | retries = 0 96 | while True: 97 | try: 98 | return self.do_query(qtpl, data) 99 | except MySQLdb.MySQLError as err: 100 | if err[0] == 1062: 101 | return -2 102 | self.connect() 103 | retries += 1 104 | if retries > 5: 105 | MySQL.logger.exception('Failed to execute query') 106 | return None 107 | 108 | def make_condition(self, cond, col, col_name): 109 | """builds appropiate query 110 | 111 | :cond: @todo 112 | :col: @todo 113 | :col: @todo 114 | :returns: @todo 115 | 116 | """ 117 | return '%s %s=%%(%s)s' % (cond, col, col_name) 118 | 119 | def query(self, query): 120 | """ 121 | Runs a query in unsafe way 122 | """ 123 | try: 124 | return self._query(query) 125 | except MySQLdb.OperationalError: 126 | return None 127 | 128 | def append_data(self, data, table, pkey=None): 129 | """ 130 | adds row to database 131 | """ 132 | qfields = make_columns(data) 133 | cols = ', '.join(data.keys()) 134 | query = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, qfields) 135 | return self.execute_query(data, query) 136 | 137 | def append_all_data(self, data, table): 138 | """adds multiple rows, 
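        (each row is a dict with the same keys as data[0])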
139 | 140 | tries in single query first 141 | uses multiple queries if fails 142 | """ 143 | qfields = make_columns(data[0]) 144 | cols = ', '.join(data[0].keys()) 145 | query = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, qfields) 146 | state = self.execute_query(data, query, True) 147 | if state == -2: 148 | for row in data: 149 | self.append_data(row, table) 150 | else: 151 | return state 152 | return True 153 | 154 | def execute_query(self, data, query, many=False): 155 | """execute query 156 | 157 | :data: @todo 158 | :table: @todo 159 | :many: @todo 160 | :returns: @todo 161 | 162 | """ 163 | # pylint: disable=broad-except, no-member 164 | retries = 0 165 | cur = None 166 | try: 167 | while True: 168 | try: 169 | cur = self.dbc.cursor() 170 | if many: 171 | status = cur.executemany(query, data) 172 | else: 173 | status = cur.execute(query, data) 174 | try: 175 | self.lastid = cur.insert_id() 176 | except AttributeError: 177 | self.lastid = cur.lastrowid 178 | except Exception: 179 | self.lastid = cur.lastrowid 180 | MySQL.logger.exception("ignorable") 181 | self.dbc.commit() 182 | return status 183 | except MySQLdb.MySQLError as err: 184 | if err[0] == 1062: 185 | return -2 186 | MySQL.logger.exception(err) 187 | MySQL.logger.info('reconnecting ... ') 188 | self.connect() 189 | retries += 1 190 | if retries > 5: 191 | MySQL.logger.exception('Failed to execute query') 192 | return None 193 | continue 194 | except Exception as exp: 195 | MySQL.logger.exception('failed inserting data') 196 | self.lastid = None 197 | raise exp 198 | finally: 199 | if cur: 200 | cur.close() 201 | 202 | 203 | class TestMySQL(unittest.TestCase): 204 | """docstring for TestMySQL""" 205 | 206 | def test_inserts(self): 207 | """test insert queries 208 | :returns: @todo 209 | 210 | """ 211 | dbc.append_data({'name': 'gmail.com', 'si': 10}, 'tests') 212 | dbc.append_data({'name': 'inbox.com', 'si': 12}, 'tests') 213 | dbc.append_data({'name': 'reddit.com', 'si': 1}, 'tests') 214 | dbc.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 215 | dbc.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 216 | dbc.query('insert into tests (name, si) values("google.com", 10)') 217 | self.assertEqual(1, 1) 218 | 219 | def test_queries(self): 220 | """test select queries 221 | :returns: @todo 222 | 223 | """ 224 | result = dbc.select('tests', ['name||sgmail.com']) 225 | self.assertEqual(0, len(result.fetchall())) 226 | result = dbc.select('tests', ['name||gmail.com']) 227 | self.assertEqual(1, len(result.fetchall())) 228 | result = dbc.select('tests', ['si||2', 'si|or|12']) 229 | self.assertEqual(3, len(result.fetchall())) 230 | result = dbc.select('tests', ['name||gmail.com', 'name|or|inbox.com']) 231 | self.assertEqual(2, len(result.fetchall())) 232 | result = dbc.select('tests', ['name||reddit.com'], 'count(*)') 233 | self.assertEqual(3, result.fetchone()[0]) 234 | result = dbc.select('tests', at_end='order by si') 235 | result = dbc.select('tests', ['name||reddit.com'], 'count(*)', 236 | at_end='group by si') 237 | 238 | 239 | def main(): 240 | """ 241 | do some tests 242 | """ 243 | try: 244 | dbc.query('drop table if exists tests') 245 | dbc.query('create table tests(name varchar(20), si integer)') 246 | # pylint: disable=no-member 247 | except MySQLdb.OperationalError: 248 | pass 249 | unittest.main() 250 | 251 | 252 | if __name__ == '__main__': 253 | dbc = MySQL() 254 | main() 255 | -------------------------------------------------------------------------------- /pgsql.py: 
-------------------------------------------------------------------------------- 1 | try: 2 | from libs.config import Config 3 | from libs.dbbase import DBBase 4 | except ImportError: 5 | # pylint: disable=relative-import 6 | from config import Config 7 | from dbbase import DBBase 8 | import psycopg2 9 | import logging 10 | import unittest 11 | 12 | 13 | def make_columns(data): 14 | """make columns for data 15 | 16 | :data: dictonary containing column name (key) and value (not used) 17 | :returns: @todo 18 | 19 | """ 20 | return ', '.join(['%%(%s)s' % key for key in data.keys()]) 21 | 22 | 23 | class PGSql(DBBase): 24 | """ stores data in a PGSql table """ 25 | 26 | cfg = None 27 | logger = None 28 | 29 | def __init__(self): 30 | super(PGSql, self).__init__() 31 | PGSql.cfg = Config() 32 | txt = '{}.pgsql'.format(PGSql.cfg.g('logger.base')) 33 | PGSql.logger = logging.getLogger(txt) 34 | self.prep_char = '?' 35 | self.lastid = None 36 | self.dbhost = PGSql.cfg.g('db.pgsql.host') 37 | self.user = PGSql.cfg.g('db.pgsql.user') 38 | self.pswd = PGSql.cfg.g('db.pgsql.pass') 39 | self.dbname = PGSql.cfg.g('db.pgsql.database') 40 | self.dbc = self.connect() 41 | 42 | def connect(self): 43 | """ 44 | connects to database 45 | """ 46 | try: 47 | return psycopg2.connect(host=self.dbhost, user=self.user, 48 | password=self.pswd, dbname=self.dbname) 49 | except AttributeError: 50 | pass 51 | 52 | # pylint: disable=no-self-use 53 | def close(self): 54 | """ 55 | closes the database, don't use it, 56 | close database directly by self.dbc.close() 57 | """ 58 | self.dbc.close() 59 | 60 | def clear_database(self, table): 61 | """ 62 | clears given table 63 | """ 64 | self.query("delete from %s" % table) 65 | self.dbc.commit() 66 | 67 | def make_condition(self, cond, col, col_name): 68 | """builds appropiate query 69 | 70 | :cond: @todo 71 | :col: @todo 72 | :col: @todo 73 | :returns: @todo 74 | 75 | """ 76 | return '%s %s=%%(%s)s' % (cond, col, col_name) 77 | 78 | def reconnect(self): 79 | """reconnects persistant connection 80 | :returns: @todo 81 | 82 | """ 83 | PGSql.logger.info("reconnecting") 84 | self.dbc.close() 85 | self.dbc = self.connect() 86 | 87 | def query(self, query): 88 | """Runs a query in unsafe way 89 | """ 90 | try: 91 | if self.requires_commit(query) is False: 92 | return self._query(query) 93 | with self.connect() as conn: 94 | return self._query(query, conn=conn) 95 | except psycopg2.Error: 96 | return None 97 | 98 | def safe_query(self, qtpl, data, conn=None, retries=0): 99 | """Executed binding query 100 | ex: select * from table where q=%s, d=%s 101 | 102 | :query: @todo 103 | :data: @todo 104 | :returns: @todo 105 | 106 | """ 107 | try: 108 | if self.requires_commit(qtpl) is False: 109 | return self.do_query(qtpl, data) 110 | with self.connect() as conn: 111 | return self.do_query(qtpl, data, conn=conn) 112 | except psycopg2.IntegrityError: 113 | self._query('rollback') 114 | PGSql.logger.debug("IntegrityError: %s", qtpl) 115 | return -2 116 | except (psycopg2.InterfaceError, psycopg2.OperationalError, 117 | psycopg2.DatabaseError): 118 | PGSql.logger.debug('closed, reconnecting') 119 | self.reconnect() 120 | retries += 1 121 | if retries > 5: 122 | PGSql.logger.exception("Failed to execute_query") 123 | return None 124 | self.safe_query(qtpl, data, conn, retries=retries) 125 | except psycopg2.Error: 126 | PGSql.logger.exception('Failed: %s', qtpl) 127 | return None 128 | 129 | def append_data(self, data, table, pkey='id'): 130 | """adds row to database 131 | 132 | :data: data to 
be saved 133 | :table: name of the table 134 | :pk: NEED to provide correct pk (primary key) column, to get last insert id 135 | """ 136 | qfields = make_columns(data) 137 | cols = ', '.join(data.keys()) 138 | query = "INSERT INTO %s (%s) VALUES (%s) RETURNING %s"\ 139 | % (table, cols, qfields, pkey) 140 | return self.execute_query(data, query) 141 | 142 | def append_all_data(self, data, table): 143 | """adds multiple rows, 144 | 145 | tries in single query first 146 | uses multiple queries if fails 147 | """ 148 | qfields = make_columns(data[0]) 149 | cols = ', '.join(data[0].keys()) 150 | query = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, qfields) 151 | state = self.execute_query(data, query, True) 152 | if state == -2 or state == -3: 153 | cnt = 0 154 | for row in data: 155 | if self.append_data(row, table): 156 | cnt += 1 157 | return cnt 158 | else: 159 | return state 160 | 161 | def execute_query(self, data, query, many=False): 162 | """execute query 163 | 164 | :data: data to be saved 165 | :table: name of the table 166 | :many: multiple rows to be inserted or not 167 | :returns: True or None 168 | 169 | """ 170 | with self.connect() as conn: 171 | cur = None 172 | try: 173 | cur = conn.cursor() 174 | if many: 175 | cur.executemany(query, data) 176 | else: 177 | cur.execute(query, data) 178 | self.lastid = cur.fetchone()[0] 179 | return True 180 | except psycopg2.IntegrityError as iexp: 181 | PGSql.logger.debug("duplicate %s %s", query, iexp) 182 | return -2 183 | except psycopg2.DataError as err: 184 | PGSql.logger.debug("data error %s, %s", query, err) 185 | return -3 186 | except psycopg2.Error: 187 | PGSql.logger.exception("%s %s", query, data) 188 | return None 189 | 190 | 191 | class TestSQLITE(unittest.TestCase): 192 | """docstring for TestSQLITE""" 193 | 194 | def test_inserts(self): 195 | """test insert queries 196 | :returns: @todo 197 | 198 | """ 199 | dbc.append_data({'name': 'gmail.com', 'si': 10}, 'tests') 200 | dbc.append_data({'name': 'inbox.com', 'si': 12}, 'tests') 201 | dbc.append_data({'name': 'reddit.com', 'si': 1}, 'tests') 202 | dbc.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 203 | dbc.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 204 | dbc.query('insert into tests (name, si) values("google.com", 10)') 205 | self.assertEqual(1, 1) 206 | 207 | def test_queries(self): 208 | """test select queries 209 | :returns: @todo 210 | 211 | """ 212 | # TODO: test for duplicate entries 213 | result = dbc.select('tests', ['name||sgmail.com']) 214 | self.assertEqual(0, len(result.fetchall())) 215 | result = dbc.select('tests', ['name||gmail.com']) 216 | self.assertEqual(1, len(result.fetchall())) 217 | result = dbc.select('tests', ['si||2', 'si|or|12']) 218 | self.assertEqual(3, len(result.fetchall())) 219 | result = dbc.select('tests', ['name||gmail.com', 'name|or|inbox.com']) 220 | self.assertEqual(2, len(result.fetchall())) 221 | result = dbc.select('tests', ['name||reddit.com'], 'count(*)') 222 | self.assertEqual(3, result.fetchone()[0]) 223 | result = dbc.select('tests', at_end='order by si') 224 | result = dbc.select('tests', ['name||reddit.com'], 'count(*)', 225 | at_end='group by si') 226 | 227 | 228 | def main(): 229 | """ 230 | do some tests 231 | """ 232 | try: 233 | dbc.query('drop table if exists tests') 234 | # NOTE: better to use CREATE SEQUENCE <table>_id_seq than serial 235 | dbc.query('create table tests(id SERIAL, name varchar(20), si integer)') 236 | except Exception: 237 | pass 238 | unittest.main() 239 | 240 | 241 | if __name__ 
== '__main__': 242 | dbc = PGSql() 243 | main() 244 | -------------------------------------------------------------------------------- /sqlite.py: -------------------------------------------------------------------------------- 1 | """ 2 | sqlite driver 3 | """ 4 | import sqlite3 as sqlite 5 | from libs.config import Config 6 | from libs.dbbase import DBBase 7 | import logging 8 | import unittest 9 | 10 | 11 | def dict_factory(cursor, row): 12 | """ 13 | conver row to dict 14 | """ 15 | data = {} 16 | for idx, col in enumerate(cursor.description): 17 | data[col[0]] = row[idx] 18 | return data 19 | 20 | 21 | def make_columns(data): 22 | """makes column for sqlite 23 | 24 | :data: @todo 25 | :returns: @todo 26 | 27 | """ 28 | return ', '.join([':%s' % key for key in data.keys()]) 29 | 30 | 31 | # pylint: disable=too-many-instance-attributes 32 | class SQLite(DBBase): 33 | """ stores data in a sqlite table """ 34 | 35 | cfg = None 36 | logger = None 37 | 38 | def __init__(self, dbname=None, lazy_commit=False): 39 | super(SQLite, self).__init__() 40 | SQLite.cfg = Config() 41 | txt = '{}.sqlite'.format(SQLite.cfg.g('logger.base')) 42 | SQLite.logger = logging.getLogger(txt) 43 | self.dbname = dbname if dbname != None else SQLite.cfg.g('db.sqlite.file') 44 | self.timeout = SQLite.cfg.g('db.sqlite.timeout') 45 | self.query_queued = 0 46 | self.lastid = None 47 | strd = SQLite.cfg.g('db.sqlite.same_thread', 0) 48 | if strd == 0: 49 | self.same_thread = False 50 | else: 51 | self.same_thread = True 52 | self.connect() 53 | self.set_lazy_commit(lazy_commit) 54 | 55 | def set_lazy_commit(self, val): 56 | """enables lazy_commit 57 | :returns: @todo 58 | 59 | """ 60 | self.lazy_commit = val 61 | if self.lazy_commit: 62 | self.commit_func = self.should_commit_lazy 63 | self.query_queued = 0 64 | else: 65 | self.commit_func = self.should_commit 66 | 67 | def connect(self): 68 | """ 69 | connects to db 70 | 71 | """ 72 | self.dbc = sqlite.connect(self.dbname, self.timeout, 73 | check_same_thread=self.same_thread) 74 | 75 | def use_dict(self): 76 | """ 77 | use dictionary for rows 78 | """ 79 | self.dbc.row_factory = dict_factory 80 | 81 | def use_tuple(self): 82 | """ 83 | use tuple for row 84 | """ 85 | self.dbc.row_factory = sqlite.Row 86 | 87 | def close(self): 88 | """ 89 | close database 90 | """ 91 | self.dbc.close() 92 | 93 | def clear_database(self, table): 94 | """ 95 | clear table 96 | """ 97 | self.query("delete from %s" % table) 98 | 99 | def safe_query(self, qtpl, data): 100 | """Executed binding query 101 | ex: select * from table where q=:s, d=:k 102 | 103 | :query: @todo 104 | :data: @todo 105 | :commit: @todo 106 | :returns: @todo 107 | 108 | """ 109 | try: 110 | return self.do_query(qtpl, data) 111 | except sqlite.OperationalError: 112 | SQLite.logger.exception("query failed %s", qtpl) 113 | return None 114 | 115 | def make_condition(self, cond, col, col_name): 116 | """@todo: Docstring for make_condition. 
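        builds one "<cond> <col>=:<col_name>" sql fragment for select(),
        e.g. make_condition('or', 'name', 'name_1') -> 'or name=:name_1'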
117 | 118 | :cond: @todo 119 | :col: @todo 120 | :returns: @todo 121 | 122 | """ 123 | return '%s %s=:%s' % (cond, col, col_name) 124 | 125 | def query(self, query): 126 | try: 127 | return self._query(query) 128 | except sqlite.OperationalError: 129 | SQLite.logger.exception("query failed %s", query) 130 | return None 131 | 132 | def count_rows(self, query): 133 | res = self.query(query) 134 | result = res.fetchone() 135 | return result[0] 136 | 137 | def append_data(self, data, table): 138 | """ 139 | add rows to database 140 | """ 141 | qfields = make_columns(data) 142 | cols = ', '.join(data.keys()) 143 | sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, qfields) 144 | return self.execute_query(data, sql) 145 | 146 | # pylint: disable=unused-argument 147 | def execute_query(self, data, query, many=False): 148 | """executes query 149 | 150 | :data: used data 151 | :query: query to execute 152 | :many: dummy param 153 | :returns: negative on error 154 | 155 | """ 156 | cur = None 157 | self.lastid = None 158 | try: 159 | cur = self.dbc.cursor() 160 | status = cur.execute(query, data) 161 | self.commit_func(query) 162 | try: 163 | self.lastid = cur.insert_id() 164 | except AttributeError: 165 | self.lastid = cur.lastrowid 166 | return status 167 | except sqlite.IntegrityError as sie: 168 | SQLite.logger.debug('IntegrityError: %s %s %s', sie, query, data) 169 | return -2 170 | except sqlite.DatabaseError as dbe: 171 | SQLite.logger.debug('DatabaseError: %s %s %s', dbe, query, data) 172 | return -4 173 | except sqlite.OperationalError as oie: 174 | SQLite.logger.debug('OperationalError %s', oie) 175 | return -3 176 | finally: 177 | if cur: 178 | cur.close() 179 | 180 | def append_all_data(self, data, table): 181 | """ 182 | append at once 183 | 184 | """ 185 | for row in data: 186 | self.append_data(row, table) 187 | self.dbc.commit() 188 | self.query_queued = 0 189 | 190 | def should_commit_lazy(self, query): 191 | """override for should_commit 192 | 193 | :query: @todo 194 | :returns: @todo 195 | 196 | """ 197 | self.query_queued += 1 198 | if self.query_queued >= 30: 199 | self.should_commit(query) 200 | self.query_queued = 0 201 | 202 | def force_commit(self): 203 | """forces to commit 204 | :returns: @todo 205 | 206 | """ 207 | self.query_queued = 0 208 | self.dbc.commit() 209 | 210 | 211 | class TestSQLITE(unittest.TestCase): 212 | """docstring for TestSQLITE""" 213 | 214 | def test_inserts(self): 215 | """test insert queries 216 | :returns: @todo 217 | 218 | """ 219 | db.append_data({'name': 'gmail.com', 'si': 10}, 'tests') 220 | db.append_data({'name': 'inbox.com', 'si': 12}, 'tests') 221 | db.append_data({'name': 'reddit.com', 'si': 1}, 'tests') 222 | db.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 223 | db.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 224 | db.query('insert into tests (name, si) values("google.com", 10)') 225 | cnt = db.count_rows('select count(*) rows from tests') 226 | self.assertEqual(cnt, 6) 227 | 228 | def test_queries(self): 229 | """test select queries 230 | 231 | :returns: @todo 232 | """ 233 | result = db.select('tests', ['name||sgmail.com']) 234 | self.assertEqual(0, len(result.fetchall())) 235 | result = db.select('tests', ['name||gmail.com']) 236 | self.assertEqual(1, len(result.fetchall())) 237 | result = db.select('tests', ['si||2', 'si|or|12']) 238 | self.assertEqual(3, len(result.fetchall())) 239 | result = db.select('tests', ['name||gmail.com', 'name|or|inbox.com']) 240 | self.assertEqual(2, len(result.fetchall())) 241 | 
result = db.select('tests', ['name||reddit.com'], 'count(*)') 242 | self.assertEqual(3, result.fetchone()[0]) 243 | result = db.select('tests', ['name||reddit.com'], 'count(*)', 244 | at_end='group by si') 245 | 246 | def test_non_lazy_commit(self): 247 | """test with possible unique data 248 | :returns: @todo 249 | 250 | """ 251 | db.set_lazy_commit(False) 252 | for k in range(0, 1000): 253 | db.append_data({'name': 'email_%s.com' % k, 'si': k}, 'uniquetests') 254 | cnt = db.count_rows('select count(*) rows from uniquetests') 255 | self.assertEqual(1000, cnt) 256 | db.query('delete from uniquetests') 257 | db.dbc.commit() 258 | 259 | def test_lazy_commit(self): 260 | """test lazy commit 261 | 262 | """ 263 | db.set_lazy_commit(True) 264 | for k in range(0, 1000): 265 | db.append_data({'name': 'email_%s.com' % k, 'si': k}, 'uniquetests') 266 | cnt = db.count_rows('select count(*) rows from uniquetests') 267 | self.assertEqual(1000, cnt) 268 | 269 | 270 | def main(): 271 | """ 272 | test starts here 273 | """ 274 | try: 275 | unittest.main() 276 | # pylint: disable=broad-except 277 | except Exception: 278 | pass 279 | 280 | if __name__ == '__main__': 281 | import os 282 | if os.path.exists('db'): 283 | os.unlink('db') 284 | # pylint: disable=invalid-name 285 | db = SQLite() 286 | db.query('create table tests( name test, si integer)') 287 | db.query('create table uniquetests(name test unique, si integer)') 288 | main() 289 | if os.path.exists('db'): 290 | os.unlink('db') 291 | -------------------------------------------------------------------------------- /downloader.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | downloader 4 | """ 5 | import logging 6 | import requests 7 | import pycurl 8 | from io import BytesIO 9 | import time 10 | import random 11 | try: 12 | import libs.config as config 13 | import libs.utils as utils 14 | except ImportError: 15 | # pylint: disable=relative-import 16 | import config 17 | import utils 18 | 19 | 20 | USER_AGENT = 'Mozilla/5.0 Gecko/20120101 Firefox/40.0' 21 | SLEEP_AFTER = 10 22 | SLEEP = 3 23 | 24 | 25 | class Error(Exception): 26 | """handles exceptions""" 27 | def __init__(self, value=None): 28 | self.value = value 29 | 30 | def __str__(self): 31 | return repr(self.value) 32 | 33 | 34 | class RetryableError(Error): 35 | """docstring for ConnectionError""" 36 | def __init__(self, value=None): 37 | super(RetryableError, self).__init__() 38 | self.value = value 39 | 40 | 41 | class SSLError(Error): 42 | """docstring for SSLError""" 43 | def __init__(self, value=None): 44 | super(SSLError, self).__init__() 45 | self.value = value 46 | 47 | 48 | class ConnectionError(Error): 49 | """docstring for ConnectionError""" 50 | def __init__(self, value=None): 51 | super(ConnectionError, self).__init__() 52 | self.value = value 53 | 54 | 55 | def request_factory(page, proxy, headers, timeout, logger=None): 56 | """uses request to download""" 57 | logging.getLogger("requests").setLevel(logging.WARNING) 58 | try: 59 | with requests.Session() as session: 60 | session.headers.update(headers) 61 | if page.post != None: 62 | response = session.post(page.url, page.post, proxies=proxy, 63 | timeout=timeout) 64 | else: 65 | response = session.get(page.url, proxies=proxy, timeout=timeout) 66 | # download page and set response details 67 | page.set_text(response.text, response.content) \ 68 | .set_status_code(response.status_code) \ 69 | .set_redirected_to_url(response.url) 70 | except requests.exceptions.Timeout: 71 | 
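        # (RetryableError is caught by BaseDownloader._download below,
        #  which rotates to a fresh proxy and retries a few times before
        #  giving up.)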
-------------------------------------------------------------------------------- /downloader.py: --------------------------------------------------------------------------------
1 | """
2 | 
3 | downloader
4 | """
5 | import logging
6 | import requests
7 | import pycurl
8 | from io import BytesIO
9 | import time
10 | import random
11 | try:
12 |     import libs.config as config
13 |     import libs.utils as utils
14 | except ImportError:
15 |     # pylint: disable=relative-import
16 |     import config
17 |     import utils
18 | 
19 | 
20 | USER_AGENT = 'Mozilla/5.0 Gecko/20120101 Firefox/40.0'
21 | SLEEP_AFTER = 10
22 | SLEEP = 3
23 | 
24 | 
25 | class Error(Exception):
26 |     """base class for downloader exceptions"""
27 |     def __init__(self, value=None):
28 |         self.value = value
29 | 
30 |     def __str__(self):
31 |         return repr(self.value)
32 | 
33 | 
34 | class RetryableError(Error):
35 |     """raised when retrying (e.g. with another proxy) may succeed"""
36 |     def __init__(self, value=None):
37 |         super(RetryableError, self).__init__()
38 |         self.value = value
39 | 
40 | 
41 | class SSLError(Error):
42 |     """raised on SSL handshake/verification failures"""
43 |     def __init__(self, value=None):
44 |         super(SSLError, self).__init__()
45 |         self.value = value
46 | 
47 | 
48 | class ConnectionError(Error):
49 |     """raised when the connection itself fails; not worth retrying"""
50 |     def __init__(self, value=None):
51 |         super(ConnectionError, self).__init__()
52 |         self.value = value
53 | 
54 | 
55 | def request_factory(page, proxy, headers, timeout, logger=None):
56 |     """uses requests to download"""
57 |     logging.getLogger("requests").setLevel(logging.WARNING)
58 |     try:
59 |         with requests.Session() as session:
60 |             session.headers.update(headers)
61 |             if page.post is not None:
62 |                 response = session.post(page.url, page.post, proxies=proxy,
63 |                                         timeout=timeout)
64 |             else:
65 |                 response = session.get(page.url, proxies=proxy, timeout=timeout)
66 |             # download page and set response details
67 |             page.set_text(response.text, response.content) \
68 |                 .set_status_code(response.status_code) \
69 |                 .set_redirected_to_url(response.url)
70 |     except requests.exceptions.Timeout:
71 |         logger.error("Timed out: %s", page.url)
72 |         raise RetryableError('timed out')
73 |     except requests.packages.urllib3.exceptions.ReadTimeoutError:
74 |         logger.exception("%s", page.url)
75 |         raise RetryableError('read timed out')
76 |     except requests.exceptions.ProxyError:
77 |         logger.exception("%s", page.url)
78 |         raise RetryableError(proxy)
79 |     except requests.exceptions.SSLError:
80 |         logger.exception("%s", page.url)
81 |         raise SSLError()
82 |     except requests.exceptions.InvalidSchema:
83 |         logger.exception('Failed to parse: %s', page.url)
84 |         raise ConnectionError()
85 |     except requests.ConnectionError:
86 |         logger.exception('Failed to connect: %s', page.url)
87 |         raise ConnectionError()
88 | 
89 | 
90 | def curl_factory(page, proxy, headers, timeout, logger=None):
91 |     """uses curl to download"""
92 |     curl_headers = []
93 |     for key in headers:
94 |         curl_headers.append('%s: %s' % (key, headers[key]))
95 |     curl_headers += ['Accept-Charset: UTF-8']
96 |     response = BytesIO()
97 |     header_buf = BytesIO()
98 |     curl = pycurl.Curl()
99 |     try:
100 |         curl.setopt(curl.URL, page.url)
101 |     except UnicodeEncodeError:
102 |         logger.error("URL ISSUE: %s", page.url)
103 |         raise Error()
104 |     curl.setopt(curl.CONNECTTIMEOUT, timeout)  # connect phase; total TIMEOUT below
105 |     curl.setopt(curl.WRITEFUNCTION, response.write)
106 |     curl.setopt(curl.HEADERFUNCTION, header_buf.write)
107 |     curl.setopt(curl.HTTPHEADER, curl_headers)
108 |     curl.setopt(curl.FOLLOWLOCATION, True)
109 |     curl.setopt(curl.TIMEOUT, timeout * 2)
110 |     if proxy is not None:
111 |         logger.debug("setting proxy: %s", proxy)
112 |         curl.setopt(curl.PROXY, proxy['http'])
113 |     if page.post is not None:
114 |         logger.debug("setting post: %s", page.post)
115 |         curl.setopt(curl.POSTFIELDS, page.post)
116 |     try:
117 |         curl.perform()
118 |     except pycurl.error:
119 |         logger.exception('failed downloading')
120 |         raise Error()
121 |     text = response.getvalue().decode('UTF-8', errors='ignore')
122 |     status_code = curl.getinfo(curl.RESPONSE_CODE)
123 |     page.set_text(text, response.getvalue()).set_status_code(status_code)
124 |     try:
125 |         header_buf.seek(0)
126 |         lines = header_buf.getvalue().decode('UTF-8').split('\r\n')
127 |         redirected_to = page.url
128 |         for line in lines:
129 |             if 'Location' in line:
130 |                 redirected_to = line.split(': ')[-1]
131 |         curl.close()
132 |         page.set_redirected_to_url(redirected_to)
133 |     except Exception:
134 |         logger.exception('failed parsing headers')
135 |         page.set_redirected_to_url(page.url)
136 | 
137 | 
138 | def cleanup_url(url):
139 |     """cleans the given url of stray markup and spaces
140 | 
141 |     :url: url to normalize
142 |     :returns: lowercased url with spaces escaped and <br> remnants removed
143 | 
144 |     """
145 |     url = url.replace(' ', '%20').lower()
146 |     url = url.replace('<br%20>', '')
147 |     url = url.replace('<br%20/>', '')
148 |     return url
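`cleanup_url` escapes spaces before lowercasing, which is what lets literal `<br />` debris (already `%20`-escaped by the first replace) match the patterns above; for instance, assuming the function is imported from this module:

from libs.downloader import cleanup_url

assert cleanup_url('http://Example.com/My Page<br />') == \
    'http://example.com/my%20page'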
149 | 
150 | 
151 | # pylint: disable=too-few-public-methods
152 | class BaseCommon(object):
153 |     """holds the config and logger shared by the downloaders"""
154 | 
155 |     def __init__(self):
156 |         super(BaseCommon, self).__init__()
157 |         self.cfg = config.Config()
158 |         txt = '{}.dm'.format(self.cfg.g('logger.base'))
159 |         self.log = logging.getLogger(txt)
160 | 
161 | 
162 | class BaseDownloader(BaseCommon):
163 |     """plain downloader with proxy rotation and throttling"""
164 | 
165 |     def __init__(self):
166 |         super(BaseDownloader, self).__init__()
167 |         self.downloads = 0
168 |         self.timeout = self.cfg.g('timeout', 60)
169 |         self.from_cache = False
170 |         self.current_proxy = None
171 |         self.proxy_used = 0
172 |         self.bad_proxies = set()
173 |         self.headers = {'User-Agent': USER_AGENT}
174 |         self.use_proxy = self.cfg.g('proxies', 'no') == 'yes'
175 |         self.use_curl = self.cfg.g('use_curl', 'no') == 'yes'
176 |         self.load_bad_proxies()
177 |         self.which_downloader()
178 | 
179 |     def set_logger(self, logger):
180 |         """sets up an independent logger
181 | 
182 |         :logger: logger instance to use instead of the default
183 |         :returns: None
184 | 
185 |         """
186 |         self.log = logger
187 | 
188 |     def which_downloader(self):
189 |         """sets which downloader to be used"""
190 |         if self.use_curl:
191 |             self.download_with = curl_factory
192 |         else:
193 |             self.download_with = request_factory
194 | 
195 |     def proxy_enabled(self):
196 |         """check if proxy is enabled
197 |         :returns: True when 'proxies' is set to 'yes' in the config
198 | 
199 |         """
200 |         return self.use_proxy
201 | 
202 |     def load_bad_proxies(self):
203 |         """loads the list of known bad proxies
204 |         :returns: None
205 | 
206 |         """
207 |         if self.proxy_enabled():
208 |             self.current_proxy = self.get_random_proxy()
209 |             try:
210 |                 self.bad_proxies = set(utils.read_file('bad_proxies', True))
211 |             except OSError:
212 |                 pass
213 | 
214 |     def get_random_proxy(self):
215 |         """returns a proxy from proxies.txt
216 | 
217 |         :returns: a proxy not on the bad list (loops while none qualifies)
218 |         """
219 |         proxy_file = self.cfg.g('proxy_file', 'proxies.txt')
220 |         proxies = utils.read_file(proxy_file, True)
221 |         while True:
222 |             proxy = random.choice(proxies)
223 |             if proxy in self.bad_proxies:
224 |                 continue
225 |             self.proxy_used = 0
226 |             return proxy
227 | 
228 |     def _download(self, page, proxy=None):
229 |         """does the actual download, rotating proxies on retryable errors"""
230 |         error_count = 0
231 |         while True:
232 |             try:
233 |                 self.download_with(page, proxy, self.headers,
234 |                                    self.timeout, self.log)
235 |                 if self.proxy_enabled():
236 |                     self.proxy_used += 1
237 |                 return
238 |             except ConnectionError:
239 |                 return
240 |             except RetryableError:
241 |                 if self.proxy_enabled():
242 |                     self.bad_proxies.add(proxy['http'])
243 |                     utils.append_to_file('bad_proxies', proxy['http'] + '\n')
244 |                     self.current_proxy = self.get_random_proxy()
245 |                     proxy = {'http': self.current_proxy}
246 |                 error_count += 1
247 |                 if error_count > 3:
248 |                     raise Error()
249 | 
250 |     def take_a_nap_after(self, after, duration):
251 |         """force sleep for :duration: seconds every :after: downloads"""
252 |         if self.downloads % after == 0:
253 |             time.sleep(duration)
254 | 
255 |     def download(self, page):
256 |         """downloads given url"""
257 |         if self.proxy_enabled() and \
258 |                 self.proxy_used >= self.cfg.g('proxy.used', 100):
259 |             old_proxy = self.current_proxy
260 |             self.current_proxy = self.get_random_proxy()
261 |             self.log.info("proxy: %s -> %s", old_proxy, self.current_proxy)
262 |         if self.proxy_enabled():
263 |             proxy = {'http': self.current_proxy}
264 |         else:
265 |             proxy = None
266 |         url = cleanup_url(page.url)
267 |         if page.url != url:
268 |             page.set_url(url)
269 |         try:
270 |             start_time = time.time()
271 |             self._download(page, proxy)
272 |             end_time = time.time()
273 |             page.set_load_time(end_time - start_time)
274 |             self.take_a_nap_after(SLEEP_AFTER, SLEEP)
275 |             self.downloads = self.downloads + 1
276 |         except requests.ConnectionError:
277 |             self.log.debug("ConnectionError: %s", url)
278 |         except (UnboundLocalError, AttributeError, Error):
279 |             page.set_state(False)
280 | 
281 | 
282 | class CachedDownloader(BaseDownloader):
283 |     """downloads and saves webpages to an on-disk cache"""
284 | 
285 |     def download(self, page):
286 |         content = ''
287 |         fullpath = utils.get_cache_full_path(page.url, page.post)
288 |         if utils.is_valid_cache_file(fullpath):
289 |             content = utils.read_file(fullpath)
290 |             page.set_text(content).set_redirected_to_url(page.url).set_load_time(0)
291 |         else:
292 |             super(CachedDownloader, self).download(page)
293 |             if page.state:
294 |                 utils.save_to_file(fullpath, page.text, True)
295 | 
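Typical driving code, mirroring the pattern `utils.get_tlds` uses further down in this dump (the `page.state`/`page.text` attribute names are assumed from the setters used above):

from libs.pages import DownloadedPage
from libs.downloader import CachedDownloader
import libs.utils as utils

page = DownloadedPage().set_url('https://publicsuffix.org/list/effective_tld_names.dat')
dlm = CachedDownloader()   # or BaseDownloader() to bypass the on-disk cache
dlm.download(page)
if page.state:
    utils.save_to_file('tlds.dat', page.text)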
-------------------------------------------------------------------------------- /utils.py: --------------------------------------------------------------------------------
1 | from datetime import datetime, date, timedelta
2 | import codecs
3 | import logging
4 | import logging.handlers
5 | import os
6 | import re
7 | import json
8 | try:
9 |     from urllib.parse import urlparse
10 | except ImportError:
11 |     from urlparse import urlparse
12 | try:
13 |     import libs.config as config
14 | except ImportError:
15 |     import config
16 | try:
17 |     from hashlib import md5
18 | except ImportError:
19 |     from md5 import md5
20 | 
21 | 
22 | def read_file(filename, linewise=False):
23 |     """
24 |     reads a file, either as one string or as a list of lines
25 | 
26 |     """
27 |     try:
28 |         with open(filename) as fptr:
29 |             content = fptr.read().strip()
30 |             if linewise:
31 |                 content = content.split("\n")
32 |             return content
33 |     except Exception:
34 |         raise  # re-raise with the original traceback
35 | 
36 | 
37 | def uni(text):
38 |     """get some unicode love
39 | 
40 |     :text: text to convert
41 |     :returns: unicode on Python 2, the string unchanged on Python 3
42 | 
43 |     """
44 |     try:
45 |         return unicode(text)
46 |     except NameError:
47 |         return text
48 | 
49 | 
50 | def save_to_file(filename, content, use_codec=False):
51 |     if use_codec:
52 |         with codecs.open(filename, encoding='utf-8', mode='w') as fp:
53 |             try:
54 |                 return fp.write(unicode(content))  # Python 2
55 |             except NameError:
56 |                 pass
57 |             try:
58 |                 return fp.write(content.encode('utf-8'))
59 |             except TypeError:
60 |                 pass  # Python 3: the codecs writer wants str, not bytes
61 |             return fp.write(content)
62 |     else:
63 |         with open(filename, mode='w') as fp:
64 |             fp.write(content)
65 | 
66 | 
67 | def append_to_file(filename, content):
68 |     with open(filename, mode='a+') as fp:
69 |         fp.write(content)
70 | 
71 | 
72 | def get(arr, indx):
73 |     try:
74 |         return arr[indx]
75 |     except IndexError:
76 |         return None
77 | 
78 | 
79 | def remove_extra_whitespace(txt):
80 |     return re.sub(' +', ' ', txt)
81 | 
82 | 
83 | def cleanup_text(text):
84 |     t = text.strip()
85 |     t = re.sub('\t+', '', t)
86 |     t = re.sub('\n+', '\n', t)
87 |     t = re.sub(' +', ' ', t)
88 |     t = re.sub('\xa0', '', t)
89 |     t = re.sub('\u2022', '', t)
90 |     return t
91 | 
92 | 
93 | def remove_br(content):
94 |     """replaces <br> variants with newlines
95 | 
96 |     :content: html text to clean
97 |     :returns: text with <br> tags turned into newlines
98 | 
99 |     """
100 |     content = content.replace('<br>', '\n')
101 |     content = content.replace('</br>', '\n')
102 |     content = content.replace('<br />', '\n')
103 |     content = content.replace('<br%20/>', '\n')
104 |     return content
105 | 
106 | 
107 | def clean_url(lnk, baseurl):
108 |     """cleans up url"""
109 |     lnk = lnk.replace('.html', '')
110 |     lnk = lnk.replace('.htm', '')
111 |     lnk = lnk.replace(baseurl, '')
112 |     lnk = lnk.replace('%20', ' ')
113 |     return lnk.lower()
114 | 
115 | 
116 | def union(l1, l2):  # despite the name: strips elements common to both lists
117 |     a = l1.copy()
118 |     b = l2.copy()
119 |     for e in a[:]:
120 |         if e in b:
121 |             a.remove(e)
122 |             b.remove(e)
123 |     return a, b
124 | 
125 | 
126 | def joindict(d1, d2):
127 |     d = d1.copy()
128 |     d.update(d2)
129 |     return d
130 | 
131 | 
132 | def hash(url, data=None):
133 |     """ creates hash of the url and post data (if required and exists)"""
134 |     m = md5()
135 |     m.update(url.encode('utf-8'))
136 |     if data is not None:
137 |         m.update(data.encode('utf-8') if isinstance(data, str) else data)
138 |     return m.hexdigest()
139 | 
140 | 
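A standalone restatement of `hash` above (renamed `cache_key` here, since `hash` shadows the builtin), showing the property the page cache relies on: identical URL+POST pairs always map to the same digest:

from hashlib import md5

def cache_key(url, data=None):
    m = md5()
    m.update(url.encode('utf-8'))
    if data is not None:
        m.update(data.encode('utf-8') if isinstance(data, str) else data)
    return m.hexdigest()

assert cache_key('http://a.com', 'q=1') == cache_key('http://a.com', 'q=1')
assert cache_key('http://a.com') != cache_key('http://b.com')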
141 | def setup_logger(load_cfg=None):
142 |     """sets up logging"""
143 |     # {{{ load config
144 |     if load_cfg is not None:
145 |         cfg = config.Config(load_cfg)
146 |     else:
147 |         cfg = config.Config()
148 |     # }}}
149 |     # {{{ setting up everything
150 |     logger = logging.getLogger(cfg.g('logger.base'))
151 |     level = getattr(logging, cfg.g('logger.level'))
152 |     clevel = getattr(logging, cfg.g('logger.console.level'))
153 |     flevel = getattr(logging, cfg.g('logger.file.level'))
154 |     logger.setLevel(level)
155 |     logfilepath = cfg.g('logger.path')
156 |     maxsize = cfg.g('logger.backupsize', default=33554432)
157 |     filehandler = logging.handlers.RotatingFileHandler(logfilepath,
158 |                                                        mode='w',
159 |                                                        maxBytes=maxsize,
160 |                                                        backupCount=2)
161 |     template = cfg.get('logger', 'template')
162 |     formatter = logging.Formatter(template)
163 |     formatter.datefmt = cfg.g('logger.datefmt', None)
164 |     # }}}
165 |     # {{{ configure handlers
166 |     console_off = cfg.g('logger.console.off', 'no')
167 |     if console_off == 'no':
168 |         consolehandler = logging.StreamHandler()
169 |         consolehandler.setFormatter(formatter)
170 |         consolehandler.setLevel(clevel)
171 |         logger.addHandler(consolehandler)
172 |     filehandler.setFormatter(formatter)
173 |     filehandler.setLevel(flevel)
174 |     logger.addHandler(filehandler)
175 |     # }}}
176 |     return logger
177 | 
178 | 
179 | def dict_g(dct, key, default=False):
180 |     keys = key.split('.')
181 |     k = dct
182 |     for kwrd in keys:
183 |         if kwrd not in k:
184 |             return default
185 |         k = k[kwrd]
186 |     return k
187 | 
188 | 
189 | def dict_s(d, ky, val):
190 |     """set a value in a nested dict given a dotted path
191 | 
192 |     :d: dict to modify in place
193 |     :ky: dotted path, e.g. 'logger.console.level'
194 |     :val: value to store
195 |     :returns: None
196 | 
197 |     intermediate keys are created (or replaced) as dicts
198 |     """
199 |     keys = ky.split('.')
200 |     k = d
201 |     for kw in keys[:-1]:
202 |         if kw not in k or not isinstance(k[kw], dict):
203 |             k[kw] = {}
204 |         k = k[kw]
205 |     k[keys[-1]] = val
206 | 
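`dict_g` and `dict_s` are meant to be symmetric over dotted paths; for instance (assuming this module is importable as `libs.utils`):

from libs.utils import dict_g, dict_s

cfg = {}
dict_s(cfg, 'logger.console.level', 'INFO')
assert cfg == {'logger': {'console': {'level': 'INFO'}}}
assert dict_g(cfg, 'logger.console.level') == 'INFO'
assert dict_g(cfg, 'logger.file.level') is False  # the default for misses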
207 | 
208 | def flat_rows(listing):
209 |     rows = []
210 |     for item in listing:
211 |         rows.append(item[0])
212 |     return '\n'.join(rows)
213 | 
214 | 
215 | def search_line_in_file(filename, text):
216 |     """searches for a text linewise in a file
217 | 
218 |     :filename: file to scan
219 |     :text: substring to look for
220 |     :returns: True as soon as a line containing :text: is found,
221 |         False otherwise
222 | 
223 |     """
224 |     with open(filename) as f:
225 |         return any(text in line for line in f)
226 | 
227 | 
228 | def get_timestamp():
229 |     """get current unix timestamp
230 |     :returns: seconds since the epoch (naive local time)
231 | 
232 |     """
233 |     return (datetime.now() - datetime(1970, 1, 1)).total_seconds()
234 | 
235 | 
236 | def is_valid_cache_file(fullpath, notlessthan=100):
237 |     """check that the given file path is not an incomplete
238 |     html file (i.e. at least :notlessthan: bytes)"""
239 |     if not os.path.exists(fullpath):
240 |         return False
241 |     statinfo = os.stat(fullpath)
242 |     return statinfo.st_size > notlessthan
243 | 
244 | 
245 | def get_cache_full_path(url, post=None):
246 |     """generate full path for given URL with POST data
247 | 
248 |     :url: url the cache entry belongs to
249 |     :post: post data to pass
250 |     :returns: full path of the cache file
251 | 
252 |     """
253 |     filename = hash(url, post)
254 |     return file_cached_path(filename, url)
255 | 
256 | 
257 | def clean_failed_page_cache(url, post=None):
258 |     """
259 |     remove cached files that failed
260 |     """
261 |     fullpath = get_cache_full_path(url, post)
262 |     if os.path.exists(fullpath):
263 |         os.unlink(fullpath)
264 | 
265 | 
266 | def file_cached_path(filename, url=None):
267 |     """ expects hashed filename """
268 |     burl = ''
269 |     if url:
270 |         burl = url.replace('http://', '')
271 |         burl = burl.replace('https://', '')
272 |         burl = burl.replace('www.', '')
273 |         burl = burl.split('/')[0]
274 |     segsize = 3
275 |     cachepath = 'cache'
276 |     firstpart = filename[0:segsize]
277 |     secondpart = filename[segsize: 2 * segsize]
278 |     fullpath = "%s/%s/%s/%s" % (cachepath, burl, firstpart, secondpart)
279 |     if not os.path.exists(fullpath):
280 |         os.makedirs(fullpath)
281 |     return '%s/%s.html' % (fullpath, filename)
282 | 
283 | 
284 | class DateTimeEncoder(json.JSONEncoder):
285 |     """ encode datetime values to proper strings for json;
286 |     use json.dumps(obj, cls=DateTimeEncoder)
287 |     """
288 | 
289 |     def default(self, obj):
290 |         if isinstance(obj, datetime):
291 |             return obj.isoformat()
292 |         elif isinstance(obj, date):
293 |             return obj.isoformat()
294 |         elif isinstance(obj, timedelta):
295 |             return (datetime.min + obj).time().isoformat()
296 |         else:
297 |             return super(DateTimeEncoder, self).default(obj)
298 | 
299 | 
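Usage goes through `json.dumps`'s `cls` hook rather than calling `encode` directly (the `libs.utils` import path is assumed):

import json
from datetime import datetime
from libs.utils import DateTimeEncoder

payload = {'run_started': datetime(2015, 4, 1, 12, 30)}
print(json.dumps(payload, cls=DateTimeEncoder))
# -> {"run_started": "2015-04-01T12:30:00"}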
300 | def delete_folder_content(folder, delete_parent=False):
301 |     """delete a folder's content
302 | 
303 |     :folder: directory to empty
304 |     :delete_parent: also remove :folder: itself afterwards
305 | 
306 |     """
307 |     # os is already imported module-wide; shutil is only needed here
308 |     import shutil
309 |     for the_file in os.listdir(folder):
310 |         file_path = os.path.join(folder, the_file)
311 |         try:
312 |             if os.path.isfile(file_path):
313 |                 os.unlink(file_path)
314 |             elif os.path.isdir(file_path):
315 |                 shutil.rmtree(file_path)
316 |         except Exception as e:
317 |             print(e)
318 |     if delete_parent:
319 |         os.rmdir(folder)
320 | 
321 | 
322 | def get_net_loc(url):
323 |     """get net location without the leading subdomain
324 | 
325 |     :url: url to clean
326 |     :returns: e.g. 'sub.example.com' -> 'example.com'
327 | 
328 |     """
329 |     urlobj = urlparse(url.replace('www.', ''))
330 |     netloc = urlobj.netloc.split('.')
331 |     if len(netloc) > 2:
332 |         return '.'.join(netloc[1:])
333 |     else:
334 |         return urlobj.netloc
335 | 
336 | 
337 | def get_shorted_url(url, length=10):
338 |     """shortens a url (or parsed url) for log output"""
339 |     try:
340 |         segs = url.netloc.split('www.')[1][:length]
341 |         return (''.join(segs)).center(length + 4, '_')
342 |     except IndexError:
343 |         return (''.join(url.netloc[:length])).center(length + 4, '_')
344 |     except AttributeError:
345 |         urlobj = urlparse(url)
346 |         try:
347 |             segs = urlobj.netloc.split('www.')[1][:length]
348 |             return (''.join(segs)).center(length + 4, '_')
349 |         except IndexError:
350 |             return (''.join(urlobj.netloc[:length])).center(length + 4, '_')
351 | 
352 | 
353 | def get_domain(url, tlds):
354 |     """extracts the registrable domain using the public-suffix rules"""
355 |     try:
356 |         url_elements = urlparse(url)[1].split('.')
357 |     except TypeError:
358 |         raise ValueError("Failed to check url")
359 |     for i in range(-len(url_elements), 0):
360 |         last_i_elements = url_elements[i:]
361 |         # i=-3: ["abcde","co","uk"]
362 |         # i=-2: ["co","uk"]
363 |         # i=-1: ["uk"] etc
364 |         # abcde.co.uk, co.uk, uk
365 |         candidate = ".".join(last_i_elements)
366 |         # *.co.uk, *.uk, *
367 |         wildcard_candidate = ".".join(["*"] + last_i_elements[1:])
368 |         exception_candidate = "!" + candidate
369 |         # match tlds:
370 |         if (exception_candidate in tlds):
371 |             return ".".join(url_elements[i:])
372 |         if (candidate in tlds or wildcard_candidate in tlds):
373 |             return ".".join(url_elements[i - 1:])
374 |             # returns "abcde.co.uk"
375 |     raise ValueError("Domain not in global list of TLDs")
376 | 
377 | 
378 | def older_than(filepath, hours):
379 |     """check if file path is older than given time
380 | 
381 |     :filepath: file to check
382 |     :hours: how many hours
383 |     :returns: True/False
384 |     """
385 |     import time
386 |     mtime = os.path.getmtime(filepath)
387 |     ctime = time.time()
388 |     time_passed = (ctime - mtime) / 3600
389 |     if time_passed > hours:
390 |         return True
391 |     return False
392 | 
393 | 
394 | def get_tlds():
395 |     """load the public-suffix list, refreshing it every 12 hours"""
396 |     from libs.pages import DownloadedPage
397 |     from libs.downloader import BaseDownloader
398 |     fname = "effective_tld_names.dat.txt"
399 |     if not os.path.exists(fname) or older_than(fname, 12):
400 |         url = 'https://publicsuffix.org/list/effective_tld_names.dat'
401 |         dlm = BaseDownloader()
402 |         page = DownloadedPage().set_url(url)
403 |         dlm.download(page=page)
404 |         save_to_file(fname, page.text)
405 |     with open(fname) as tld_file:
406 |         tlds = [line.strip() for line in tld_file if line[0] not in "/\n"]
407 |     return tlds
--------------------------------------------------------------------------------