├── __init__.py ├── .gitignore ├── logprovider.py ├── cfg └── config.json ├── config.py ├── client_async.py ├── result.py ├── server.py ├── message.py ├── listener.py ├── server_mp.py ├── client.py ├── modulewatcher.py ├── dbbase.py ├── testing.py ├── pages.py ├── test_downloader.py ├── mysql.py ├── pgsql.py ├── sqlite.py ├── downloader.py └── utils.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | -------------------------------------------------------------------------------- /logprovider.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class LogProvider(object): 5 | """make loggers""" 6 | def __init__(self): 7 | super(LogProvider, self).__init__() 8 | 9 | def get(self, config, suffix): 10 | """get logger with name 11 | 12 | :config: config class 13 | :suffix: additional text 14 | :returns: @todo 15 | 16 | """ 17 | txt = '{}.{}'.format(config.g('logger.base'), suffix) 18 | return logging.getLogger(txt) 19 | 20 | -------------------------------------------------------------------------------- /cfg/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "caching": "no", 3 | "proxies": "no", 4 | "timeout": 60, 5 | "logger": { 6 | "template" : "%(asctime)s - %(levelname)-6s - %(name)s - %(lineno)d - %(message)s", 7 | "path": "../logs/testing.log", 8 | "base": "app_name", 9 | "level": "DEBUG", 10 | "console": { "off": "yes", "level": "INFO" }, 11 | "file": { "level": "DEBUG" }, 12 | "backupsize": 100000000 13 | }, 14 | "db": { 15 | "sqlite": { 16 | "file": "database_file_name", 17 | "timeout": 10, 18 | "same_thread": 1 19 | } 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class Config(object): 5 | """manage configurations""" 6 | def __init__(self, cfgfile='cfg/config.json'): 7 | super(Config, self).__init__() 8 | self.cfgfile = cfgfile 9 | with open(self.cfgfile) as jsonfile: 10 | self.cfg = json.load(jsonfile) 11 | 12 | def get(self, *args, **kwargs): 13 | k = self.cfg 14 | for kw in args: 15 | k = k[kw] 16 | return k 17 | 18 | def g(self, ky, default=False): 19 | keys = ky.split('.') 20 | k = self.cfg 21 | for kwrd in keys: 22 | if kwrd not in k: 23 | return default 24 | k = k[kwrd] 25 | return k 26 | -------------------------------------------------------------------------------- /client_async.py: -------------------------------------------------------------------------------- 1 | from libs.config import Config 2 | import logging 3 | from libs.message import Message 4 | import asyncore 5 | 6 | 7 | class AsyncHandler(asyncore.dispatcher_with_send): 8 | 9 | def __init__(self, sock, request_provider): 10 | self.request_provider = request_provider 11 | cfg = Config() 12 | lcfg = '{}.client'.format(cfg.g('logger.base')) 13 | self.logger = logging.getLogger(lcfg) 14 | asyncore.dispatcher_with_send.__init__(self, sock) 15 | 16 | def handle_read(self): 17 | """reads data from socket""" 18 | data = self.recv(2048) 19 | data = data.strip().decode('utf-8') 20 | self.logger.debug('received: %s', data) 21 | self.request = self.request_provider(data, Message(self)) 22 | 
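        # (Config.g above resolves dotted keys against cfg/config.json,
        #  e.g. Config().g('logger.console.level') -> 'INFO' and
        #  Config().g('no.such.key', 'fallback') -> 'fallback' -- a usage
        #  sketch; get('logger', 'console', 'level') is the *args variant.)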
self.request.check_request()
--------------------------------------------------------------------------------
/result.py:
--------------------------------------------------------------------------------

def _rst(result, k='s', v=None):
    if v is None:
        return result[k]
    result[k] = v

def s(result, v=None):
    return _rst(result, 's', v)

def m(result, v=None):
    return _rst(result, 'match', v)

def o(result, v=None):
    return _rst(result, 'original', v)

def b(result, v=None):
    return _rst(result, 'by', v)

def i(result, v=None):
    return _rst(result, 'info', v)

def gt(m, o, i):
    return { "s": True, "match": m, "original": o, "info": i, "by": 'title' }

def gc(m, o, i):
    return { "s": True, "match": m, "original": o, "info": i, "by": 'content' }

def gl(m, o, i):
    return { "s": True, "match": m, "original": o, "info": i, "by": 'link' }

def gf(o):
    return { "s": False, "original": o }
--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
from libs import listener
from libs import utils
from libs.mysql import MySQL

logger = utils.setup_logger()


class Server(object):
    """docstring for Server"""
    def __init__(self, host, port):
        super(Server, self).__init__()
        self.host = host
        self.port = int(port)
        self.db = MySQL()
        self.db.query('delete from runlogs')

    def start(self):
        conn = None
        try:
            conn = listener.ConnectionThread(self.host, self.port, self.db)
            utils.save_to_file('port', '{}'.format(self.port))
            conn.start()
        except Exception:
            logger.exception("FAILED")
            try:
                if conn:
                    conn.s.close()
            except Exception:
                pass
--------------------------------------------------------------------------------
/message.py:
--------------------------------------------------------------------------------
import json


# {{{ Messages
class Message(object):
    """sends messages over the socket connection"""
    def __init__(self, conn):
        super(Message, self).__init__()
        self.conn = conn

    def send_msg(self, msg, close=True):
        """
        send a length-prefixed message through the socket
        """
        try:
            msg = json.dumps(msg)
        except Exception:
            pass
        replylen = len(msg)
        msg = "%s\n%s" % (replylen, msg)
        self.conn.send(bytearray(msg, 'utf8'))
        if close:
            self.conn.close()

    def send_good_msg(self, msg):
        """
        send success message
        """
        self.send_msg({'s': 1, 'm': msg})

    def send_fail_msg(self, msg):
        """
        send failure message
        """
        self.send_msg({'s': 0, 'm': msg})

    def send_good_result(self, data):
        """
        send result on success
        """
        data['s'] = 1
        self.send_msg(data)

    def send_fail_result(self, data):
        """
        send result on failure
        """
        data['s'] = 0
        self.send_msg(data)

# }}}
--------------------------------------------------------------------------------
/listener.py:
--------------------------------------------------------------------------------
import logging
import socket
import threading
import sys
import os
from libs.config import Config
from inc.client import Client

g_config = Config()
l = '{}.server'.format(g_config.g('logger.base'))
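# (Message.send_msg in message.py above frames every reply as
#  "<length>\n<json>"; send_good_msg('ok'), for example, writes
#  b'19\n{"s": 1, "m": "ok"}' and the peer splits on the first
#  newline before json.loads() -- a sketch of the framing only.)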
logger = logging.getLogger(l)


class ConnectionThread(threading.Thread):

    def __init__(self, host, port, db):
        super(ConnectionThread, self).__init__()
        self.db = db
        self.db.query('delete from runlogs')
        try:
            self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self.s.bind((host, port))
            logger.info('listening to %s:%s', host, port)
            self.s.listen(3)
        except socket.error:
            self.s.close()
            sys.exit()
        self.clients = []

    def main_loop(self):
        conn = None
        try:
            while True:
                if os.path.exists('cache/stop'):
                    break
                conn, address = self.s.accept()
                logger.info('[+] Client connected: %s', address[0])
                c = Client(conn, self.db)
                c.start()
                self.clients.append(c)
        except Exception:
            logger.exception("Error!")
        finally:
            logger.info("[-] Closing connection")
            if conn is not None:
                conn.close()
            if os.path.exists('cache/stop'):
                os.remove('cache/stop')
            sys.exit()

    def run(self):
        self.main_loop()
--------------------------------------------------------------------------------
/server_mp.py:
--------------------------------------------------------------------------------
"""
multiprocessing server
"""
from libs import utils
from time import sleep

LOGGER = utils.setup_logger()


class Server(object):
    """docstring for ServerMultiProcess"""

    def __init__(self):
        super(Server, self).__init__()
        self.client_provider = None
        self.provider = None

    def set_client_provider(self, client_provider):
        """request handler"""
        self.client_provider = client_provider
        return self

    def set_provider(self, provider):
        """data provider

        :provider: object exposing get_queued_data()
        :returns: self, for chaining

        """
        self.provider = provider
        return self

    def start(self):
        """everything starts here"""
        process = None
        try:
            while True:
                try:
                    providerdata = self.provider.get_queued_data()
                    if providerdata is not None:
                        process = self.client_provider()\
                            .set_provider_data(providerdata)
                        process.start()
                    sleep(3)
                except KeyboardInterrupt:
                    break
                except Exception:
                    LOGGER.exception("FAILED")
                    break
        except Exception:
            LOGGER.exception("server out ...")
        finally:
            LOGGER.info('cleaning up')
            # set running scrapers to be paused
            if process is not None:
                process.cleanup()
--------------------------------------------------------------------------------
/client.py:
--------------------------------------------------------------------------------
from multiprocessing.dummy import Process
from libs.config import Config
import logging


class Client(Process):
    """Request handler

    Receives a request from the scraper server and starts a new threaded
    process"""
    def __init__(self):
        super(Client, self).__init__()
        self.request_provider = None
        self.request_cleanup = None
        self.request = None
        self.providerdata = None
        cfg = Config()
        lcfg = '{}.client'.format(cfg.g('logger.base'))
        self.logger = logging.getLogger(lcfg)

    def set_request_provider(self, request_provider):
        """request handler,

        request handlers process the request and start scraping"""
        self.request_provider = request_provider
        return self

    def set_request_cleanup(self, request_cleanup):
        """request_cleanup function"""
        self.request_cleanup = request_cleanup
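        # (Typical wiring, sketched with hypothetical names:
        #      server_mp.Server()
        #          .set_client_provider(lambda: Client()
        #              .set_request_provider(MyRequest))
        #          .set_provider(my_queue).start()
        #  -- Server then calls set_provider_data() and start() on each
        #  Client it creates for a queued item.)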
return self 31 | 32 | def set_provider_data(self, providerdata): 33 | """set data 34 | 35 | :providerdata: @todo 36 | :returns: @todo 37 | 38 | """ 39 | self.providerdata = providerdata 40 | return self 41 | 42 | def set_daemon(self, mode): 43 | """should be daemon or not""" 44 | self.daemon = mode 45 | return self 46 | 47 | def run(self): 48 | request = self.request_provider(self.providerdata) 49 | request.setup_repo() 50 | request.start() 51 | return self 52 | 53 | def cleanup(self): 54 | """cleans up when exiting 55 | 56 | :returns: @todo 57 | """ 58 | self.logger.info("cleaning up") 59 | self.request_cleanup() 60 | -------------------------------------------------------------------------------- /modulewatcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Author: Chris Eberle 3 | # Watch for any changes in a module or package, and reload it automatically 4 | 5 | import pyinotify 6 | import imp 7 | import os 8 | 9 | class ModuleWatcher(pyinotify.ProcessEvent): 10 | """ 11 | Automatically reload any modules or packages as they change 12 | """ 13 | 14 | def __init__(self): 15 | "El constructor" 16 | 17 | self.wm = pyinotify.WatchManager() 18 | self.notifier = None 19 | self.mod_map = {} 20 | 21 | def _watch_file(self, file_name, module): 22 | "Add a watch for a specific file, and map said file to a module name" 23 | 24 | file_name = os.path.realpath(file_name) 25 | self.mod_map[file_name] = module 26 | self.wm.add_watch(file_name, pyinotify.IN_MODIFY) 27 | #print 'Watching', file_name 28 | 29 | def watch_module(self, name): 30 | "Load a module, determine which files it uses, and watch them" 31 | 32 | if imp.is_builtin(name) != 0: 33 | # Pretty pointless to watch built-in modules 34 | return 35 | 36 | (fd, pathname, description) = imp.find_module(name) 37 | 38 | try: 39 | mod = imp.load_module(name, fd, pathname, description) 40 | if fd: 41 | self._watch_file(fd.name, name) 42 | else: 43 | for root, dirs, files in os.walk(pathname): 44 | for filename in files: 45 | fpath = os.path.join(root, filename) 46 | if fpath.endswith('.py'): 47 | self._watch_file(fpath, name) 48 | finally: 49 | if fd: 50 | fd.close() 51 | 52 | def start_watching(self): 53 | "Start the pyinotify watch thread" 54 | 55 | if self.notifier is None: 56 | self.notifier = pyinotify.ThreadedNotifier(self.wm, self) 57 | self.notifier.start() 58 | 59 | def stop_watching(self): 60 | "Stop the pyinotify watch thread" 61 | 62 | if self.notifier is not None: 63 | self.notifier.stop() 64 | 65 | def process_IN_MODIFY(self, event): 66 | "A file of interest has changed" 67 | 68 | # Is it a file I know about? 
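        # (imp is deprecated and removed on modern Python; an equivalent
        #  reload step, sketched for a module that was already imported:
        #      import importlib, sys
        #      importlib.reload(sys.modules[modname])
        #  )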
69 | if event.path not in self.mod_map: 70 | return 71 | 72 | # Find out which module is using that file 73 | modname = self.mod_map[event.path] 74 | 75 | # Reload the module 76 | (fd, pathname, description) = imp.find_module(modname) 77 | try: 78 | print ('reloading module') 79 | imp.load_module(modname, fd, pathname, description) 80 | finally: 81 | if fd: 82 | fd.close() 83 | 84 | #print 'Reload', modname 85 | 86 | if __name__ == '__main__': 87 | # Test everything 88 | 89 | import sys 90 | 91 | mw = ModuleWatcher() 92 | mw.watch_module('module1') 93 | mw.watch_module('module2') 94 | mw.start_watching() 95 | 96 | try: 97 | raw_input('Press ENTER to exit') 98 | finally: 99 | mw.stop_watching() 100 | sys.exit(0) 101 | -------------------------------------------------------------------------------- /dbbase.py: -------------------------------------------------------------------------------- 1 | """ 2 | base database stuffs 3 | """ 4 | import logging 5 | from libs.config import Config 6 | 7 | 8 | class DBBase(object): 9 | """base database object""" 10 | 11 | cfg = None 12 | logger = None 13 | 14 | def __init__(self): 15 | """ 16 | initiate common requirements 17 | 18 | """ 19 | self.dbc = None 20 | DBBase.cfg = Config() 21 | txt = '{}.dbbase'.format(DBBase.cfg.g('logger.base')) 22 | DBBase.logger = logging.getLogger(txt) 23 | 24 | def requires_commit(self, _query): 25 | """check if query is either insert/update/delete/truncate 26 | 27 | """ 28 | query = _query.lower().strip() 29 | insert = query.startswith('insert') 30 | update = query.startswith('update') 31 | delete = query.startswith('delete') 32 | truncate = query.startswith('truncate') 33 | return insert or update or delete or truncate 34 | 35 | def should_commit(self, _query, conn=None): 36 | """ 37 | determine if the query needs to be committed 38 | """ 39 | if self.requires_commit(_query): 40 | if conn != None: 41 | conn.commit() 42 | else: 43 | self.dbc.commit() 44 | 45 | def do_query(self, qtpl, data, conn=None): 46 | """execute query 47 | 48 | :qtpl: @todo 49 | :data: @todo 50 | :returns: @todo 51 | 52 | """ 53 | if conn != None: 54 | cur = conn.cursor() 55 | else: 56 | cur = self.dbc.cursor() 57 | cur.execute(qtpl, data) 58 | self.should_commit(qtpl, conn=conn) 59 | return cur 60 | 61 | def make_condition(self, cond, col, col_name): 62 | """method signature 63 | 64 | :cond: @todo 65 | :col: @todo 66 | :col_name: @todo 67 | :returns: @todo 68 | 69 | """ 70 | raise NotImplementedError() 71 | 72 | def safe_query(self, querytpl, data, conn=None, retries=0): 73 | """method signature 74 | 75 | :querytpl: @todo 76 | :data: @todo 77 | :returns: @todo 78 | 79 | """ 80 | raise NotImplementedError() 81 | 82 | def query(self, query): 83 | """method signature 84 | 85 | :querytpl: @todo 86 | :returns: @todo 87 | 88 | """ 89 | raise NotImplementedError() 90 | 91 | def select(self, table, data=None, cols='*', at_end=''): 92 | """Executes simple select query 93 | 94 | :table: name of the table 95 | :data: [col|cond|val, ...] 
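               e.g. ['name||gmail.com', 'si|or|2'] builds the conditions
               "name=<name_0> or si=<si_1>" (the first item's cond is left
               empty; the placeholder syntax comes from the driver's
               make_condition)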
96 | :cols: name of the columns 97 | :at_end: if we want order/limit/group 98 | :returns: cursor 99 | 100 | """ 101 | if data == None: 102 | querytpl = 'select %s from %s %s' % (cols, table, at_end) 103 | return self.safe_query(querytpl, data) 104 | conds = [] 105 | fdata = {} 106 | for k, item in enumerate(data): 107 | try: 108 | col, cond, val = item.split('|', 3) 109 | except ValueError: 110 | breaks = item.split('|') 111 | col = breaks[0] 112 | cond = breaks[1] 113 | val = '|'.join(breaks[2:]) 114 | col_name = '%s_%s' % (col, k) 115 | fdata[col_name] = val 116 | conds.append(self.make_condition(cond, col, col_name)) 117 | querytpl = 'select %s from %s where %s %s' % (cols, table, 118 | ' '.join(conds), at_end) 119 | return self.safe_query(querytpl, fdata) 120 | 121 | def _query(self, query, conn=None): 122 | """runs query 123 | 124 | :query: @todo 125 | :returns: @todo 126 | 127 | """ 128 | if conn != None: 129 | cur = conn.cursor() 130 | else: 131 | cur = self.dbc.cursor() 132 | cur.execute(query) 133 | self.should_commit(query, conn=conn) 134 | return cur 135 | 136 | def count_rows(self, query): 137 | """ 138 | counts row using given query 139 | """ 140 | res = self.query(query) 141 | result = res.fetchone() 142 | return result[0] 143 | -------------------------------------------------------------------------------- /testing.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import libs.downloader as downloader 3 | import libs.utils as utils 4 | 5 | 6 | CODES = { 7 | '200': [True, 'OK'], 8 | '201': [True, 'Created'], 9 | '202': [True, 'Accepted'], 10 | '203': [True, 'Non-Authoritative Information'], 11 | '204': [True, 'No Content'], 12 | '205': [True, 'Reset Content'], 13 | '206': [True, 'Partial Content'], 14 | '300': [True, 'Multiple Choices'], 15 | '301': [True, 'Moved Permanently'], 16 | '302': [True, 'Found'], 17 | '303': [True, 'See Other'], 18 | '304': [True, 'Not Modified'], 19 | '305': [True, 'Use Proxy'], 20 | '306': [True, 'Unused'], 21 | '307': [True, 'Temporary Redirect'], 22 | '308': [True, 'Permanent Redirect'], 23 | '400': [False, 'Bad Request'], 24 | '401': [False, 'Unauthorized'], 25 | '402': [False, 'Payment Required'], 26 | '403': [False, 'Forbidden'], 27 | '404': [False, 'Not Found'], 28 | '405': [False, 'Method Not Allowed'], 29 | '406': [False, 'Not Acceptable'], 30 | '407': [False, 'Proxy Authentication Required'], 31 | '408': [False, 'Request Timeout'], 32 | '409': [False, 'Conflict'], 33 | '410': [False, 'Gone'], 34 | '411': [False, 'Length Required'], 35 | '412': [False, 'Precondition Required'], 36 | '413': [False, 'Request Entry Too Large'], 37 | '414': [False, 'Request-URI Too Long'], 38 | '415': [False, 'Unsupported Media Type'], 39 | '416': [False, 'Requested Range Not Satisfiable'], 40 | '417': [False, 'Expectation Failed'], 41 | '418': [False, "I'm a teapot"], 42 | '422': [False, 'Unprocessable Entity'], 43 | '428': [False, 'Precondition Required'], 44 | '429': [False, 'Too Many Requests'], 45 | '431': [False, 'Request Header Fields Too Large'], 46 | '451': [False, 'Unavailable For Legal Reasons'], 47 | '500': [False, 'Internal Server Error'], 48 | '501': [False, 'Not Implemented'], 49 | '502': [False, 'Bad Gateway'], 50 | '503': [False, 'Service Unavailable'], 51 | '504': [False, 'Gateway Timeout'], 52 | '505': [False, 'HTTP Version Not Supported'], 53 | '511': [False, 'Network Authentication Required'], 54 | '520': [False, 'Web server is returning an unknown error'], 55 | '522': [False, 
'Connection timed out'], 56 | '524': [False, 'A timeout occurred'], 57 | } 58 | 59 | 60 | class TestDownloaderBasics(unittest.TestCase): 61 | """docstring for TestDownloaderBasics""" 62 | def test_200(self): 63 | """test 200 status code""" 64 | dlm = downloader.BaseDownloader() 65 | self.assertTrue(dlm.download('http://httpstat.us/200')) 66 | self.assertEqual(dlm.status_code, 200) 67 | 68 | def test_301(self): 69 | """redirection 70 | :returns: @todo 71 | 72 | """ 73 | dlm = downloader.BaseDownloader() 74 | self.assertTrue(dlm.download('http://httpstat.us/301')) 75 | self.assertEqual(200, dlm.status_code) 76 | self.assertEqual(dlm.last_url, 'http://httpstat.us') 77 | 78 | def test_404(self): 79 | """handling errors""" 80 | dlm = downloader.BaseDownloader() 81 | self.assertFalse(dlm.download('http://httpstat.us/404')) 82 | self.assertEqual(404, dlm.status_code) 83 | 84 | def test_404_web(self): 85 | """handling errors""" 86 | dlm = downloader.BaseDownloader() 87 | self.assertFalse(dlm.download('http://192.155.84.35/scraper/sd')) 88 | self.assertEqual(404, dlm.status_code) 89 | 90 | def test_timeout_fail(self): 91 | """handling errors""" 92 | dlm = downloader.BaseDownloader() 93 | dlm.timeout = 1 94 | self.assertFalse(dlm.download('http://httpstat.us/524')) 95 | self.assertEqual(524, dlm.status_code) 96 | 97 | def test_all_codes(self): 98 | """test with all possible status codes""" 99 | dlm = downloader.BaseDownloader() 100 | for code in CODES: 101 | info = CODES[code] 102 | url = 'http://httpstat.us/%s' % code 103 | self.assertEqual(info[0], dlm.download(url)) 104 | if int(code) >= 400: 105 | self.assertEqual(int(code), dlm.status_code) 106 | 107 | def test_cached_downloader(self): 108 | """@todo: Docstring for test_cached_downloader. 109 | :returns: @todo 110 | 111 | """ 112 | url = 'http://example.com/' 113 | filename = utils.hash(url) 114 | fullpath = utils.file_cached_path(filename, url) 115 | dlm = downloader.CachedDownloader() 116 | dlm.download(url) 117 | import os 118 | self.assertTrue(os.path.exists(fullpath), "cache path exists") 119 | 120 | 121 | def main(): 122 | unittest.main() 123 | -------------------------------------------------------------------------------- /pages.py: -------------------------------------------------------------------------------- 1 | from lxml.etree import XMLSyntaxError 2 | from lxml.html.clean import Cleaner 3 | from lxml import html 4 | try: 5 | import libs.utils as utils 6 | except ImportError: 7 | import utils 8 | 9 | 10 | def clean_dom(dom): 11 | """get rids of script, style and comments""" 12 | cleaner = Cleaner() 13 | cleaner.script = True 14 | cleaner.style = True 15 | cleaner.comments = True 16 | return cleaner.clean_html(dom) 17 | 18 | 19 | def load_dom(content, remove_br): 20 | """loads the content 21 | 22 | :content: html 23 | :remove_br: should remove
<br>
tags? 24 | :returns: dom 25 | 26 | """ 27 | if remove_br: 28 | content = utils.remove_br(content) 29 | dom = html.fromstring(content) 30 | return Dom(dom) 31 | 32 | 33 | class BasePage(object): 34 | """result of downloads are stored here""" 35 | def __init__(self): 36 | super(BasePage, self).__init__() 37 | self.url = None 38 | self.post = None 39 | self.state = False 40 | self.load_time = None 41 | 42 | def set_url(self, url): 43 | """set url 44 | 45 | :url: @todo 46 | :returns: @todo 47 | 48 | """ 49 | self.url = url 50 | return self 51 | 52 | def set_post(self, post): 53 | """set post 54 | 55 | :url: @todo 56 | :returns: @todo 57 | 58 | """ 59 | self.post = post 60 | return self 61 | 62 | def set_load_time(self, load_time): 63 | """sets time took to load the page""" 64 | self.load_time = load_time 65 | return self 66 | 67 | 68 | class DownloadedPage(BasePage): 69 | """store page Information""" 70 | def __init__(self): 71 | super(DownloadedPage, self).__init__() 72 | self.url = None 73 | self.post = None 74 | self.redirected_to = None 75 | self.status_code = None 76 | self.text = None 77 | self.raw_text = None 78 | self.dom = None 79 | 80 | def get_dom(self, remove_br=False): 81 | """returns dom""" 82 | content = self.text 83 | tried_non_unicode = False 84 | while True: 85 | try: 86 | return load_dom(content, remove_br) 87 | except ValueError: 88 | if tried_non_unicode is True: 89 | break 90 | tried_non_unicode = True 91 | content = self.raw_text 92 | except XMLSyntaxError: 93 | break 94 | return None 95 | 96 | def set_redirected_to_url(self, redirected_to): 97 | """set last url set in response, is useful for redirected webpages""" 98 | self.redirected_to = redirected_to 99 | return self 100 | 101 | def set_status_code(self, status_code): 102 | """sets status code 103 | 104 | :status_code: @todo 105 | :returns: @todo 106 | 107 | """ 108 | self.status_code = status_code 109 | self.state = self.status_code < 400 110 | return self 111 | 112 | def set_state(self, state): 113 | """set state 114 | 115 | :state: @todo 116 | :returns: @todo 117 | 118 | """ 119 | self.state = state 120 | return self 121 | 122 | def set_text(self, text, raw_text=None): 123 | """set text values 124 | 125 | :text: @todo 126 | :returns: @todo 127 | 128 | """ 129 | self.text = text 130 | self.raw_text = raw_text 131 | return self 132 | 133 | 134 | class Dom(object): 135 | """dom helper, 136 | 137 | incase we have to switch to beautifulsoup parser 138 | """ 139 | 140 | def __init__(self, dom): 141 | super(Dom, self).__init__() 142 | self.dom = dom 143 | 144 | def first(self, xpath): 145 | """gets the first element from the result""" 146 | elist = self.xpath(xpath) 147 | try: 148 | return elist[0] 149 | except IndexError: 150 | return None 151 | 152 | def attr(self, xpath, attr): 153 | """get [attr] of element at [index] from the result""" 154 | elm = self.first(xpath) 155 | try: 156 | return elm.attrib[attr] 157 | except (KeyError, IndexError, AttributeError): 158 | return None 159 | 160 | def text(self, xpath, index=0): 161 | """get text of element at [index] from the result""" 162 | elist = self.xpath(xpath) 163 | try: 164 | return elist[index].text_content() 165 | except IndexError: 166 | return None 167 | 168 | def xpath(self, xpath): 169 | """use xpath 170 | 171 | :xpath: @todo 172 | :returns: @todo 173 | 174 | """ 175 | return self.dom.xpath(xpath) 176 | 177 | def make_links_absolute(self, link): 178 | """calls make_links_absolute 179 | :returns: @todo 180 | 181 | """ 182 | self.dom.make_links_absolute(link) 183 
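# A minimal usage sketch of the helpers above (the sample HTML and the
# expected values are illustrative):
if __name__ == '__main__':
    page = DownloadedPage().set_text(
        '<html><body><h1>Hi</h1><p><a href="/x">go</a></p></body></html>')
    dom = page.get_dom()
    print(dom.text('//h1'))         # 'Hi'
    print(dom.attr('//a', 'href'))  # '/x'
    dom.make_links_absolute('http://example.com/')
    print(dom.attr('//a', 'href'))  # 'http://example.com/x'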
| -------------------------------------------------------------------------------- /test_downloader.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from downloader import BaseDownloader, CachedDownloader 3 | from downloader import curl_factory 4 | try: 5 | import libs.utils as utils 6 | import libs.pages as pages 7 | except ImportError: 8 | import utils 9 | import pages 10 | 11 | 12 | class TestDownloaderBasics(unittest.TestCase): 13 | """docstring for TestDownloaderBasics""" 14 | def setUp(self): 15 | """clear cache folder 16 | """ 17 | try: 18 | utils.delete_folder_content('cache/example.com') 19 | except Exception: 20 | pass 21 | 22 | def test_200(self): 23 | """test 200 status code""" 24 | dlm = BaseDownloader() 25 | dlm.download_with = USE_DOWNLOADER 26 | page = pages.DownloadedPage().set_url('http://httpstat.us/200') 27 | dlm.download(page) 28 | self.assertTrue(page.state) 29 | self.assertEqual(page.status_code, 200) 30 | 31 | def test_301(self): 32 | """redirection 33 | :returns: @todo 34 | 35 | """ 36 | dlm = BaseDownloader() 37 | dlm.download_with = USE_DOWNLOADER 38 | page = pages.DownloadedPage().set_url('http://httpstat.us/301') 39 | dlm.download(page) 40 | self.assertTrue(page.state) 41 | self.assertEqual(200, page.status_code) 42 | self.assertEqual(page.last_url, 'http://httpstat.us') 43 | 44 | def test_404(self): 45 | """handling errors""" 46 | dlm = BaseDownloader() 47 | dlm.download_with = USE_DOWNLOADER 48 | page = pages.DownloadedPage().set_url('http://httpstat.us/404') 49 | dlm.download(page) 50 | self.assertFalse(page.state) 51 | self.assertEqual(404, page.status_code) 52 | 53 | def test_404_web(self): 54 | """handling errors""" 55 | dlm = BaseDownloader() 56 | dlm.download_with = USE_DOWNLOADER 57 | page = pages.DownloadedPage().set_url('http://192.155.84.35/scraper/sd') 58 | dlm.download(page) 59 | self.assertFalse(page.state) 60 | self.assertEqual(404, page.status_code) 61 | 62 | def test_timeout_fail(self): 63 | """handling errors""" 64 | dlm = BaseDownloader() 65 | dlm.download_with = USE_DOWNLOADER 66 | dlm.timeout = 1 67 | page = pages.DownloadedPage().set_url('http://httpstat.us/524') 68 | dlm.download(page) 69 | self.assertFalse(page.state) 70 | self.assertEqual(524, page.status_code) 71 | 72 | def test_all_codes(self): 73 | """test with all possible status codes""" 74 | dlm = BaseDownloader() 75 | dlm.download_with = USE_DOWNLOADER 76 | for code in CODES: 77 | info = CODES[code] 78 | url = 'http://httpstat.us/%s' % code 79 | page = pages.DownloadedPage().set_url(url) 80 | dlm.download(page) 81 | self.assertEqual(info[0], page.state) 82 | if int(code) >= 400: 83 | self.assertEqual(int(code), page.status_code) 84 | 85 | def test_dom(self): 86 | """test dom parsing and querying 87 | :returns: @todo 88 | 89 | """ 90 | dlm = BaseDownloader() 91 | dlm.download_with = USE_DOWNLOADER 92 | page = pages.DownloadedPage().set_url('http://example.com') 93 | dlm.download(page) 94 | dom = page.get_dom() 95 | result = dom.xpath('//h1') 96 | self.assertEqual(1, len(result)) 97 | self.assertEqual('Example Domain', result[0].text_content().strip()) 98 | self.assertEqual('More information...', dom.text('//a')) 99 | self.assertEqual('Example Domain', dom.first('//h1').text_content()) 100 | self.assertEqual('More information...', dom.text('//p', 1)) 101 | self.assertEqual("http://www.iana.org/domains/example", 102 | dom.attr('//a', 'href')) 103 | 104 | def test_cached_page(self): 105 | """test run cached page class""" 
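        # (First call downloads and writes the cache file derived from the
        #  url via utils.get_cache_full_path(); reruns are then served from
        #  cache -- setUp() clears cache/example.com to keep this test fresh.)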
106 | dlm = CachedDownloader() 107 | dlm.download_with = USE_DOWNLOADER 108 | page = pages.DownloadedPage().set_url('http://example.com') 109 | dlm.download(page) 110 | dom = page.get_dom() 111 | result = dom.xpath('//h1') 112 | self.assertEqual(1, len(result)) 113 | self.assertEqual('Example Domain', result[0].text_content().strip()) 114 | self.assertEqual('More information...', dom.text('//a')) 115 | self.assertEqual('Example Domain', dom.first('//h1').text_content()) 116 | self.assertEqual('More information...', dom.text('//p', 1)) 117 | self.assertEqual("http://www.iana.org/domains/example", 118 | dom.attr('//a', 'href')) 119 | 120 | def test_broken_html(self): 121 | """test on how to handle broken html files""" 122 | broken_html = """Hello</head><body onload=crash()> 123 | Hi all<p><a href="google.com">google</a>""" 124 | page = pages.DownloadedPage().set_text(broken_html) 125 | dom = page.get_dom() 126 | self.assertEqual(dom.first('//title').text_content(), 'Hello') 127 | self.assertEqual(dom.attr('//a', 'href'), 'google.com') 128 | self.assertEqual(dom.text('//a'), 'google') 129 | 130 | # 131 | # {{{ 132 | CODES = { 133 | '200': [True, 'OK'], 134 | '201': [True, 'Created'], 135 | '202': [True, 'Accepted'], 136 | '203': [True, 'Non-Authoritative Information'], 137 | '204': [True, 'No Content'], 138 | '205': [True, 'Reset Content'], 139 | '206': [True, 'Partial Content'], 140 | '300': [True, 'Multiple Choices'], 141 | '301': [True, 'Moved Permanently'], 142 | '302': [True, 'Found'], 143 | '303': [True, 'See Other'], 144 | '304': [True, 'Not Modified'], 145 | '305': [True, 'Use Proxy'], 146 | '306': [True, 'Unused'], 147 | '307': [True, 'Temporary Redirect'], 148 | '308': [True, 'Permanent Redirect'], 149 | '400': [False, 'Bad Request'], 150 | '401': [False, 'Unauthorized'], 151 | '402': [False, 'Payment Required'], 152 | '403': [False, 'Forbidden'], 153 | '404': [False, 'Not Found'], 154 | '405': [False, 'Method Not Allowed'], 155 | '406': [False, 'Not Acceptable'], 156 | '407': [False, 'Proxy Authentication Required'], 157 | '408': [False, 'Request Timeout'], 158 | '409': [False, 'Conflict'], 159 | '410': [False, 'Gone'], 160 | '411': [False, 'Length Required'], 161 | '412': [False, 'Precondition Required'], 162 | '413': [False, 'Request Entry Too Large'], 163 | '414': [False, 'Request-URI Too Long'], 164 | '415': [False, 'Unsupported Media Type'], 165 | '416': [False, 'Requested Range Not Satisfiable'], 166 | '417': [False, 'Expectation Failed'], 167 | '418': [False, "I'm a teapot"], 168 | '422': [False, 'Unprocessable Entity'], 169 | '428': [False, 'Precondition Required'], 170 | '429': [False, 'Too Many Requests'], 171 | '431': [False, 'Request Header Fields Too Large'], 172 | '451': [False, 'Unavailable For Legal Reasons'], 173 | '500': [False, 'Internal Server Error'], 174 | '501': [False, 'Not Implemented'], 175 | '502': [False, 'Bad Gateway'], 176 | '503': [False, 'Service Unavailable'], 177 | '504': [False, 'Gateway Timeout'], 178 | '505': [False, 'HTTP Version Not Supported'], 179 | '511': [False, 'Network Authentication Required'], 180 | '520': [False, 'Web server is returning an unknown error'], 181 | '522': [False, 'Connection timed out'], 182 | '524': [False, 'A timeout occurred'], 183 | } 184 | # }}} 185 | # 186 | 187 | 188 | def main(): 189 | """entry point""" 190 | logger = utils.setup_logger() 191 | logger.info('### start testing ###') 192 | unittest.main() 193 | 194 | 195 | if __name__ == '__main__': 196 | USE_DOWNLOADER = curl_factory 197 | # USE_DOWNLOADER = 
request_factory 198 | main() 199 | -------------------------------------------------------------------------------- /mysql.py: -------------------------------------------------------------------------------- 1 | try: 2 | from libs.config import Config 3 | from libs.dbbase import DBBase 4 | except ImportError: 5 | # pylint: disable=relative-import 6 | from config import Config 7 | from dbbase import DBBase 8 | import MySQLdb 9 | import logging 10 | import unittest 11 | 12 | 13 | def make_columns(data): 14 | """make columns for data 15 | 16 | :data: @todo 17 | :returns: @todo 18 | 19 | """ 20 | return ', '.join(['%%(%s)s' % key for key in data.keys()]) 21 | 22 | 23 | def dict_factory(cursor, row): 24 | """ 25 | dict factory for mysql row 26 | """ 27 | dest = {} 28 | for idx, col in enumerate(cursor.description): 29 | dest[col[0]] = row[idx] 30 | return dest 31 | 32 | 33 | class MySQL(DBBase): 34 | """ 35 | MySQL driver 36 | """ 37 | 38 | cfg = None 39 | logger = None 40 | 41 | """ stores data in a MySQL table """ 42 | def __init__(self): 43 | super(MySQL, self).__init__() 44 | MySQL.cfg = Config() 45 | txt = '{}.mysql'.format(MySQL.cfg.g('logger.base')) 46 | MySQL.logger = logging.getLogger(txt) 47 | self.prep_char = '?' 48 | self.dbc = None 49 | self.lastid = None 50 | self.dbhost = MySQL.cfg.g('db.mysql.host') 51 | self.user = MySQL.cfg.g('db.mysql.user') 52 | self.pswd = MySQL.cfg.g('db.mysql.pass') 53 | self.dbname = MySQL.cfg.g('db.mysql.database') 54 | self.connect() 55 | 56 | def connect(self): 57 | """ 58 | connects to database 59 | """ 60 | try: 61 | self.dbc.close() 62 | except AttributeError: 63 | pass 64 | self.dbc = MySQLdb.connect(self.dbhost, self.user, 65 | self.pswd, self.dbname, charset='utf8', 66 | use_unicode=True) 67 | self.dbc.set_character_set('utf8') 68 | dbc = self.dbc.cursor() 69 | dbc.execute('SET NAMES utf8;') 70 | dbc.execute('SET CHARACTER SET utf8;') 71 | dbc.execute('SET character_set_connection=utf8;') 72 | 73 | def close(self): 74 | """ 75 | closes the database, don't use it, 76 | close database directly by self.dbc.close() 77 | """ 78 | self.dbc.close() 79 | 80 | def clear_database(self, table): 81 | """ 82 | clears given table 83 | """ 84 | self.query("delete from %s" % table) 85 | 86 | def safe_query(self, qtpl, data): 87 | """Executed binding query 88 | ex: select * from table where q=:s, d=:k 89 | 90 | :query: @todo 91 | :data: @todo 92 | :returns: @todo 93 | 94 | """ 95 | retries = 0 96 | while True: 97 | try: 98 | return self.do_query(qtpl, data) 99 | except MySQLdb.MySQLError as err: 100 | if err[0] == 1062: 101 | return -2 102 | self.connect() 103 | retries += 1 104 | if retries > 5: 105 | MySQL.logger.exception('Failed to execute query') 106 | return None 107 | 108 | def make_condition(self, cond, col, col_name): 109 | """builds appropiate query 110 | 111 | :cond: @todo 112 | :col: @todo 113 | :col: @todo 114 | :returns: @todo 115 | 116 | """ 117 | return '%s %s=%%(%s)s' % (cond, col, col_name) 118 | 119 | def query(self, query): 120 | """ 121 | Runs a query in unsafe way 122 | """ 123 | try: 124 | return self._query(query) 125 | except MySQLdb.OperationalError: 126 | return None 127 | 128 | def append_data(self, data, table, pkey=None): 129 | """ 130 | adds row to database 131 | """ 132 | qfields = make_columns(data) 133 | cols = ', '.join(data.keys()) 134 | query = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, qfields) 135 | return self.execute_query(data, query) 136 | 137 | def append_all_data(self, data, table): 138 | """adds multiple rows, 
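        (each row is a dict with the same keys as data[0])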
139 | 140 | tries in single query first 141 | uses multiple queries if fails 142 | """ 143 | qfields = make_columns(data[0]) 144 | cols = ', '.join(data[0].keys()) 145 | query = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, qfields) 146 | state = self.execute_query(data, query, True) 147 | if state == -2: 148 | for row in data: 149 | self.append_data(row, table) 150 | else: 151 | return state 152 | return True 153 | 154 | def execute_query(self, data, query, many=False): 155 | """execute query 156 | 157 | :data: @todo 158 | :table: @todo 159 | :many: @todo 160 | :returns: @todo 161 | 162 | """ 163 | # pylint: disable=broad-except, no-member 164 | retries = 0 165 | cur = None 166 | try: 167 | while True: 168 | try: 169 | cur = self.dbc.cursor() 170 | if many: 171 | status = cur.executemany(query, data) 172 | else: 173 | status = cur.execute(query, data) 174 | try: 175 | self.lastid = cur.insert_id() 176 | except AttributeError: 177 | self.lastid = cur.lastrowid 178 | except Exception: 179 | self.lastid = cur.lastrowid 180 | MySQL.logger.exception("ignorable") 181 | self.dbc.commit() 182 | return status 183 | except MySQLdb.MySQLError as err: 184 | if err[0] == 1062: 185 | return -2 186 | MySQL.logger.exception(err) 187 | MySQL.logger.info('reconnecting ... ') 188 | self.connect() 189 | retries += 1 190 | if retries > 5: 191 | MySQL.logger.exception('Failed to execute query') 192 | return None 193 | continue 194 | except Exception as exp: 195 | MySQL.logger.exception('failed inserting data') 196 | self.lastid = None 197 | raise exp 198 | finally: 199 | if cur: 200 | cur.close() 201 | 202 | 203 | class TestMySQL(unittest.TestCase): 204 | """docstring for TestMySQL""" 205 | 206 | def test_inserts(self): 207 | """test insert queries 208 | :returns: @todo 209 | 210 | """ 211 | dbc.append_data({'name': 'gmail.com', 'si': 10}, 'tests') 212 | dbc.append_data({'name': 'inbox.com', 'si': 12}, 'tests') 213 | dbc.append_data({'name': 'reddit.com', 'si': 1}, 'tests') 214 | dbc.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 215 | dbc.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 216 | dbc.query('insert into tests (name, si) values("google.com", 10)') 217 | self.assertEqual(1, 1) 218 | 219 | def test_queries(self): 220 | """test select queries 221 | :returns: @todo 222 | 223 | """ 224 | result = dbc.select('tests', ['name||sgmail.com']) 225 | self.assertEqual(0, len(result.fetchall())) 226 | result = dbc.select('tests', ['name||gmail.com']) 227 | self.assertEqual(1, len(result.fetchall())) 228 | result = dbc.select('tests', ['si||2', 'si|or|12']) 229 | self.assertEqual(3, len(result.fetchall())) 230 | result = dbc.select('tests', ['name||gmail.com', 'name|or|inbox.com']) 231 | self.assertEqual(2, len(result.fetchall())) 232 | result = dbc.select('tests', ['name||reddit.com'], 'count(*)') 233 | self.assertEqual(3, result.fetchone()[0]) 234 | result = dbc.select('tests', at_end='order by si') 235 | result = dbc.select('tests', ['name||reddit.com'], 'count(*)', 236 | at_end='group by si') 237 | 238 | 239 | def main(): 240 | """ 241 | do some tests 242 | """ 243 | try: 244 | dbc.query('drop table if exists tests') 245 | dbc.query('create table tests(name varchar(20), si integer)') 246 | # pylint: disable=no-member 247 | except MySQLdb.OperationalError: 248 | pass 249 | unittest.main() 250 | 251 | 252 | if __name__ == '__main__': 253 | dbc = MySQL() 254 | main() 255 | -------------------------------------------------------------------------------- /pgsql.py: 
-------------------------------------------------------------------------------- 1 | try: 2 | from libs.config import Config 3 | from libs.dbbase import DBBase 4 | except ImportError: 5 | # pylint: disable=relative-import 6 | from config import Config 7 | from dbbase import DBBase 8 | import psycopg2 9 | import logging 10 | import unittest 11 | 12 | 13 | def make_columns(data): 14 | """make columns for data 15 | 16 | :data: dictonary containing column name (key) and value (not used) 17 | :returns: @todo 18 | 19 | """ 20 | return ', '.join(['%%(%s)s' % key for key in data.keys()]) 21 | 22 | 23 | class PGSql(DBBase): 24 | """ stores data in a PGSql table """ 25 | 26 | cfg = None 27 | logger = None 28 | 29 | def __init__(self): 30 | super(PGSql, self).__init__() 31 | PGSql.cfg = Config() 32 | txt = '{}.pgsql'.format(PGSql.cfg.g('logger.base')) 33 | PGSql.logger = logging.getLogger(txt) 34 | self.prep_char = '?' 35 | self.lastid = None 36 | self.dbhost = PGSql.cfg.g('db.pgsql.host') 37 | self.user = PGSql.cfg.g('db.pgsql.user') 38 | self.pswd = PGSql.cfg.g('db.pgsql.pass') 39 | self.dbname = PGSql.cfg.g('db.pgsql.database') 40 | self.dbc = self.connect() 41 | 42 | def connect(self): 43 | """ 44 | connects to database 45 | """ 46 | try: 47 | return psycopg2.connect(host=self.dbhost, user=self.user, 48 | password=self.pswd, dbname=self.dbname) 49 | except AttributeError: 50 | pass 51 | 52 | # pylint: disable=no-self-use 53 | def close(self): 54 | """ 55 | closes the database, don't use it, 56 | close database directly by self.dbc.close() 57 | """ 58 | self.dbc.close() 59 | 60 | def clear_database(self, table): 61 | """ 62 | clears given table 63 | """ 64 | self.query("delete from %s" % table) 65 | self.dbc.commit() 66 | 67 | def make_condition(self, cond, col, col_name): 68 | """builds appropiate query 69 | 70 | :cond: @todo 71 | :col: @todo 72 | :col: @todo 73 | :returns: @todo 74 | 75 | """ 76 | return '%s %s=%%(%s)s' % (cond, col, col_name) 77 | 78 | def reconnect(self): 79 | """reconnects persistant connection 80 | :returns: @todo 81 | 82 | """ 83 | PGSql.logger.info("reconnecting") 84 | self.dbc.close() 85 | self.dbc = self.connect() 86 | 87 | def query(self, query): 88 | """Runs a query in unsafe way 89 | """ 90 | try: 91 | if self.requires_commit(query) is False: 92 | return self._query(query) 93 | with self.connect() as conn: 94 | return self._query(query, conn=conn) 95 | except psycopg2.Error: 96 | return None 97 | 98 | def safe_query(self, qtpl, data, conn=None, retries=0): 99 | """Executed binding query 100 | ex: select * from table where q=%s, d=%s 101 | 102 | :query: @todo 103 | :data: @todo 104 | :returns: @todo 105 | 106 | """ 107 | try: 108 | if self.requires_commit(qtpl) is False: 109 | return self.do_query(qtpl, data) 110 | with self.connect() as conn: 111 | return self.do_query(qtpl, data, conn=conn) 112 | except psycopg2.IntegrityError: 113 | self._query('rollback') 114 | PGSql.logger.debug("IntegrityError: %s", qtpl) 115 | return -2 116 | except (psycopg2.InterfaceError, psycopg2.OperationalError, 117 | psycopg2.DatabaseError): 118 | PGSql.logger.debug('closed, reconnecting') 119 | self.reconnect() 120 | retries += 1 121 | if retries > 5: 122 | PGSql.logger.exception("Failed to execute_query") 123 | return None 124 | self.safe_query(qtpl, data, conn, retries=retries) 125 | except psycopg2.Error: 126 | PGSql.logger.exception('Failed: %s', qtpl) 127 | return None 128 | 129 | def append_data(self, data, table, pkey='id'): 130 | """adds row to database 131 | 132 | :data: data to 
be saved 133 | :table: name of the table 134 | :pk: NEED to provide correct pk (primary key) column, to get last insert id 135 | """ 136 | qfields = make_columns(data) 137 | cols = ', '.join(data.keys()) 138 | query = "INSERT INTO %s (%s) VALUES (%s) RETURNING %s"\ 139 | % (table, cols, qfields, pkey) 140 | return self.execute_query(data, query) 141 | 142 | def append_all_data(self, data, table): 143 | """adds multiple rows, 144 | 145 | tries in single query first 146 | uses multiple queries if fails 147 | """ 148 | qfields = make_columns(data[0]) 149 | cols = ', '.join(data[0].keys()) 150 | query = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, qfields) 151 | state = self.execute_query(data, query, True) 152 | if state == -2 or state == -3: 153 | cnt = 0 154 | for row in data: 155 | if self.append_data(row, table): 156 | cnt += 1 157 | return cnt 158 | else: 159 | return state 160 | 161 | def execute_query(self, data, query, many=False): 162 | """execute query 163 | 164 | :data: data to be saved 165 | :table: name of the table 166 | :many: multiple rows to be inserted or not 167 | :returns: True or None 168 | 169 | """ 170 | with self.connect() as conn: 171 | cur = None 172 | try: 173 | cur = conn.cursor() 174 | if many: 175 | cur.executemany(query, data) 176 | else: 177 | cur.execute(query, data) 178 | self.lastid = cur.fetchone()[0] 179 | return True 180 | except psycopg2.IntegrityError as iexp: 181 | PGSql.logger.debug("duplicate %s %s", query, iexp) 182 | return -2 183 | except psycopg2.DataError as err: 184 | PGSql.logger.debug("data error %s, %s", query, err) 185 | return -3 186 | except psycopg2.Error: 187 | PGSql.logger.exception("%s %s", query, data) 188 | return None 189 | 190 | 191 | class TestSQLITE(unittest.TestCase): 192 | """docstring for TestSQLITE""" 193 | 194 | def test_inserts(self): 195 | """test insert queries 196 | :returns: @todo 197 | 198 | """ 199 | dbc.append_data({'name': 'gmail.com', 'si': 10}, 'tests') 200 | dbc.append_data({'name': 'inbox.com', 'si': 12}, 'tests') 201 | dbc.append_data({'name': 'reddit.com', 'si': 1}, 'tests') 202 | dbc.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 203 | dbc.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 204 | dbc.query('insert into tests (name, si) values("google.com", 10)') 205 | self.assertEqual(1, 1) 206 | 207 | def test_queries(self): 208 | """test select queries 209 | :returns: @todo 210 | 211 | """ 212 | # TODO: test for duplicate entries 213 | result = dbc.select('tests', ['name||sgmail.com']) 214 | self.assertEqual(0, len(result.fetchall())) 215 | result = dbc.select('tests', ['name||gmail.com']) 216 | self.assertEqual(1, len(result.fetchall())) 217 | result = dbc.select('tests', ['si||2', 'si|or|12']) 218 | self.assertEqual(3, len(result.fetchall())) 219 | result = dbc.select('tests', ['name||gmail.com', 'name|or|inbox.com']) 220 | self.assertEqual(2, len(result.fetchall())) 221 | result = dbc.select('tests', ['name||reddit.com'], 'count(*)') 222 | self.assertEqual(3, result.fetchone()[0]) 223 | result = dbc.select('tests', at_end='order by si') 224 | result = dbc.select('tests', ['name||reddit.com'], 'count(*)', 225 | at_end='group by si') 226 | 227 | 228 | def main(): 229 | """ 230 | do some tests 231 | """ 232 | try: 233 | dbc.query('drop table if exists tests') 234 | # NOTE: better to use CREATE SEQUENCE <table>_id_seq than serial 235 | dbc.query('create table tests(id SERIAL, name varchar(20), si integer)') 236 | except Exception: 237 | pass 238 | unittest.main() 239 | 240 | 241 | if __name__ 
== '__main__': 242 | dbc = PGSql() 243 | main() 244 | -------------------------------------------------------------------------------- /sqlite.py: -------------------------------------------------------------------------------- 1 | """ 2 | sqlite driver 3 | """ 4 | import sqlite3 as sqlite 5 | from libs.config import Config 6 | from libs.dbbase import DBBase 7 | import logging 8 | import unittest 9 | 10 | 11 | def dict_factory(cursor, row): 12 | """ 13 | conver row to dict 14 | """ 15 | data = {} 16 | for idx, col in enumerate(cursor.description): 17 | data[col[0]] = row[idx] 18 | return data 19 | 20 | 21 | def make_columns(data): 22 | """makes column for sqlite 23 | 24 | :data: @todo 25 | :returns: @todo 26 | 27 | """ 28 | return ', '.join([':%s' % key for key in data.keys()]) 29 | 30 | 31 | # pylint: disable=too-many-instance-attributes 32 | class SQLite(DBBase): 33 | """ stores data in a sqlite table """ 34 | 35 | cfg = None 36 | logger = None 37 | 38 | def __init__(self, dbname=None, lazy_commit=False): 39 | super(SQLite, self).__init__() 40 | SQLite.cfg = Config() 41 | txt = '{}.sqlite'.format(SQLite.cfg.g('logger.base')) 42 | SQLite.logger = logging.getLogger(txt) 43 | self.dbname = dbname if dbname != None else SQLite.cfg.g('db.sqlite.file') 44 | self.timeout = SQLite.cfg.g('db.sqlite.timeout') 45 | self.query_queued = 0 46 | self.lastid = None 47 | strd = SQLite.cfg.g('db.sqlite.same_thread', 0) 48 | if strd == 0: 49 | self.same_thread = False 50 | else: 51 | self.same_thread = True 52 | self.connect() 53 | self.set_lazy_commit(lazy_commit) 54 | 55 | def set_lazy_commit(self, val): 56 | """enables lazy_commit 57 | :returns: @todo 58 | 59 | """ 60 | self.lazy_commit = val 61 | if self.lazy_commit: 62 | self.commit_func = self.should_commit_lazy 63 | self.query_queued = 0 64 | else: 65 | self.commit_func = self.should_commit 66 | 67 | def connect(self): 68 | """ 69 | connects to db 70 | 71 | """ 72 | self.dbc = sqlite.connect(self.dbname, self.timeout, 73 | check_same_thread=self.same_thread) 74 | 75 | def use_dict(self): 76 | """ 77 | use dictionary for rows 78 | """ 79 | self.dbc.row_factory = dict_factory 80 | 81 | def use_tuple(self): 82 | """ 83 | use tuple for row 84 | """ 85 | self.dbc.row_factory = sqlite.Row 86 | 87 | def close(self): 88 | """ 89 | close database 90 | """ 91 | self.dbc.close() 92 | 93 | def clear_database(self, table): 94 | """ 95 | clear table 96 | """ 97 | self.query("delete from %s" % table) 98 | 99 | def safe_query(self, qtpl, data): 100 | """Executed binding query 101 | ex: select * from table where q=:s, d=:k 102 | 103 | :query: @todo 104 | :data: @todo 105 | :commit: @todo 106 | :returns: @todo 107 | 108 | """ 109 | try: 110 | return self.do_query(qtpl, data) 111 | except sqlite.OperationalError: 112 | SQLite.logger.exception("query failed %s", qtpl) 113 | return None 114 | 115 | def make_condition(self, cond, col, col_name): 116 | """@todo: Docstring for make_condition. 
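        builds one "<cond> <col>=:<col_name>" sql fragment for select(),
        e.g. make_condition('or', 'name', 'name_1') -> 'or name=:name_1'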
117 | 118 | :cond: @todo 119 | :col: @todo 120 | :returns: @todo 121 | 122 | """ 123 | return '%s %s=:%s' % (cond, col, col_name) 124 | 125 | def query(self, query): 126 | try: 127 | return self._query(query) 128 | except sqlite.OperationalError: 129 | SQLite.logger.exception("query failed %s", query) 130 | return None 131 | 132 | def count_rows(self, query): 133 | res = self.query(query) 134 | result = res.fetchone() 135 | return result[0] 136 | 137 | def append_data(self, data, table): 138 | """ 139 | add rows to database 140 | """ 141 | qfields = make_columns(data) 142 | cols = ', '.join(data.keys()) 143 | sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, qfields) 144 | return self.execute_query(data, sql) 145 | 146 | # pylint: disable=unused-argument 147 | def execute_query(self, data, query, many=False): 148 | """executes query 149 | 150 | :data: used data 151 | :query: query to execute 152 | :many: dummy param 153 | :returns: negative on error 154 | 155 | """ 156 | cur = None 157 | self.lastid = None 158 | try: 159 | cur = self.dbc.cursor() 160 | status = cur.execute(query, data) 161 | self.commit_func(query) 162 | try: 163 | self.lastid = cur.insert_id() 164 | except AttributeError: 165 | self.lastid = cur.lastrowid 166 | return status 167 | except sqlite.IntegrityError as sie: 168 | SQLite.logger.debug('IntegrityError: %s %s %s', sie, query, data) 169 | return -2 170 | except sqlite.DatabaseError as dbe: 171 | SQLite.logger.debug('DatabaseError: %s %s %s', dbe, query, data) 172 | return -4 173 | except sqlite.OperationalError as oie: 174 | SQLite.logger.debug('OperationalError %s', oie) 175 | return -3 176 | finally: 177 | if cur: 178 | cur.close() 179 | 180 | def append_all_data(self, data, table): 181 | """ 182 | append at once 183 | 184 | """ 185 | for row in data: 186 | self.append_data(row, table) 187 | self.dbc.commit() 188 | self.query_queued = 0 189 | 190 | def should_commit_lazy(self, query): 191 | """override for should_commit 192 | 193 | :query: @todo 194 | :returns: @todo 195 | 196 | """ 197 | self.query_queued += 1 198 | if self.query_queued >= 30: 199 | self.should_commit(query) 200 | self.query_queued = 0 201 | 202 | def force_commit(self): 203 | """forces to commit 204 | :returns: @todo 205 | 206 | """ 207 | self.query_queued = 0 208 | self.dbc.commit() 209 | 210 | 211 | class TestSQLITE(unittest.TestCase): 212 | """docstring for TestSQLITE""" 213 | 214 | def test_inserts(self): 215 | """test insert queries 216 | :returns: @todo 217 | 218 | """ 219 | db.append_data({'name': 'gmail.com', 'si': 10}, 'tests') 220 | db.append_data({'name': 'inbox.com', 'si': 12}, 'tests') 221 | db.append_data({'name': 'reddit.com', 'si': 1}, 'tests') 222 | db.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 223 | db.append_data({'name': 'reddit.com', 'si': 2}, 'tests') 224 | db.query('insert into tests (name, si) values("google.com", 10)') 225 | cnt = db.count_rows('select count(*) rows from tests') 226 | self.assertEqual(cnt, 6) 227 | 228 | def test_queries(self): 229 | """test select queries 230 | 231 | :returns: @todo 232 | """ 233 | result = db.select('tests', ['name||sgmail.com']) 234 | self.assertEqual(0, len(result.fetchall())) 235 | result = db.select('tests', ['name||gmail.com']) 236 | self.assertEqual(1, len(result.fetchall())) 237 | result = db.select('tests', ['si||2', 'si|or|12']) 238 | self.assertEqual(3, len(result.fetchall())) 239 | result = db.select('tests', ['name||gmail.com', 'name|or|inbox.com']) 240 | self.assertEqual(2, len(result.fetchall())) 241 | 
result = db.select('tests', ['name||reddit.com'], 'count(*)') 242 | self.assertEqual(3, result.fetchone()[0]) 243 | result = db.select('tests', ['name||reddit.com'], 'count(*)', 244 | at_end='group by si') 245 | 246 | def test_non_lazy_commit(self): 247 | """test with possible unique data 248 | :returns: @todo 249 | 250 | """ 251 | db.set_lazy_commit(False) 252 | for k in range(0, 1000): 253 | db.append_data({'name': 'email_%s.com' % k, 'si': k}, 'uniquetests') 254 | cnt = db.count_rows('select count(*) rows from uniquetests') 255 | self.assertEqual(1000, cnt) 256 | db.query('delete from uniquetests') 257 | db.dbc.commit() 258 | 259 | def test_lazy_commit(self): 260 | """test lazy commit 261 | 262 | """ 263 | db.set_lazy_commit(True) 264 | for k in range(0, 1000): 265 | db.append_data({'name': 'email_%s.com' % k, 'si': k}, 'uniquetests') 266 | cnt = db.count_rows('select count(*) rows from uniquetests') 267 | self.assertEqual(1000, cnt) 268 | 269 | 270 | def main(): 271 | """ 272 | test starts here 273 | """ 274 | try: 275 | unittest.main() 276 | # pylint: disable=broad-except 277 | except Exception: 278 | pass 279 | 280 | if __name__ == '__main__': 281 | import os 282 | if os.path.exists('db'): 283 | os.unlink('db') 284 | # pylint: disable=invalid-name 285 | db = SQLite() 286 | db.query('create table tests( name test, si integer)') 287 | db.query('create table uniquetests(name test unique, si integer)') 288 | main() 289 | if os.path.exists('db'): 290 | os.unlink('db') 291 | -------------------------------------------------------------------------------- /downloader.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | downloader 4 | """ 5 | import logging 6 | import requests 7 | import pycurl 8 | from io import BytesIO 9 | import time 10 | import random 11 | try: 12 | import libs.config as config 13 | import libs.utils as utils 14 | except ImportError: 15 | # pylint: disable=relative-import 16 | import config 17 | import utils 18 | 19 | 20 | USER_AGENT = 'Mozilla/5.0 Gecko/20120101 Firefox/40.0' 21 | SLEEP_AFTER = 10 22 | SLEEP = 3 23 | 24 | 25 | class Error(Exception): 26 | """handles exceptions""" 27 | def __init__(self, value=None): 28 | self.value = value 29 | 30 | def __str__(self): 31 | return repr(self.value) 32 | 33 | 34 | class RetryableError(Error): 35 | """docstring for ConnectionError""" 36 | def __init__(self, value=None): 37 | super(RetryableError, self).__init__() 38 | self.value = value 39 | 40 | 41 | class SSLError(Error): 42 | """docstring for SSLError""" 43 | def __init__(self, value=None): 44 | super(SSLError, self).__init__() 45 | self.value = value 46 | 47 | 48 | class ConnectionError(Error): 49 | """docstring for ConnectionError""" 50 | def __init__(self, value=None): 51 | super(ConnectionError, self).__init__() 52 | self.value = value 53 | 54 | 55 | def request_factory(page, proxy, headers, timeout, logger=None): 56 | """uses request to download""" 57 | logging.getLogger("requests").setLevel(logging.WARNING) 58 | try: 59 | with requests.Session() as session: 60 | session.headers.update(headers) 61 | if page.post != None: 62 | response = session.post(page.url, page.post, proxies=proxy, 63 | timeout=timeout) 64 | else: 65 | response = session.get(page.url, proxies=proxy, timeout=timeout) 66 | # download page and set response details 67 | page.set_text(response.text, response.content) \ 68 | .set_status_code(response.status_code) \ 69 | .set_redirected_to_url(response.url) 70 | except requests.exceptions.Timeout: 71 | 
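        # (RetryableError is caught by BaseDownloader._download below,
        #  which rotates to a fresh proxy and retries a few times before
        #  giving up.)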
-------------------------------------------------------------------------------- /downloader.py: --------------------------------------------------------------------------------
1 | """
2 | 
3 | downloader
4 | """
5 | import logging
6 | import requests
7 | import pycurl
8 | from io import BytesIO
9 | import time
10 | import random
11 | try:
12 |     import libs.config as config
13 |     import libs.utils as utils
14 | except ImportError:
15 |     # pylint: disable=relative-import
16 |     import config
17 |     import utils
18 | 
19 | 
20 | USER_AGENT = 'Mozilla/5.0 Gecko/20120101 Firefox/40.0'
21 | SLEEP_AFTER = 10
22 | SLEEP = 3
23 | 
24 | 
25 | class Error(Exception):
26 |     """base class for downloader exceptions"""
27 |     def __init__(self, value=None):
28 |         self.value = value
29 | 
30 |     def __str__(self):
31 |         return repr(self.value)
32 | 
33 | 
34 | class RetryableError(Error):
35 |     """raised when retrying (e.g. with another proxy) may succeed"""
36 |     def __init__(self, value=None):
37 |         super(RetryableError, self).__init__()
38 |         self.value = value
39 | 
40 | 
41 | class SSLError(Error):
42 |     """raised on SSL handshake/verification failures"""
43 |     def __init__(self, value=None):
44 |         super(SSLError, self).__init__()
45 |         self.value = value
46 | 
47 | 
48 | class ConnectionError(Error):
49 |     """raised when the connection itself fails; not worth retrying"""
50 |     def __init__(self, value=None):
51 |         super(ConnectionError, self).__init__()
52 |         self.value = value
53 | 
54 | 
55 | def request_factory(page, proxy, headers, timeout, logger=None):
56 |     """uses requests to download"""
57 |     logging.getLogger("requests").setLevel(logging.WARNING)
58 |     try:
59 |         with requests.Session() as session:
60 |             session.headers.update(headers)
61 |             if page.post is not None:
62 |                 response = session.post(page.url, page.post, proxies=proxy,
63 |                                         timeout=timeout)
64 |             else:
65 |                 response = session.get(page.url, proxies=proxy, timeout=timeout)
66 |             # download page and set response details
67 |             page.set_text(response.text, response.content) \
68 |                 .set_status_code(response.status_code) \
69 |                 .set_redirected_to_url(response.url)
70 |     except requests.exceptions.Timeout:
71 |         logger.error("Timed out: %s", page.url)
72 |         raise RetryableError('timed out')
73 |     except requests.packages.urllib3.exceptions.ReadTimeoutError:
74 |         logger.exception("%s", page.url)
75 |         raise RetryableError('read timed out')
76 |     except requests.exceptions.ProxyError:
77 |         logger.exception("%s", page.url)
78 |         raise RetryableError(proxy)
79 |     except requests.exceptions.SSLError:
80 |         logger.exception("%s", page.url)
81 |         raise SSLError()
82 |     except requests.exceptions.InvalidSchema:
83 |         logger.exception('Failed to parse: %s', page.url)
84 |         raise ConnectionError()
85 |     except requests.ConnectionError:
86 |         logger.exception('Failed to connect: %s', page.url)
87 |         raise ConnectionError()
88 | 
89 | 
90 | def curl_factory(page, proxy, headers, timeout, logger=None):
91 |     """uses curl to download"""
92 |     curl_headers = []
93 |     for key in headers:
94 |         curl_headers.append('%s: %s' % (key, headers[key]))
95 |     curl_headers += ['Accept-Charset: UTF-8']
96 |     response = BytesIO()
97 |     header_buf = BytesIO()
98 |     curl = pycurl.Curl()
99 |     try:
100 |         curl.setopt(curl.URL, page.url)
101 |     except UnicodeEncodeError:
102 |         logger.error("URL ISSUE: %s", page.url)
103 |         raise Error()
104 |     curl.setopt(curl.CONNECTTIMEOUT, timeout)  # connect phase; total TIMEOUT below
105 |     curl.setopt(curl.WRITEFUNCTION, response.write)
106 |     curl.setopt(curl.HEADERFUNCTION, header_buf.write)
107 |     curl.setopt(curl.HTTPHEADER, curl_headers)
108 |     curl.setopt(curl.FOLLOWLOCATION, True)
109 |     curl.setopt(curl.TIMEOUT, timeout * 2)
110 |     if proxy is not None:
111 |         logger.debug("setting proxy: %s", proxy)
112 |         curl.setopt(curl.PROXY, proxy['http'])
113 |     if page.post is not None:
114 |         logger.debug("setting post: %s", page.post)
115 |         curl.setopt(curl.POSTFIELDS, page.post)
116 |     try:
117 |         curl.perform()
118 |     except pycurl.error:
119 |         logger.exception('failed downloading')
120 |         raise Error()
121 |     text = response.getvalue().decode('UTF-8', errors='ignore')
122 |     status_code = curl.getinfo(curl.RESPONSE_CODE)
123 |     page.set_text(text, response.getvalue()).set_status_code(status_code)
124 |     try:
125 |         header_buf.seek(0)
126 |         lines = header_buf.getvalue().decode('UTF-8').split('\r\n')
127 |         redirected_to = page.url
128 |         for line in lines:
129 |             if 'Location' in line:
130 |                 redirected_to = line.split(': ')[-1]
131 |         curl.close()
132 |         page.set_redirected_to_url(redirected_to)
133 |     except Exception:
134 |         logger.exception('failed parsing headers')
135 |         page.set_redirected_to_url(page.url)
136 | 
137 | 
138 | def cleanup_url(url):
139 |     """cleans the given url of stray markup and spaces
140 | 
141 |     :url: url to normalize
142 |     :returns: lowercased url with spaces escaped and <br> remnants removed
143 | 
144 |     """
145 |     url = url.replace(' ', '%20').lower()
146 |     url = url.replace('<br%20>', '')
147 |     url = url.replace('<br%20/>', '')
148 |     return url
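`cleanup_url` escapes spaces before lowercasing, which is what lets literal `<br />` debris (already `%20`-escaped by the first replace) match the patterns above; for instance, assuming the function is imported from this module:

from libs.downloader import cleanup_url

assert cleanup_url('http://Example.com/My Page<br />') == \
    'http://example.com/my%20page'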
149 | 
150 | 
151 | # pylint: disable=too-few-public-methods
152 | class BaseCommon(object):
153 |     """holds the config and logger shared by the downloaders"""
154 | 
155 |     def __init__(self):
156 |         super(BaseCommon, self).__init__()
157 |         self.cfg = config.Config()
158 |         txt = '{}.dm'.format(self.cfg.g('logger.base'))
159 |         self.log = logging.getLogger(txt)
160 | 
161 | 
162 | class BaseDownloader(BaseCommon):
163 |     """plain downloader with proxy rotation and throttling"""
164 | 
165 |     def __init__(self):
166 |         super(BaseDownloader, self).__init__()
167 |         self.downloads = 0
168 |         self.timeout = self.cfg.g('timeout', 60)
169 |         self.from_cache = False
170 |         self.current_proxy = None
171 |         self.proxy_used = 0
172 |         self.bad_proxies = set()
173 |         self.headers = {'User-Agent': USER_AGENT}
174 |         self.use_proxy = self.cfg.g('proxies', 'no') == 'yes'
175 |         self.use_curl = self.cfg.g('use_curl', 'no') == 'yes'
176 |         self.load_bad_proxies()
177 |         self.which_downloader()
178 | 
179 |     def set_logger(self, logger):
180 |         """sets up an independent logger
181 | 
182 |         :logger: logger instance to use instead of the default
183 |         :returns: None
184 | 
185 |         """
186 |         self.log = logger
187 | 
188 |     def which_downloader(self):
189 |         """sets which downloader to be used"""
190 |         if self.use_curl:
191 |             self.download_with = curl_factory
192 |         else:
193 |             self.download_with = request_factory
194 | 
195 |     def proxy_enabled(self):
196 |         """check if proxy is enabled
197 |         :returns: True when 'proxies' is set to 'yes' in the config
198 | 
199 |         """
200 |         return self.use_proxy
201 | 
202 |     def load_bad_proxies(self):
203 |         """loads the list of known bad proxies
204 |         :returns: None
205 | 
206 |         """
207 |         if self.proxy_enabled():
208 |             self.current_proxy = self.get_random_proxy()
209 |             try:
210 |                 self.bad_proxies = set(utils.read_file('bad_proxies', True))
211 |             except OSError:
212 |                 pass
213 | 
214 |     def get_random_proxy(self):
215 |         """returns a proxy from proxies.txt
216 | 
217 |         :returns: a proxy not on the bad list (loops while none qualifies)
218 |         """
219 |         proxy_file = self.cfg.g('proxy_file', 'proxies.txt')
220 |         proxies = utils.read_file(proxy_file, True)
221 |         while True:
222 |             proxy = random.choice(proxies)
223 |             if proxy in self.bad_proxies:
224 |                 continue
225 |             self.proxy_used = 0
226 |             return proxy
227 | 
228 |     def _download(self, page, proxy=None):
229 |         """does the actual download, rotating proxies on retryable errors"""
230 |         error_count = 0
231 |         while True:
232 |             try:
233 |                 self.download_with(page, proxy, self.headers,
234 |                                    self.timeout, self.log)
235 |                 if self.proxy_enabled():
236 |                     self.proxy_used += 1
237 |                 return
238 |             except ConnectionError:
239 |                 return
240 |             except RetryableError:
241 |                 if self.proxy_enabled():
242 |                     self.bad_proxies.add(proxy['http'])
243 |                     utils.append_to_file('bad_proxies', proxy['http'] + '\n')
244 |                     self.current_proxy = self.get_random_proxy()
245 |                     proxy = {'http': self.current_proxy}
246 |                 error_count += 1
247 |                 if error_count > 3:
248 |                     raise Error()
249 | 
250 |     def take_a_nap_after(self, after, duration):
251 |         """force sleep for :duration: seconds every :after: downloads"""
252 |         if self.downloads % after == 0:
253 |             time.sleep(duration)
254 | 
255 |     def download(self, page):
256 |         """downloads given url"""
257 |         if self.proxy_enabled() and \
258 |                 self.proxy_used >= self.cfg.g('proxy.used', 100):
259 |             old_proxy = self.current_proxy
260 |             self.current_proxy = self.get_random_proxy()
261 |             self.log.info("proxy: %s -> %s", old_proxy, self.current_proxy)
262 |         if self.proxy_enabled():
263 |             proxy = {'http': self.current_proxy}
264 |         else:
265 |             proxy = None
266 |         url = cleanup_url(page.url)
267 |         if page.url != url:
268 |             page.set_url(url)
269 |         try:
270 |             start_time = time.time()
271 |             self._download(page, proxy)
272 |             end_time = time.time()
273 |             page.set_load_time(end_time - start_time)
274 |             self.take_a_nap_after(SLEEP_AFTER, SLEEP)
275 |             self.downloads = self.downloads + 1
276 |         except requests.ConnectionError:
277 |             self.log.debug("ConnectionError: %s", url)
278 |         except (UnboundLocalError, AttributeError, Error):
279 |             page.set_state(False)
280 | 
281 | 
282 | class CachedDownloader(BaseDownloader):
283 |     """downloads and saves webpages to an on-disk cache"""
284 | 
285 |     def download(self, page):
286 |         content = ''
287 |         fullpath = utils.get_cache_full_path(page.url, page.post)
288 |         if utils.is_valid_cache_file(fullpath):
289 |             content = utils.read_file(fullpath)
290 |             page.set_text(content).set_redirected_to_url(page.url).set_load_time(0)
291 |         else:
292 |             super(CachedDownloader, self).download(page)
293 |             if page.state:
294 |                 utils.save_to_file(fullpath, page.text, True)
295 | 
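Typical driving code, mirroring the pattern `utils.get_tlds` uses further down in this dump (the `page.state`/`page.text` attribute names are assumed from the setters used above):

from libs.pages import DownloadedPage
from libs.downloader import CachedDownloader
import libs.utils as utils

page = DownloadedPage().set_url('https://publicsuffix.org/list/effective_tld_names.dat')
dlm = CachedDownloader()   # or BaseDownloader() to bypass the on-disk cache
dlm.download(page)
if page.state:
    utils.save_to_file('tlds.dat', page.text)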
-------------------------------------------------------------------------------- /utils.py: --------------------------------------------------------------------------------
1 | from datetime import datetime, date, timedelta
2 | import codecs
3 | import logging
4 | import logging.handlers
5 | import os
6 | import re
7 | import json
8 | try:
9 |     from urllib.parse import urlparse
10 | except ImportError:
11 |     from urlparse import urlparse
12 | try:
13 |     import libs.config as config
14 | except ImportError:
15 |     import config
16 | try:
17 |     from hashlib import md5
18 | except ImportError:
19 |     from md5 import md5
20 | 
21 | 
22 | def read_file(filename, linewise=False):
23 |     """
24 |     reads a file, either as one string or as a list of lines
25 | 
26 |     """
27 |     try:
28 |         with open(filename) as fptr:
29 |             content = fptr.read().strip()
30 |             if linewise:
31 |                 content = content.split("\n")
32 |             return content
33 |     except Exception:
34 |         raise  # re-raise with the original traceback
35 | 
36 | 
37 | def uni(text):
38 |     """get some unicode love
39 | 
40 |     :text: text to convert
41 |     :returns: unicode on Python 2, the string unchanged on Python 3
42 | 
43 |     """
44 |     try:
45 |         return unicode(text)
46 |     except NameError:
47 |         return text
48 | 
49 | 
50 | def save_to_file(filename, content, use_codec=False):
51 |     if use_codec:
52 |         with codecs.open(filename, encoding='utf-8', mode='w') as fp:
53 |             try:
54 |                 return fp.write(unicode(content))  # Python 2
55 |             except NameError:
56 |                 pass
57 |             try:
58 |                 return fp.write(content.encode('utf-8'))
59 |             except TypeError:
60 |                 pass  # Python 3: the codecs writer wants str, not bytes
61 |             return fp.write(content)
62 |     else:
63 |         with open(filename, mode='w') as fp:
64 |             fp.write(content)
65 | 
66 | 
67 | def append_to_file(filename, content):
68 |     with open(filename, mode='a+') as fp:
69 |         fp.write(content)
70 | 
71 | 
72 | def get(arr, indx):
73 |     try:
74 |         return arr[indx]
75 |     except IndexError:
76 |         return None
77 | 
78 | 
79 | def remove_extra_whitespace(txt):
80 |     return re.sub(' +', ' ', txt)
81 | 
82 | 
83 | def cleanup_text(text):
84 |     t = text.strip()
85 |     t = re.sub('\t+', '', t)
86 |     t = re.sub('\n+', '\n', t)
87 |     t = re.sub(' +', ' ', t)
88 |     t = re.sub('\xa0', '', t)
89 |     t = re.sub('\u2022', '', t)
90 |     return t
91 | 
92 | 
93 | def remove_br(content):
94 |     """replaces <br> variants with newlines
95 | 
96 |     :content: html text to clean
97 |     :returns: text with <br> tags turned into newlines
98 | 
99 |     """
100 |     content = content.replace('<br>', '\n')
101 |     content = content.replace('</br>', '\n')
102 |     content = content.replace('<br />', '\n')
103 |     content = content.replace('<br%20/>', '\n')
104 |     return content
105 | 
106 | 
107 | def clean_url(lnk, baseurl):
108 |     """cleans up url"""
109 |     lnk = lnk.replace('.html', '')
110 |     lnk = lnk.replace('.htm', '')
111 |     lnk = lnk.replace(baseurl, '')
112 |     lnk = lnk.replace('%20', ' ')
113 |     return lnk.lower()
114 | 
115 | 
116 | def union(l1, l2):  # despite the name: strips elements common to both lists
117 |     a = l1.copy()
118 |     b = l2.copy()
119 |     for e in a[:]:
120 |         if e in b:
121 |             a.remove(e)
122 |             b.remove(e)
123 |     return a, b
124 | 
125 | 
126 | def joindict(d1, d2):
127 |     d = d1.copy()
128 |     d.update(d2)
129 |     return d
130 | 
131 | 
132 | def hash(url, data=None):
133 |     """ creates hash of the url and post data (if required and exists)"""
134 |     m = md5()
135 |     m.update(url.encode('utf-8'))
136 |     if data is not None:
137 |         m.update(data.encode('utf-8') if isinstance(data, str) else data)
138 |     return m.hexdigest()
139 | 
140 | 
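A standalone restatement of `hash` above (renamed `cache_key` here, since `hash` shadows the builtin), showing the property the page cache relies on: identical URL+POST pairs always map to the same digest:

from hashlib import md5

def cache_key(url, data=None):
    m = md5()
    m.update(url.encode('utf-8'))
    if data is not None:
        m.update(data.encode('utf-8') if isinstance(data, str) else data)
    return m.hexdigest()

assert cache_key('http://a.com', 'q=1') == cache_key('http://a.com', 'q=1')
assert cache_key('http://a.com') != cache_key('http://b.com')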
141 | def setup_logger(load_cfg=None):
142 |     """sets up logging"""
143 |     # {{{ load config
144 |     if load_cfg is not None:
145 |         cfg = config.Config(load_cfg)
146 |     else:
147 |         cfg = config.Config()
148 |     # }}}
149 |     # {{{ setting up everything
150 |     logger = logging.getLogger(cfg.g('logger.base'))
151 |     level = getattr(logging, cfg.g('logger.level'))
152 |     clevel = getattr(logging, cfg.g('logger.console.level'))
153 |     flevel = getattr(logging, cfg.g('logger.file.level'))
154 |     logger.setLevel(level)
155 |     logfilepath = cfg.g('logger.path')
156 |     maxsize = cfg.g('logger.backupsize', default=33554432)
157 |     filehandler = logging.handlers.RotatingFileHandler(logfilepath,
158 |                                                        mode='w',
159 |                                                        maxBytes=maxsize,
160 |                                                        backupCount=2)
161 |     template = cfg.get('logger', 'template')
162 |     formatter = logging.Formatter(template)
163 |     formatter.datefmt = cfg.g('logger.datefmt', None)
164 |     # }}}
165 |     # {{{ configure handlers
166 |     console_off = cfg.g('logger.console.off', 'no')
167 |     if console_off == 'no':
168 |         consolehandler = logging.StreamHandler()
169 |         consolehandler.setFormatter(formatter)
170 |         consolehandler.setLevel(clevel)
171 |         logger.addHandler(consolehandler)
172 |     filehandler.setFormatter(formatter)
173 |     filehandler.setLevel(flevel)
174 |     logger.addHandler(filehandler)
175 |     # }}}
176 |     return logger
177 | 
178 | 
179 | def dict_g(dct, key, default=False):
180 |     keys = key.split('.')
181 |     k = dct
182 |     for kwrd in keys:
183 |         if kwrd not in k:
184 |             return default
185 |         k = k[kwrd]
186 |     return k
187 | 
188 | 
189 | def dict_s(d, ky, val):
190 |     """set a value in a nested dict given a dotted path
191 | 
192 |     :d: dict to modify in place
193 |     :ky: dotted path, e.g. 'logger.console.level'
194 |     :val: value to store
195 |     :returns: None
196 | 
197 |     intermediate keys are created (or replaced) as dicts
198 |     """
199 |     keys = ky.split('.')
200 |     k = d
201 |     for kw in keys[:-1]:
202 |         if kw not in k or not isinstance(k[kw], dict):
203 |             k[kw] = {}
204 |         k = k[kw]
205 |     k[keys[-1]] = val
206 | 
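`dict_g` and `dict_s` are meant to be symmetric over dotted paths; for instance (assuming this module is importable as `libs.utils`):

from libs.utils import dict_g, dict_s

cfg = {}
dict_s(cfg, 'logger.console.level', 'INFO')
assert cfg == {'logger': {'console': {'level': 'INFO'}}}
assert dict_g(cfg, 'logger.console.level') == 'INFO'
assert dict_g(cfg, 'logger.file.level') is False  # the default for misses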
207 | 
208 | def flat_rows(listing):
209 |     rows = []
210 |     for item in listing:
211 |         rows.append(item[0])
212 |     return '\n'.join(rows)
213 | 
214 | 
215 | def search_line_in_file(filename, text):
216 |     """searches for a text linewise in a file
217 | 
218 |     :filename: file to scan
219 |     :text: substring to look for
220 |     :returns: True as soon as a line containing :text: is found,
221 |         False otherwise
222 | 
223 |     """
224 |     with open(filename) as f:
225 |         return any(text in line for line in f)
226 | 
227 | 
228 | def get_timestamp():
229 |     """get current unix timestamp
230 |     :returns: seconds since the epoch (naive local time)
231 | 
232 |     """
233 |     return (datetime.now() - datetime(1970, 1, 1)).total_seconds()
234 | 
235 | 
236 | def is_valid_cache_file(fullpath, notlessthan=100):
237 |     """check that the given file path is not an incomplete
238 |     html file (i.e. at least :notlessthan: bytes)"""
239 |     if not os.path.exists(fullpath):
240 |         return False
241 |     statinfo = os.stat(fullpath)
242 |     return statinfo.st_size > notlessthan
243 | 
244 | 
245 | def get_cache_full_path(url, post=None):
246 |     """generate full path for given URL with POST data
247 | 
248 |     :url: url the cache entry belongs to
249 |     :post: post data to pass
250 |     :returns: full path of the cache file
251 | 
252 |     """
253 |     filename = hash(url, post)
254 |     return file_cached_path(filename, url)
255 | 
256 | 
257 | def clean_failed_page_cache(url, post=None):
258 |     """
259 |     remove cached files that failed
260 |     """
261 |     fullpath = get_cache_full_path(url, post)
262 |     if os.path.exists(fullpath):
263 |         os.unlink(fullpath)
264 | 
265 | 
266 | def file_cached_path(filename, url=None):
267 |     """ expects hashed filename """
268 |     burl = ''
269 |     if url:
270 |         burl = url.replace('http://', '')
271 |         burl = burl.replace('https://', '')
272 |         burl = burl.replace('www.', '')
273 |         burl = burl.split('/')[0]
274 |     segsize = 3
275 |     cachepath = 'cache'
276 |     firstpart = filename[0:segsize]
277 |     secondpart = filename[segsize: 2 * segsize]
278 |     fullpath = "%s/%s/%s/%s" % (cachepath, burl, firstpart, secondpart)
279 |     if not os.path.exists(fullpath):
280 |         os.makedirs(fullpath)
281 |     return '%s/%s.html' % (fullpath, filename)
282 | 
283 | 
284 | class DateTimeEncoder(json.JSONEncoder):
285 |     """ encode datetime values to proper strings for json;
286 |     use json.dumps(obj, cls=DateTimeEncoder)
287 |     """
288 | 
289 |     def default(self, obj):
290 |         if isinstance(obj, datetime):
291 |             return obj.isoformat()
292 |         elif isinstance(obj, date):
293 |             return obj.isoformat()
294 |         elif isinstance(obj, timedelta):
295 |             return (datetime.min + obj).time().isoformat()
296 |         else:
297 |             return super(DateTimeEncoder, self).default(obj)
298 | 
299 | 
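Usage goes through `json.dumps`'s `cls` hook rather than calling `encode` directly (the `libs.utils` import path is assumed):

import json
from datetime import datetime
from libs.utils import DateTimeEncoder

payload = {'run_started': datetime(2015, 4, 1, 12, 30)}
print(json.dumps(payload, cls=DateTimeEncoder))
# -> {"run_started": "2015-04-01T12:30:00"}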
300 | def delete_folder_content(folder, delete_parent=False):
301 |     """delete a folder's content
302 | 
303 |     :folder: directory to empty
304 |     :delete_parent: also remove :folder: itself afterwards
305 | 
306 |     """
307 |     # os is already imported module-wide; shutil is only needed here
308 |     import shutil
309 |     for the_file in os.listdir(folder):
310 |         file_path = os.path.join(folder, the_file)
311 |         try:
312 |             if os.path.isfile(file_path):
313 |                 os.unlink(file_path)
314 |             elif os.path.isdir(file_path):
315 |                 shutil.rmtree(file_path)
316 |         except Exception as e:
317 |             print(e)
318 |     if delete_parent:
319 |         os.rmdir(folder)
320 | 
321 | 
322 | def get_net_loc(url):
323 |     """get net location without the leading subdomain
324 | 
325 |     :url: url to clean
326 |     :returns: e.g. 'sub.example.com' -> 'example.com'
327 | 
328 |     """
329 |     urlobj = urlparse(url.replace('www.', ''))
330 |     netloc = urlobj.netloc.split('.')
331 |     if len(netloc) > 2:
332 |         return '.'.join(netloc[1:])
333 |     else:
334 |         return urlobj.netloc
335 | 
336 | 
337 | def get_shorted_url(url, length=10):
338 |     """shortens a url (or parsed url) for log output"""
339 |     try:
340 |         segs = url.netloc.split('www.')[1][:length]
341 |         return (''.join(segs)).center(length + 4, '_')
342 |     except IndexError:
343 |         return (''.join(url.netloc[:length])).center(length + 4, '_')
344 |     except AttributeError:
345 |         urlobj = urlparse(url)
346 |         try:
347 |             segs = urlobj.netloc.split('www.')[1][:length]
348 |             return (''.join(segs)).center(length + 4, '_')
349 |         except IndexError:
350 |             return (''.join(urlobj.netloc[:length])).center(length + 4, '_')
351 | 
352 | 
353 | def get_domain(url, tlds):
354 |     """extracts the registrable domain using the public-suffix rules"""
355 |     try:
356 |         url_elements = urlparse(url)[1].split('.')
357 |     except TypeError:
358 |         raise ValueError("Failed to check url")
359 |     for i in range(-len(url_elements), 0):
360 |         last_i_elements = url_elements[i:]
361 |         # i=-3: ["abcde","co","uk"]
362 |         # i=-2: ["co","uk"]
363 |         # i=-1: ["uk"] etc
364 |         # abcde.co.uk, co.uk, uk
365 |         candidate = ".".join(last_i_elements)
366 |         # *.co.uk, *.uk, *
367 |         wildcard_candidate = ".".join(["*"] + last_i_elements[1:])
368 |         exception_candidate = "!" + candidate
369 |         # match tlds:
370 |         if (exception_candidate in tlds):
371 |             return ".".join(url_elements[i:])
372 |         if (candidate in tlds or wildcard_candidate in tlds):
373 |             return ".".join(url_elements[i - 1:])
374 |             # returns "abcde.co.uk"
375 |     raise ValueError("Domain not in global list of TLDs")
376 | 
377 | 
378 | def older_than(filepath, hours):
379 |     """check if file path is older than given time
380 | 
381 |     :filepath: file to check
382 |     :hours: how many hours
383 |     :returns: True/False
384 |     """
385 |     import time
386 |     mtime = os.path.getmtime(filepath)
387 |     ctime = time.time()
388 |     time_passed = (ctime - mtime) / 3600
389 |     if time_passed > hours:
390 |         return True
391 |     return False
392 | 
393 | 
394 | def get_tlds():
395 |     """load the public-suffix list, refreshing it every 12 hours"""
396 |     from libs.pages import DownloadedPage
397 |     from libs.downloader import BaseDownloader
398 |     fname = "effective_tld_names.dat.txt"
399 |     if not os.path.exists(fname) or older_than(fname, 12):
400 |         url = 'https://publicsuffix.org/list/effective_tld_names.dat'
401 |         dlm = BaseDownloader()
402 |         page = DownloadedPage().set_url(url)
403 |         dlm.download(page=page)
404 |         save_to_file(fname, page.text)
405 |     with open(fname) as tld_file:
406 |         tlds = [line.strip() for line in tld_file if line[0] not in "/\n"]
407 |     return tlds
--------------------------------------------------------------------------------