├── script
│   ├── __init__.py
│   ├── notification
│   │   ├── __init__.py
│   │   ├── mypushover.py
│   │   ├── ifttt.py
│   │   ├── join.py
│   │   └── gmail.py
│   ├── noBookException.py
│   ├── alreadyClaimedException.py
│   ├── scheduler.py
│   ├── logs.py
│   ├── upload.py
│   ├── notify.py
│   ├── database.py
│   ├── scpUpload.py
│   ├── utils.py
│   ├── onedrive.py
│   ├── googledrive.py
│   ├── spider.py
│   └── packtpub.py
├── .dockerignore
├── Procfile
├── dev
│   ├── public
│   │   ├── urlFromNewsletter.html
│   │   ├── loginGet.html
│   │   ├── loginPost.html
│   │   ├── loginPostNewsletter.html
│   │   └── myEbooks.html
│   ├── package.json
│   └── server.js
├── .gitignore
├── Dockerfile
├── requirements.txt
├── config
│   ├── dev.cfg
│   └── prod_example.cfg
├── LICENSE
└── README.md

--------------------------------------------------------------------------------
/script/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/script/notification/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | dev/
2 | ebooks/
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | clock: python ./script/scheduler.py
--------------------------------------------------------------------------------
/script/noBookException.py:
--------------------------------------------------------------------------------
1 | class NoBookException(Exception):
2 |     pass
--------------------------------------------------------------------------------
/dev/public/urlFromNewsletter.html:
--------------------------------------------------------------------------------
1 | http://localhost:8080/loginPostNewsletter.html
2 |
--------------------------------------------------------------------------------
/script/alreadyClaimedException.py:
--------------------------------------------------------------------------------
1 | class AlreadyClaimedException(Exception):
2 |     pass
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | .DS_Store
3 |
4 | .idea
5 | *.iml
6 |
7 | *.pyc
8 | env/
9 |
10 | config/prod.cfg
11 | config/client_secrets.json
12 | config/auth_token.json
13 | config/session.onedrive.pickle
14 | config/lastNewsletterUrl
15 |
16 | ebooks/
17 | node_modules/
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:2.7
2 |
3 | WORKDIR /packtpub-crawler
4 |
5 | COPY script script
6 | COPY config config
7 | COPY requirements.txt requirements.txt
8 |
9 | RUN pip install -r requirements.txt
10 |
11 | CMD ["python", "/packtpub-crawler/script/scheduler.py"]
--------------------------------------------------------------------------------
/dev/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "author": "niqdev",
3 |   "name": "packtpub-crawler",
4 |   "version": "1.0.0",
5 |   "repository": {
6 |     "type": "git",
7 |     "url": "https://github.com/niqdev/packtpub-crawler.git"
8 |   },
9 |   "dependencies": {
10 |     "express": "^4.12.4",
11 |     "morgan": "^1.6.0",
12 |     "serve-static": "^1.9.3"
13 |   }
14 | }
15 |
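The two one-line exception classes above drive the crawler's control flow: script/spider.py (shown later in this dump) uses them to tell "nothing to claim today" apart from real failures. A condensed sketch of that flow, abridged from spider.py's main():

```python
try:
    packtpub.runDaily()                      # raises NoBookException if no deal is up
    handleClaim(packtpub, args, config, dir_path)
except NoBookException as e:
    log_info('[*] ' + e.message)             # expected case: log and move on
except AlreadyClaimedException:
    log_info('[*] book was already claimed, skipping')   # newsletter flow only
except Exception as e:
    log_debug(e)                             # unexpected: optionally notify via --notify
```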
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | APScheduler==3.1.0 2 | beautifulsoup4==4.4.1 3 | clint==0.5.1 4 | google-api-python-client==1.3.2 5 | html5lib==0.9999999 6 | oauth2client==1.4.11 7 | python-magic==0.4.11 8 | requests==2.10.0 9 | termcolor==1.1.0 10 | urllib3==1.15.1 11 | python-firebase==1.2 12 | paramiko==2.0.2 13 | cryptography==1.6 14 | scp==0.10.2 15 | onedrivesdk==1.1.8 16 | python-pushover==0.3 17 | -------------------------------------------------------------------------------- /script/scheduler.py: -------------------------------------------------------------------------------- 1 | from apscheduler.schedulers.blocking import BlockingScheduler 2 | import os 3 | import shutil 4 | 5 | sched = BlockingScheduler() 6 | 7 | @sched.scheduled_job('cron', day_of_week='mon-sun', hour=9) 8 | def scheduled_job(): 9 | print('New job: packtpub-crawler') 10 | shutil.rmtree('./ebooks', ignore_errors=True) 11 | os.system('python script/spider.py --config config/prod.cfg --upload googledrive --store firebase --notify gmail') 12 | 13 | sched.start() 14 | -------------------------------------------------------------------------------- /dev/server.js: -------------------------------------------------------------------------------- 1 | /* 2 | // setup project 3 | npm init 4 | // build 5 | npm install 6 | 7 | // run dev 8 | node server.js 9 | */ 10 | 11 | var express = require('express'); 12 | var morgan = require('morgan'); 13 | var serveStatic = require('serve-static'); 14 | 15 | var PATH = __dirname + '/public'; 16 | var PORT = 8080; 17 | 18 | var app = express(); 19 | 20 | // logger 21 | app.use(morgan('dev')); 22 | 23 | app.use(serveStatic(PATH)).listen(PORT, function() { 24 | console.log("listening on port " + PORT); 25 | }); 26 | -------------------------------------------------------------------------------- /config/dev.cfg: -------------------------------------------------------------------------------- 1 | [url] 2 | url.base=http://localhost:8080 3 | url.loginGet=/loginGet.html 4 | url.loginPost=/loginPost.html 5 | url.account=/myEbooks.html 6 | # params: 0=id, 1=format 7 | url.download=/ebook_download/{0}/{1} 8 | url.bookFromNewsletter=http://localhost:8080/urlFromNewsletter.html 9 | 10 | #time in seconds 11 | [delay] 12 | delay.requests=2 13 | 14 | [credential] 15 | credential.email=test@mail.com 16 | credential.password=testPwd 17 | 18 | [path] 19 | path.ebooks=ebooks 20 | path.extras=ebooks/extras 21 | #path.group=true 22 | 23 | [googledrive] 24 | googledrive.oauth2_scope=https://www.googleapis.com/auth/drive 25 | googledrive.client_secrets=config/client_secrets.json 26 | googledrive.auth_token=config/auth_token.json 27 | googledrive.gmail=g00gler@gmail.com 28 | -------------------------------------------------------------------------------- /script/logs.py: -------------------------------------------------------------------------------- 1 | from termcolor import cprint 2 | import json 3 | import sys, os, traceback 4 | 5 | def log_error(message): 6 | cprint(message, 'red') 7 | 8 | def log_warn(message): 9 | cprint(message, 'yellow') 10 | 11 | def log_info(message): 12 | cprint(message, 'cyan') 13 | 14 | def log_success(message): 15 | cprint(message, 'green') 16 | 17 | def log_json(list_dict): 18 | print json.dumps(list_dict, indent=2) 19 | 20 | def log_dict(dict): 21 | for key, elem in dict.items(): 22 | print '\t[{0}] {1}'.format(key, elem) 
23 | 24 | def log_debug(e, stacktrace=True): 25 | exc_type, exc_value, exc_traceback = sys.exc_info() 26 | fname = os.path.split(exc_traceback.tb_frame.f_code.co_filename)[1] 27 | 28 | log_warn('[-] {0} {1} | {2}@{3}'.format(exc_type, e, fname, exc_traceback.tb_lineno)) 29 | 30 | if stacktrace: 31 | traceback.print_exc() 32 | -------------------------------------------------------------------------------- /script/notification/mypushover.py: -------------------------------------------------------------------------------- 1 | from logs import * 2 | import requests 3 | from pushover import Client 4 | 5 | class Pushover(object): 6 | """ 7 | """ 8 | 9 | def __init__(self, config, packpub_info, upload_info): 10 | self.__config = config 11 | self.__packpub_info = packpub_info 12 | self.__client = Client(self.__config.get('pushover', 'pushover.user_key'), api_token=self.__config.get('pushover', 'pushover.api_key')) 13 | 14 | 15 | def send(self): 16 | self.__client.send_message(self.__packpub_info['description'].encode('utf-8'), title="New book downloaded from Packt: " + self.__packpub_info['title'].encode('utf-8'), url="https://www.packtpub.com/packt/offers/free-learning", url_title="See more") 17 | log_success('[+] notification sent to pushover') 18 | 19 | def sendError(self, exception, source): 20 | self.__client.send_message(repr(exception), title='packtpub-crawler {source}: Could not download ebook'.format(source=source)) 21 | log_success('[+] error notification sent to pushover') 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 niqdev 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
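Every class in script/notification/ (Pushover above; Ifttt, Join and Gmail below) shares the same `(config, packpub_info, upload_info)` constructor plus `send()`/`sendError()` methods, which is what lets script/notify.py swap them freely. A minimal standalone sketch, assuming a filled-in config/prod.cfg and that it is run from inside script/ (the book values here are hypothetical):

```python
# -*- coding: utf-8 -*-
from utils import config_file
from notification.mypushover import Pushover

config = config_file('../config/prod.cfg')   # path relative to script/
book = {'title': u'Some eBook', 'description': u'...'}  # hypothetical values

notifier = Pushover(config, book, None)      # Pushover ignores upload_info
notifier.send()
```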
-------------------------------------------------------------------------------- /script/notification/ifttt.py: -------------------------------------------------------------------------------- 1 | from logs import * 2 | import requests 3 | 4 | class Ifttt(object): 5 | """ 6 | """ 7 | 8 | def __init__(self, config, packpub_info, upload_info): 9 | self.__packpub_info = packpub_info 10 | self.__url = "https://maker.ifttt.com/trigger/{eventName}/with/key/{apiKey}".format( 11 | eventName=config.get('ifttt', 'ifttt.event_name'), 12 | apiKey=config.get('ifttt', 'ifttt.key') 13 | ) 14 | 15 | def send(self): 16 | r = requests.post(self.__url, data = { 17 | 'value1':self.__packpub_info['title'].encode('utf-8'), 18 | 'value2':self.__packpub_info['description'].encode('utf-8'), 19 | 'value3':self.__packpub_info['url_image'] 20 | }) 21 | log_success('[+] notification sent to IFTTT') 22 | 23 | def sendError(self, exception, source): 24 | title = "packtpub-crawler [{source}]: Could not download ebook".format(source=source) 25 | r = requests.post(self.__url, data = {'value1':title, 'value2':repr(exception), 'value3':self.__packpub_info['landingPageUrl']}) 26 | 27 | log_success('[+] error notification sent to IFTTT') 28 | -------------------------------------------------------------------------------- /script/upload.py: -------------------------------------------------------------------------------- 1 | from googledrive import GoogleDrive 2 | from onedrive import OneDrive 3 | from scpUpload import ScpUpload 4 | from logs import * 5 | 6 | SERVICE_GOOGLE_DRIVE = 'googledrive' 7 | SERVICE_ONEDRIVE = 'onedrive' 8 | SERVICE_DROPBOX = 'DROPBOX' 9 | SERVICE_SCP = 'scp' 10 | 11 | class Upload(object): 12 | """ 13 | TODO interface or abstract class for upload services 14 | """ 15 | 16 | def __init__(self, config, service_type): 17 | self.__config = config 18 | self.info = { 19 | 'details': [] 20 | } 21 | if service_type == SERVICE_GOOGLE_DRIVE: 22 | self.service = GoogleDrive(config) 23 | elif service_type == SERVICE_ONEDRIVE: 24 | self.service = OneDrive(config) 25 | elif service_type == SERVICE_DROPBOX: 26 | raise NotImplementedError('not implemented yet!') 27 | elif service_type == SERVICE_SCP: 28 | self.service = ScpUpload(config) 29 | 30 | def run(self, paths): 31 | """ 32 | """ 33 | for path in paths: 34 | self.service.upload(path) 35 | self.info['details'].append(self.service.info) 36 | log_dict(self.service.info) 37 | -------------------------------------------------------------------------------- /script/notify.py: -------------------------------------------------------------------------------- 1 | from notification.gmail import Gmail 2 | from notification.ifttt import Ifttt 3 | from logs import * 4 | from notification.join import Join 5 | from notification.mypushover import Pushover 6 | 7 | SERVICE_GMAIL = 'gmail' 8 | SERVICE_IFTTT = 'ifttt' 9 | SERVICE_JOIN = 'join' 10 | SERVICE_PUSHOVER = 'pushover' 11 | 12 | class Notify(object): 13 | """ 14 | TODO interface or abstract class for notification services 15 | """ 16 | 17 | def __init__(self, config, packpub_info, upload_info, service_type): 18 | self.__config = config 19 | self.info = { 20 | 'details': [] 21 | } 22 | if service_type == SERVICE_GMAIL: 23 | self.service = Gmail(config, packpub_info, upload_info) 24 | elif service_type == SERVICE_IFTTT: 25 | self.service = Ifttt(config, packpub_info, upload_info) 26 | elif service_type == SERVICE_JOIN: 27 | self.service = Join(config, packpub_info, upload_info) 28 | elif service_type == SERVICE_PUSHOVER: 29 
|             self.service = Pushover(config, packpub_info, upload_info)
30 |
31 |     def run(self):
32 |         """
33 |         """
34 |         self.service.send()
35 |
36 |
37 |     def sendError(self, exception, source):
38 |         """
39 |         """
40 |         self.service.sendError(exception, source)
41 |
--------------------------------------------------------------------------------
/script/database.py:
--------------------------------------------------------------------------------
1 | from logs import *
2 | from firebase.firebase import FirebaseApplication, FirebaseAuthentication
3 | import datetime
4 |
5 | DB_FIREBASE = 'firebase'
6 |
7 | class Database(object):
8 |     """
9 |     Store to database
10 |     """
11 |
12 |     def __init__(self, config, database_type, packpub_info, upload_info):
13 |         self.__config = config
14 |         self.__database_type = database_type
15 |
16 |         data = packpub_info.copy()
17 |         data['datetime'] = datetime.datetime.utcnow().isoformat()
18 |         data.pop('paths', None)
19 |         data.update(upload_info)
20 |         self.__data = data
21 |
22 |     def store(self):
23 |         """
24 |         """
25 |         #log_json(self.__data)
26 |
27 |         if self.__database_type == DB_FIREBASE:
28 |             self.__store_firebase()
29 |
30 |     def __store_firebase(self):
31 |         """
32 |         """
33 |
34 |         authentication = FirebaseAuthentication(self.__config.get('firebase', 'firebase.database_secret'), None)
35 |         #user = authentication.get_user()
36 |         #print authentication.extra
37 |         #print user.firebase_auth_token
38 |
39 |         firebase = FirebaseApplication(self.__config.get('firebase', 'firebase.url'), authentication)
40 |         result = firebase.post(self.__config.get('firebase', 'firebase.path'), self.__data)
41 |
42 |         log_success('[+] Stored on firebase: {0}'.format(result['name']))
43 |
--------------------------------------------------------------------------------
/dev/public/loginGet.html:
--------------------------------------------------------------------------------
[HTML markup lost in extraction — this fixture mimics the Packt login page: a form with email and password fields, a "Forgotten your password?" link and a login button, served locally for --dev runs.]
--------------------------------------------------------------------------------
/script/notification/join.py:
--------------------------------------------------------------------------------
1 | from logs import *
2 | import requests
3 |
4 | class Join(object):
5 |     """
6 |     """
7 |
8 |     def __init__(self, config, packpub_info, upload_info):
9 |         self.__config = config
10 |         self.__packpub_info = packpub_info
11 |
12 |     def send(self):
13 |         url = "https://joinjoaomgcd.appspot.com/_ah/api/messaging/v1/sendPush?apikey={apiKey}&deviceId={deviceIds}&title={title}&text={description}".format(
14 |             apiKey=self.__config.get('join', 'join.api_key'),
15 |             deviceIds=self.__config.get('join', 'join.device_ids'),
16 |             title="New book downloaded from Packt: " + self.__packpub_info['title'].encode('utf-8'),
17 |             description=self.__packpub_info['description'].encode('utf-8')
18 |         )
19 |
20 |         r = requests.post(url)
21 |
22 |         log_success('[+] notification sent to Join')
23 |
24 |     def sendError(self, exception, source):
25 |         url = "https://joinjoaomgcd.appspot.com/_ah/api/messaging/v1/sendPush?apikey={apiKey}&deviceId={deviceIds}&title={title}&text={description}&url={url}".format(
26 |             apiKey=self.__config.get('join', 'join.api_key'),
27 |             deviceIds=self.__config.get('join', 'join.device_ids'),
28 |             title='packtpub-crawler {source}: Could not download ebook: {title}'.format(source=source, title=self.__packpub_info['title']),
29 |             description=repr(exception),
30 |             url=self.__packpub_info['landingPageUrl']
31 |         )
32 |
33 |         r = requests.post(url)
34 |
35 |         log_success('[+] error notification sent to Join')
36 |
--------------------------------------------------------------------------------
/script/scpUpload.py:
--------------------------------------------------------------------------------
1 | from os.path import exists
2 | import magic
3 | from utils import thread_loader
4 | from logs import *
5 | import paramiko
6 | from scp import SCPClient
7 |
8 | class ScpUpload(object):
9 |     """
10 |     """
11 |
12 |     def __init__(self, config):
13 |         self.__config = config
14 |         self.info = {}
15 |
16 |     def __guess_info(self, file_path):
17 |         if not exists(file_path):
18 |             raise IOError('file not found!')
19 |
20 |         self.info = {
21 |             'path': file_path,
22 |             'name': file_path.split('/')[-1],
23 |             'mime_type': magic.from_file(file_path, mime=True),
24 |         }
25 |         log_info('[+] new file upload via scp:')
26 |         # log_dict(self.file_info)
27 |
28 |     def __insert_file(self):
29 |         print '[+] uploading file...'
30 | ssh = paramiko.SSHClient() 31 | ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 32 | 33 | # get config settings 34 | host = self.__config.get('scp', 'scp.host') 35 | user = self.__config.get('scp', 'scp.user') 36 | password = self.__config.get('scp', 'scp.password') 37 | timeout = self.__config.get('scp', 'scp.timeout') 38 | self.info['upload_path'] = self.__config.get('scp', 'scp.path') 39 | 40 | ssh.connect(host, username=user, password=password) 41 | scpclient = SCPClient(ssh.get_transport(), socket_timeout=float(timeout)) 42 | scpclient.put(self.info['path'], self.info['upload_path'] + self.info['name']) 43 | 44 | def upload(self, file_path): 45 | self.__guess_info(file_path) 46 | thread_loader(self.__insert_file) 47 | -------------------------------------------------------------------------------- /config/prod_example.cfg: -------------------------------------------------------------------------------- 1 | [url] 2 | url.base=https://www.packtpub.com 3 | url.login=/packt/offers/free-learning 4 | # params: 0=id, 1=format 5 | url.download=/ebook_download/{0}/{1} 6 | url.bookFromNewsletter=https://goo.gl/kUciut 7 | 8 | #time in seconds 9 | [delay] 10 | delay.requests=2 11 | 12 | [credential] 13 | credential.email=PACKTPUB_EMAIL 14 | credential.password=PACKTPUB_PASSWORD 15 | 16 | [path] 17 | path.ebooks=ebooks 18 | path.extras=ebooks/extras 19 | #path.group=true 20 | 21 | [googledrive] 22 | googledrive.oauth2_scope=https://www.googleapis.com/auth/drive 23 | googledrive.client_secrets=config/client_secrets.json 24 | googledrive.auth_token=config/auth_token.json 25 | googledrive.gmail=GOOGLE_DRIVE@gmail.com 26 | googledrive.default_folder=packtpub 27 | #googledrive.upload_folder=FOLDER_ID 28 | 29 | [onedrive] 30 | onedrive.api_base_url=https://api.onedrive.com/v1.0/ 31 | onedrive.client_id=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx 32 | onedrive.client_secret=XxXxXxXxXxXxXxXxXxXxXxX 33 | onedrive.session_file=config/session.onedrive.pickle 34 | onedrive.folder=packtpub 35 | 36 | [scp] 37 | scp.host=SCP_HOST 38 | scp.user=SCP_USER 39 | scp.password=SCP_PASSWORD 40 | scp.timeout=15 41 | scp.path=SCP_UPLOAD_PATH 42 | 43 | [gmail] 44 | gmail.host=smtp.gmail.com 45 | gmail.port=587 46 | gmail.username=EMAIL_USERNAME@gmail.com 47 | gmail.password=EMAIL_PASSWORD 48 | gmail.from=FROM_EMAIL@gmail.com 49 | gmail.to=TO_EMAIL_1@gmail.com,TO_EMAIL_2@gmail.com 50 | 51 | [ifttt] 52 | ifttt.event_name=packtpub-crawler 53 | ifttt.key=IFTTT_MAKER_KEY 54 | 55 | [join] 56 | join.device_ids=DEVICE_IDS_COMMA_SEPARATED_OR_GROUP_NAME 57 | join.api_key=API_KEY 58 | 59 | [firebase] 60 | firebase.database_secret=FIREBASE_DATABASE_SECRET 61 | firebase.url=FIREBASE_URL 62 | firebase.path=/books 63 | 64 | [pushover] 65 | pushover.user_key=PUSHOVER_USER_KEY 66 | pushover.api_key=PUSHOVER_API_KEY 67 | -------------------------------------------------------------------------------- /script/utils.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import ConfigParser 3 | from bs4 import BeautifulSoup 4 | from time import sleep 5 | from clint.textui import progress 6 | import os, sys, itertools 7 | from threading import Thread 8 | from logs import * 9 | 10 | def ip_address(): 11 | """ 12 | Gets current IP address 13 | """ 14 | 15 | response = requests.get('http://www.ip-addr.es') 16 | print '[-] GET {0} | {1}'.format(response.status_code, response.url) 17 | log_info('[+] ip address is: {0}'.format(response.text.strip())) 18 | 19 | def config_file(path): 20 | 
""" 21 | Reads configuration file 22 | """ 23 | if not os.path.exists(path): 24 | raise IOError('file not found!') 25 | 26 | log_info('[*] configuration file: {0}'.format(path)) 27 | config = ConfigParser.ConfigParser() 28 | config.read(path) 29 | return config 30 | 31 | def make_soup(response, debug=False): 32 | """ 33 | Makes soup from response 34 | """ 35 | 36 | print '[*] fetching url... {0} | {1}'.format(response.status_code, response.url) 37 | soup = BeautifulSoup(response.text, 'html5lib') 38 | if debug: 39 | print soup.prettify().encode('utf-8') 40 | return soup 41 | 42 | def wait(delay, isDev): 43 | if delay > 0: 44 | if isDev: 45 | print '[-] going to sleep {0} seconds'.format(delay) 46 | sleep(delay) 47 | 48 | def download_file(r, url, directory, filename, headers): 49 | """ 50 | Downloads file with progress bar 51 | """ 52 | if not os.path.exists(directory): 53 | # creates directories recursively 54 | os.makedirs(directory) 55 | log_info('[+] created new directory: ' + directory) 56 | 57 | filename = filename.replace(':', '-') 58 | path = os.path.join(directory, filename) 59 | 60 | print '[-] downloading file from url: {0}'.format(url) 61 | response = r.get(url, headers=headers, stream=True) 62 | #log_dict(response.headers) 63 | total_length = 0 64 | test_length = response.headers.get('content-length') 65 | if test_length is not None: 66 | total_length = int(test_length) 67 | 68 | with open(path, 'wb') as f: 69 | for chunk in progress.bar(response.iter_content(chunk_size=1024), expected_size=(total_length/1024) + 1): 70 | if chunk: 71 | f.write(chunk) 72 | f.flush() 73 | log_success('[+] new download: {0}'.format(path)) 74 | return path 75 | 76 | def thread_loader(function): 77 | """ 78 | Starts a thread with loading bar 79 | """ 80 | 81 | thread = Thread(target=function) 82 | thread.start() 83 | spinner = itertools.cycle(['-', '/', '|', '\\']) 84 | while thread.is_alive(): 85 | sys.stdout.write(spinner.next()) 86 | sys.stdout.flush() 87 | # erase the last written char 88 | sys.stdout.write('\b') 89 | -------------------------------------------------------------------------------- /dev/public/loginPost.html: -------------------------------------------------------------------------------- 1 | 19 | 20 |
--------------------------------------------------------------------------------
/dev/public/loginPostNewsletter.html:
--------------------------------------------------------------------------------
[HTML markup lost in extraction — this fixture mimics a Packt newsletter landing page: a login form (including a "Forgotten your password?" link) followed by a #main-book promo, "Optimize your JavaScript applications with this free 208 page advanced eBook", its description ("Studying JavaScript performance in depth will make you capable of tackling the complex and important tasks required to solve performance issues..."), four bullet points (a Node.js/GulpJS build system, performance-focused coding practices, DOM/CSS3 animation optimization, unit testing with Jasmine) and a "Get Your Free Ebook" claim button.]
--------------------------------------------------------------------------------
/script/notification/gmail.py:
--------------------------------------------------------------------------------
1 | import smtplib
2 | from email.mime.multipart import MIMEMultipart
3 | from email.mime.text import MIMEText
4 | from logs import *
5 |
6 | class Gmail(object):
7 |     """
8 |     """
9 |
10 |     def __init__(self, config, packpub_info, upload_info):
11 |         self.__config = config
12 |         self.__packpub_info = packpub_info
13 |         self.__upload_info = upload_info
14 |
15 |     def __prepare_message(self):
16 |         """
17 |         """
18 |         #log_json(self.__packpub_info)
19 |         #log_json(self.__upload_info)
20 |
21 |         msg = MIMEMultipart('alternative')
22 |         msg['Subject'] = "[packtpub-crawler]"
23 |         msg['From'] = self.__config.get('gmail', 'gmail.from')
24 |         msg['To'] = self.__config.get('gmail', 'gmail.to')
25 |
26 |         text = "Enjoy your daily FREE eBook!"
27 |         html = """\
28 |         <html>
29 |           <head></head>
30 |           <body>
31 |             <h3>{title}</h3>
32 |             <p>{description}</p>
33 |         """.format(title=self.__packpub_info['title'].encode('utf-8'),
34 |                    description=self.__packpub_info['description'].encode('utf-8'))
35 |
36 |         if self.__upload_info is not None:
37 |             html += "<p>"
38 |             for detail in self.__upload_info['details']:
39 |                 html += "<a href='{url}'>{name}</a><br/>".format(url=detail.get('download_url'), name=detail.get('name'))
40 |             html += "</p>"
41 |
42 |
43 |         html += """\
44 |             <img src="{image}" alt="cover"/>
45 |             <p><a href="https://github.com/niqdev/packtpub-crawler">Powered by packtpub-crawler</a></p>
46 |           </body>
47 |         </html>
48 |         """.format(image=self.__packpub_info['url_image'])
49 |
50 |         part1 = MIMEText(text, 'plain')
51 |         part2 = MIMEText(html, 'html')
52 |
53 |         msg.attach(part1)
54 |         msg.attach(part2)
55 |
56 |         return msg
57 |
58 |     def __prepare_error_message(self, exception, source):
59 |         """
60 |         """
61 |         #log_json(self.__packpub_info)
62 |         #log_json(self.__upload_info)
63 |
64 |         msg = MIMEMultipart('alternative')
65 |         msg['Subject'] = "[packtpub-crawler]"
66 |         msg['From'] = self.__config.get('gmail', 'gmail.from')
67 |         msg['To'] = self.__config.get('gmail', 'gmail.to')
68 |
69 |         text = "Error downloading today's ebook [{source}]".format(source=source)
70 |         html = """\
71 |         <html>
72 |           <head></head>
73 |           <body>
74 |             <h3>{title}</h3>
75 |             <p>Download manually: <a href="{url}">{url}</a></p>
76 |             <p>{description}</p>
77 |         """.format(title=text,
78 |                    description=repr(exception),
79 |                    url=self.__packpub_info['landingPageUrl'])
80 |
81 |         html += """\
82 |             <p><a href="https://github.com/niqdev/packtpub-crawler">Powered by packtpub-crawler</a></p>
83 |           </body>
84 |         </html>
85 |         """
86 |
87 |         part1 = MIMEText(text, 'plain')
88 |         part2 = MIMEText(html, 'html')
89 |
90 |         msg.attach(part1)
91 |         msg.attach(part2)
92 |
93 |         return msg
94 |
95 |     def send(self):
96 |         server = smtplib.SMTP(self.__config.get('gmail', 'gmail.host'), self.__config.get('gmail', 'gmail.port'))
97 |         server.starttls()
98 |         server.login(self.__config.get('gmail', 'gmail.username'), self.__config.get('gmail', 'gmail.password'))
99 |
100 |         message = self.__prepare_message()
101 |         receivers = message['To'].split(",")
102 |         server.sendmail(message['From'], receivers, message.as_string())
103 |         server.quit()
104 |
105 |         log_success('[+] notified to: {0}'.format(receivers))
106 |
107 |     def sendError(self, exception, source):
108 |         server = smtplib.SMTP(self.__config.get('gmail', 'gmail.host'), self.__config.get('gmail', 'gmail.port'))
109 |         server.starttls()
110 |         server.login(self.__config.get('gmail', 'gmail.username'), self.__config.get('gmail', 'gmail.password'))
111 |
112 |         message = self.__prepare_error_message(exception, source)
113 |         receivers = message['To'].split(",")
114 |         server.sendmail(message['From'], receivers, message.as_string())
115 |         server.quit()
116 |
117 |         log_success('[+] error notification sent to: {0}'.format(receivers))
118 |
--------------------------------------------------------------------------------
/script/onedrive.py:
--------------------------------------------------------------------------------
1 | from os.path import exists
2 |
3 | import magic
4 | import onedrivesdk
5 | from onedrivesdk.helpers import GetAuthCodeServer
6 |
7 | from logs import *
8 | from utils import thread_loader
9 |
10 |
11 | class OneDrive(object):
12 |     """
13 |     """
14 |
15 |     def __init__(self, config):
16 |         self.__config = config
17 |         self.__onedrive_service = None
18 |         self.__scopes = ['offline_access', 'onedrive.readwrite']
19 |         self.info = {}
20 |
21 |     def __guess_info(self, file_path):
22 |         if not exists(file_path):
23 |             raise IOError('file not found!')
24 |
25 |         self.info = {
26 |             'path': file_path,
27 |             'name': file_path.split('/')[-1],
28 |             'mime_type': magic.from_file(file_path, mime=True),
29 |         }
30 |         log_info('[+] new file upload on OneDrive:')
31 |         log_info(self.info['name'])
32 |
33 |     def __init_service(self):
34 |         api_base_url = self.__config.get('onedrive', 'onedrive.api_base_url')
35 |         client_id = self.__config.get('onedrive', 'onedrive.client_id')
36 |         session_file = self.__config.get('onedrive', 'onedrive.session_file')
37 |
38 |         if not exists(session_file):
39 |             self.__save_credentials(session_file)
40 |
41 |         http_provider = onedrivesdk.HttpProvider()
42 |         auth_provider = onedrivesdk.AuthProvider(http_provider,
43 |                                                  client_id,
44 |                                                  self.__scopes)
45 |
46 |         # Load the session
47 |         auth_provider.load_session(path=session_file)
48 |         auth_provider.refresh_token()
49 |         self.__onedrive_service = onedrivesdk.OneDriveClient(api_base_url, auth_provider, http_provider)
50 |
51 |     def __save_credentials(self, session_file):
52 |         # api_base_url = self.__config.get('onedrive', 'onedrive.api_base_url')
53 |         redirect_uri = 'http://localhost:8080/'
54 |         client_id = self.__config.get('onedrive', 'onedrive.client_id')
55 |         client_secret = self.__config.get('onedrive', 'onedrive.client_secret')
56 |
57 |         client = onedrivesdk.get_default_client(client_id=client_id, scopes=self.__scopes)
58 |
59 |         auth_url = client.auth_provider.get_auth_url(redirect_uri)
60 |
61 |         # this will block until we have the code
62 |         code = GetAuthCodeServer.get_auth_code(auth_url, redirect_uri)
63 |
64 |         client.auth_provider.authenticate(code, redirect_uri, client_secret)
65 |
66 |         # Save the session for later
67 |         client.auth_provider.save_session(path=session_file)
68 |         log_info('[+] new credentials saved')
69 |
70 |     def __create_folder(self, item_id, folder_name):  # create a folder with the provided name
71 |         f = onedrivesdk.Folder()
72 |         i = onedrivesdk.Item()
73 |         i.name = folder_name
74 |         i.folder = f
75 |
76 |         folder = self.__onedrive_service.item(drive='me', id=item_id).children.add(i)
77 |
78 |         log_success('[+] creating new directory...')
79 |
80 |         return folder.id  # return the folder object ID
81 |
82 |     def __get_folder(self):  # get the folder name from settings
83 |         try:  # check folder name
84 |             folder_name = self.__config.get('onedrive', 'onedrive.folder')
85 |         except Exception:
86 |             folder_name = 'packtpub'
87 |
88 |         item_id = 'root'
89 |         directories = folder_name.split('/')
90 |         for d in directories:
91 |             if d == '.':
92 |                 continue
93 |             try:  # get the folder if it exists
94 |                 parent = self.__onedrive_service.item(drive='me', id=item_id)
95 |                 item = parent.children[d].get()
96 |                 item_id = item.id
97 |             except Exception:
98 |                 item_id = self.__create_folder(item_id, d)
99 |
100 |         return item_id
101 |
102 |     def __insert_file(self):
103 |         print '[+] uploading file...'
104 |         file = None
105 |         tries = 5
106 |         while tries > 0:
107 |             try:
108 |                 tries -= 1
109 |                 item = self.__onedrive_service.item(drive='me', id=self.__get_folder())
110 |                 file = item.children[self.info['name']].upload(self.info['path'])
111 |                 break
112 |             except Exception:
113 |                 print '[x] upload failed'
114 |                 if tries > 0:
115 |                     print '[x] retrying ...'
116 |
117 |         if file is None:  # every attempt failed: fail loudly instead of a NameError below
118 |             raise IOError('upload failed after 5 attempts')
119 |
120 |         self.info['id'] = file.id
121 |         self.info['download_url'] = file.web_url
122 |
123 |     def upload(self, file_path):
124 |         self.__guess_info(file_path)
125 |         self.__init_service()
126 |         thread_loader(self.__insert_file)
--------------------------------------------------------------------------------
/script/googledrive.py:
--------------------------------------------------------------------------------
1 | from os.path import exists
2 | import webbrowser
3 | from oauth2client.client import flow_from_clientsecrets, OOB_CALLBACK_URN
4 | from oauth2client.file import Storage
5 | import httplib2
6 | import magic
7 | from googleapiclient.discovery import build
8 | from googleapiclient.http import MediaFileUpload
9 | from utils import thread_loader
10 | from logs import *
11 |
12 | class GoogleDrive(object):
13 |     """
14 |     """
15 |
16 |     def __init__(self, config):
17 |         self.__config = config
18 |         self.__googledrive_service = None
19 |         self.info = {}
20 |
21 |     def __guess_info(self, file_path):
22 |         if not exists(file_path):
23 |             raise IOError('file not found!')
24 |
25 |         self.info = {
26 |             'path': file_path,
27 |             'name': file_path.split('/')[-1],
28 |             'mime_type': magic.from_file(file_path, mime=True),
29 |         }
30 |         log_info('[+] new file upload on Google Drive:')
31 |         # log_dict(self.file_info)
32 |
33 |     def __init_service(self):
34 |         auth_token = self.__config.get('googledrive', 'googledrive.auth_token')
35 |
36 |         if not exists(auth_token):
37 |             self.__save_credentials(auth_token)
38 |
39 |         storage = Storage(auth_token)
40 |         credentials = storage.get()
41 |
42 |         http = httplib2.Http()
43 |         http = credentials.authorize(http)
44 |         self.__googledrive_service = build('drive', 'v2', http=http)
45 |
46 |     def __save_credentials(self, auth_token):
47 |         flow = flow_from_clientsecrets(
48 |             self.__config.get('googledrive', 'googledrive.client_secrets'),
49 |             self.__config.get('googledrive', 'googledrive.oauth2_scope'),
50
| OOB_CALLBACK_URN) 51 | 52 | authorize_url = flow.step1_get_authorize_url() 53 | 54 | print '[-] open browser...' 55 | webbrowser.open(authorize_url) 56 | 57 | code = raw_input('[*] Please, enter verification code: ').strip() 58 | credentials = flow.step2_exchange(code) 59 | 60 | storage = Storage(auth_token) 61 | storage.put(credentials) 62 | log_info('[+] new credentials saved') 63 | 64 | def __create_folder(self): #Create folder with provided name 65 | try: #Check default folder name 66 | default_folder_name = self.__config.get('googledrive', 'googledrive.default_folder') 67 | except: 68 | default_folder_name = 'packtpub' 69 | 70 | metadata = { 71 | 'title': default_folder_name, 72 | 'mimeType' : 'application/vnd.google-apps.folder' 73 | } 74 | folder = self.__googledrive_service.files().insert(body = metadata).execute() 75 | self.__config.set('googledrive', 'googledrive.upload_folder', folder['id']) 76 | log_success('[+] creating new directory...') 77 | print '[+] updating folder permissions...' 78 | permissions = { 79 | 'role': 'reader', 80 | 'type': 'anyone', 81 | 'value': self.__config.get('googledrive', 'googledrive.gmail') 82 | } 83 | self.__googledrive_service.permissions().insert(fileId=folder['id'], body=permissions).execute() 84 | log_dict({'folder_name': default_folder_name, 85 | 'id': folder['id'],}) 86 | #'share_link': folder['webContentLink']}) #TODO Fix 87 | log_success('[+] Please add this line after [googledrive] in your configuration file:') 88 | log_info('googledrive.upload_folder=' + folder.get('id')) 89 | 90 | return folder.get('id') #Return folder object ID 91 | 92 | def __get_folder(self): #Get folder name settings 93 | try: 94 | return self.__config.get('googledrive', 'googledrive.upload_folder') 95 | except: 96 | return self.__create_folder() #new folder ID 97 | 98 | def __insert_file(self): 99 | print '[+] uploading file...' 100 | media_body = MediaFileUpload( 101 | self.info['path'], mimetype=self.info['mime_type'], resumable=True) 102 | body = { 103 | 'title': self.info['name'], 104 | 'description': 'uploaded with packtpub-crawler', 105 | 'mimeType': self.info['mime_type'], 106 | 'parents': [{'id': self.__get_folder()}] 107 | } 108 | file = self.__googledrive_service.files().insert(body=body, media_body=media_body).execute() 109 | # log_dict(file) 110 | 111 | print '[+] updating file permissions...' 
112 | permissions = { 113 | 'role': 'reader', 114 | 'type': 'anyone', 115 | 'value': self.__config.get('googledrive', 'googledrive.gmail') 116 | } 117 | self.__googledrive_service.permissions().insert(fileId=file['id'], body=permissions).execute() 118 | 119 | # self.__googledrive_service.files().get(fileId=file['id']).execute() 120 | 121 | self.info['id'] = file['id'] 122 | self.info['download_url'] = file['webContentLink'] 123 | 124 | def upload(self, file_path): 125 | self.__guess_info(file_path) 126 | self.__init_service() 127 | thread_loader(self.__insert_file) 128 | -------------------------------------------------------------------------------- /script/spider.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | import argparse 4 | import datetime 5 | import requests 6 | import os 7 | from utils import ip_address, config_file 8 | from packtpub import Packtpub 9 | from upload import Upload, SERVICE_GOOGLE_DRIVE, SERVICE_ONEDRIVE, SERVICE_DROPBOX, SERVICE_SCP 10 | from database import Database, DB_FIREBASE 11 | from logs import * 12 | from notify import Notify, SERVICE_GMAIL, SERVICE_IFTTT, SERVICE_JOIN, SERVICE_PUSHOVER 13 | from noBookException import NoBookException 14 | from alreadyClaimedException import AlreadyClaimedException 15 | 16 | def parse_types(args): 17 | if args.types is None: 18 | return [args.type] 19 | else: 20 | return args.types 21 | 22 | def handleClaim(packtpub, args, config, dir_path): 23 | if args.dev: 24 | log_json(packtpub.info) 25 | 26 | log_success('[+] book successfully claimed') 27 | 28 | upload = None 29 | upload_info = None 30 | 31 | if not args.claimOnly: 32 | types = parse_types(args) 33 | 34 | packtpub.download_ebooks(types, dir_path) 35 | 36 | if args.extras: 37 | packtpub.download_extras(dir_path) 38 | 39 | if args.archive: 40 | raise NotImplementedError('not implemented yet!') 41 | 42 | if args.upload is not None: 43 | upload = Upload(config, args.upload) 44 | upload.run(packtpub.info['paths']) 45 | 46 | if args.store is not None: 47 | if args.upload == SERVICE_GOOGLE_DRIVE or args.upload == SERVICE_ONEDRIVE: 48 | Database(config, args.store, packtpub.info, upload.info).store() 49 | else: 50 | log_warn('[-] skip store info: missing upload info') 51 | 52 | 53 | if args.notify: 54 | if upload is not None: 55 | upload_info = upload.info 56 | 57 | Notify(config, packtpub.info, upload_info, args.notify).run() 58 | 59 | def main(): 60 | parser = argparse.ArgumentParser( 61 | description='Download FREE eBook every day from www.packtpub.com', 62 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 63 | version='2.4.0') 64 | 65 | parser.add_argument('-c', '--config', required=True, help='configuration file') 66 | parser.add_argument('-d', '--dev', action='store_true', help='only for development') 67 | parser.add_argument('-e', '--extras', action='store_true', help='download source code (if exists) and book cover') 68 | parser.add_argument('-u', '--upload', choices=[SERVICE_GOOGLE_DRIVE, SERVICE_ONEDRIVE, SERVICE_DROPBOX, SERVICE_SCP], help='upload to cloud') 69 | parser.add_argument('-a', '--archive', action='store_true', help='compress all file') 70 | parser.add_argument('-n', '--notify', choices=[SERVICE_GMAIL, SERVICE_IFTTT, SERVICE_JOIN, SERVICE_PUSHOVER], help='notify after claim/download') 71 | parser.add_argument('-s', '--store', choices=[DB_FIREBASE], help='store info') 72 | parser.add_argument('-o', '--claimOnly', action='store_true', help='only claim books (no downloads/uploads)') 
73 | 74 | group = parser.add_mutually_exclusive_group() 75 | group.add_argument('-t', '--type', choices=['pdf', 'epub', 'mobi'], 76 | default='pdf', help='specify eBook type') 77 | group.add_argument('--all', dest='types', action='store_const', 78 | const=['pdf', 'epub', 'mobi'], help='all eBook types') 79 | 80 | args = parser.parse_args() 81 | 82 | now = datetime.datetime.now() 83 | log_info('[*] {date} - fetching today\'s eBooks'.format(date=now.strftime("%Y-%m-%d %H:%M"))) 84 | 85 | packtpub = None 86 | 87 | try: 88 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + os.path.sep 89 | 90 | config = config_file(dir_path + args.config) 91 | packtpub = Packtpub(config, args.dev) 92 | 93 | #ip_address() 94 | log_info('[*] getting daily free eBook') 95 | 96 | try: 97 | packtpub.runDaily() 98 | handleClaim(packtpub, args, config, dir_path) 99 | except NoBookException as e: 100 | log_info('[*] ' + e.message) 101 | except Exception as e: 102 | log_debug(e) 103 | if args.notify: 104 | Notify(config, packtpub.info, None, args.notify).sendError(e, 'daily') 105 | 106 | lastNewsletterUrlPath = dir_path + 'config/lastNewsletterUrl' 107 | lastNewsletterUrl = None 108 | 109 | if os.path.isfile(lastNewsletterUrlPath): 110 | with open(lastNewsletterUrlPath, 'r+') as f: 111 | lastNewsletterUrl = f.read().strip() 112 | 113 | # the default URL is generated by an Google apps script, see README for details and self-hosting 114 | currentNewsletterUrl = requests.get(config.get('url', 'url.bookFromNewsletter')).text.strip() 115 | 116 | if currentNewsletterUrl == '': 117 | log_info('[*] no free eBook from newsletter right now') 118 | elif not currentNewsletterUrl.startswith('https://www.packtpub.com'): 119 | log_warn('[-] invalid URL from newsletter: ' + currentNewsletterUrl) 120 | elif lastNewsletterUrl != currentNewsletterUrl: 121 | log_info('[*] getting free eBook from newsletter') 122 | try: 123 | packtpub.resetInfo() 124 | packtpub.runNewsletter(currentNewsletterUrl) 125 | handleClaim(packtpub, args, config, dir_path) 126 | 127 | with open(lastNewsletterUrlPath, 'w+') as f: 128 | f.write(currentNewsletterUrl) 129 | 130 | except AlreadyClaimedException as a: 131 | log_info('[*] book was already claimed, skipping') 132 | with open(lastNewsletterUrlPath, 'w+') as f: 133 | f.write(currentNewsletterUrl) 134 | except Exception as e: 135 | log_debug(e) 136 | if args.notify: 137 | Notify(config, packtpub.info, None, args.notify).sendError(e, 'newsletter') 138 | else: 139 | log_info('[*] already got latest ebook from newsletter, skipping') 140 | 141 | except KeyboardInterrupt: 142 | log_error('[-] interrupted manually') 143 | 144 | except Exception as e: 145 | log_debug(e) 146 | if args.notify: 147 | Notify(config, None, None, args.notify).sendError(e, 'global') 148 | 149 | log_info('[*] done') 150 | 151 | if __name__ == '__main__': 152 | print (""" 153 | __ __ __ __ 154 | ____ ____ ______/ /__/ /_____ __ __/ /_ ______________ __ __/ /__ _____ 155 | / __ \/ __ `/ ___/ //_/ __/ __ \/ / / / __ \______/ ___/ ___/ __ `/ | /| / / / _ \/ ___/ 156 | / /_/ / /_/ / /__/ ,< / /_/ /_/ / /_/ / /_/ /_____/ /__/ / / /_/ /| |/ |/ / / __/ / 157 | / .___/\__,_/\___/_/|_|\__/ .___/\__,_/_.___/ \___/_/ \__,_/ |__/|__/_/\___/_/ 158 | /_/ /_/ 159 | 160 | Download FREE eBook every day from www.packtpub.com 161 | @see github.com/niqdev/packtpub-crawler 162 | """) 163 | main() 164 | -------------------------------------------------------------------------------- /dev/public/myEbooks.html: 
--------------------------------------------------------------------------------
[HTML markup lost in extraction — this fixture mimics the Packt "My eBooks" account page: a #product-account-list with two .product-line entries, "Instant Handlebars.js [eBook]" by Gabriel Manricks and "Scaling Big Data with Hadoop and Solr [eBook]" by Hrishikesh Vijay Karambelkar, each with expandable download links.]
--------------------------------------------------------------------------------
/script/packtpub.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 | from os.path import split, join
4 | from utils import make_soup, wait, download_file
5 | from logs import *
6 | from noBookException import NoBookException
7 | from alreadyClaimedException import AlreadyClaimedException
8 |
9 | class Packtpub(object):
10 |     """
11 |     """
12 |
13 |     def __init__(self, config, dev):
14 |         self.__config = config
15 |         self.__dev = dev
16 |         self.__delay = float(self.__config.get('delay', 'delay.requests'))
17 |         self.__url_base = self.__config.get('url', 'url.base')
18 |         self.__headers = self.__init_headers()
19 |         self.__session = requests.Session()
20 |         self.resetInfo()
21 |
22 |     def resetInfo(self):
23 |         self.info = {
24 |             'paths': []
25 |         }
26 |
27 |     def __init_headers(self):
28 |         # improvement: random user agent
29 |         return {
30 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
31 |             'Accept-Encoding': 'gzip, deflate',
32 |             'Connection': 'keep-alive',
33 |             'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0.1; SM-G920V Build/MMB29K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Mobile Safari/537.36'
34 |         }
35 |
36 |     def __log_response(self, response, method='GET', detail=False):
37 |         if detail:
38 |             print '[-] {0} {1} | {2}'.format(method, response.url, response.status_code)
39 |             print '[-] cookies:'
40 |             log_dict(requests.utils.dict_from_cookiejar(self.__session.cookies))
41 |             print '[-] headers:'
42 |             log_dict(response.headers)
43 |
44 |     def __GET_login(self, url):
45 |         response = self.__session.get(url, headers=self.__headers)
46 |         self.__log_response(response, 'GET', self.__dev)
47 |
48 |         soup = make_soup(response)
49 |
50 |         form = soup.find('form', {'id': 'packt-user-login-form'})
51 |
52 |         if form is None:
53 |             raise Exception('Could not find login form')
54 |
55 |         self.info['form_build_id'] = form.find('input', attrs={'name': 'form_build_id'})['value']
56 |         self.info['form_id'] = form.find('input', attrs={'name': 'form_id'})['value']
57 |
58 |         return soup
59 |
60 |     def __POST_login(self, url):
61 |         data = self.info.copy()
62 |         data['email'] = self.__config.get('credential', 'credential.email')
63 |         data['password'] = self.__config.get('credential', 'credential.password')
64 |         data['op'] = 'Login'
65 |         # print '[-] data: {0}'.format(urllib.urlencode(data))
66 |
67 |         response = None
68 |         if self.__dev:
69 |             response = self.__session.get(url, headers=self.__headers, data=data)
70 |             self.__log_response(response, 'GET', self.__dev)
71 |         else:
72 |             response = self.__session.post(url, headers=self.__headers, data=data)
73 |             self.__log_response(response, 'POST', self.__dev)
74 |
75 |         soup = make_soup(response)
76 |
77 |         error_node = soup.find('div', {'class': 'messages error'})
78 |
79 |         if error_node is not None:
80 |             raise Exception(error_node.text.strip())
81 |
82 |     def __parseDailyBookInfo(self, soup):
83 |         div_target = soup.find('div', {'id': 'deal-of-the-day'})
84 |
85 |         if div_target is None:
86 |             raise NoBookException('no free eBook today')
87 |
88 |         title = div_target.select('div.dotd-title > h2')[0].text.strip()
89 |         self.info['title'] = title
90 |         self.info['filename'] = title.encode('ascii', 'ignore').replace(' ', '_')
91 |         self.info['description'] = div_target.select('div.dotd-main-book-summary > div')[2].text.strip()
92 |         self.info['url_image'] = 'https:' +
div_target.select('div.dotd-main-book-image img')[0]['data-original'] 93 | self.info['url_claim'] = self.__url_base + div_target.select('a.twelve-days-claim')[0]['href'] 94 | # remove useless info 95 | self.info.pop('form_build_id', None) 96 | self.info.pop('form_id', None) 97 | 98 | def __parseNewsletterBookInfo(self, soup): 99 | div_target = soup.find('div', {'id': 'main-book'}) 100 | 101 | urlWithTitle = div_target.select('div.promo-landing-book-picture a')[0]['href'] 102 | title = urlWithTitle.split('/')[-1].replace('-', ' ').title() 103 | claimNode = div_target.select('div.promo-landing-book-info a') 104 | 105 | self.info['title'] = title 106 | self.info['filename'] = title.replace(' ', '_').encode('ascii', 'ignore') 107 | self.info['description'] = div_target.select('div.promo-landing-book-body > div')[0].text.strip() 108 | self.info['url_image'] = 'https:' + div_target.select('div.promo-landing-book-picture img')[0]['src'] 109 | self.info['url_claim'] = self.__url_base + claimNode[0]['href'] 110 | # remove useless info 111 | self.info.pop('form_build_id', None) 112 | self.info.pop('form_id', None) 113 | 114 | def __GET_claim(self): 115 | if self.__dev: 116 | url = self.__url_base + self.__config.get('url', 'url.account') 117 | else: 118 | url = self.info['url_claim'] 119 | 120 | response = self.__session.get(url, headers=self.__headers) 121 | self.__log_response(response, 'GET', self.__dev) 122 | 123 | soup = make_soup(response) 124 | div_target = soup.find('div', {'id': 'product-account-list'}) 125 | 126 | if div_target is None: 127 | raise Exception('Could not access claim page. This is most likely caused by invalid credentials') 128 | 129 | errorMessage = soup.find(id='messages-container') 130 | 131 | if errorMessage is not None and errorMessage.text.strip() == 'You have already claimed this promotion.': 132 | raise AlreadyClaimedException() 133 | 134 | # only last one just claimed 135 | div_claimed_book = div_target.select('.product-line')[0] 136 | self.info['book_id'] = div_claimed_book['nid'] 137 | self.info['author'] = div_claimed_book.find(class_='author').text.strip() 138 | 139 | source_code = div_claimed_book.find(href=re.compile('/code_download/*')) 140 | if source_code is not None: 141 | self.info['url_source_code'] = self.__url_base + source_code['href'] 142 | 143 | def runDaily(self): 144 | """ 145 | """ 146 | if self.__dev: 147 | loginUrl = self.__url_base + self.__config.get('url', 'url.loginGet') 148 | else: 149 | loginUrl = self.__url_base + self.__config.get('url', 'url.login') 150 | 151 | self.info['landingPageUrl'] = loginUrl 152 | 153 | soup = self.__GET_login(loginUrl) 154 | wait(self.__delay, self.__dev) 155 | 156 | if self.__dev: 157 | loginUrl = self.__url_base + self.__config.get('url', 'url.loginPost') 158 | 159 | self.__POST_login(loginUrl) 160 | wait(self.__delay, self.__dev) 161 | self.__parseDailyBookInfo(soup) 162 | wait(self.__delay, self.__dev) 163 | self.__GET_claim() 164 | wait(self.__delay, self.__dev) 165 | 166 | def runNewsletter(self, currentNewsletterUrl): 167 | """ 168 | """ 169 | 170 | soup = self.__GET_login(currentNewsletterUrl) 171 | self.info['landingPageUrl'] = currentNewsletterUrl 172 | 173 | self.__parseNewsletterBookInfo(soup) 174 | wait(self.__delay, self.__dev) 175 | self.__GET_claim() 176 | wait(self.__delay, self.__dev) 177 | 178 | def download_ebooks(self, types, base_path): 179 | """ 180 | """ 181 | downloads_info = [dict(type=type, 182 | url=self.__url_base + self.__config.get('url', 
'url.download').format(self.info['book_id'], type),
183 |                            filename=self.info['filename'] + '.' + type)
184 |                           for type in types]
185 |
186 |         # https://github.com/niqdev/packtpub-crawler/pull/27
187 |         if self.__config.has_option('path', 'path.group'):
188 |
189 |             folder_name = self.info['title'].encode('ascii', 'ignore').replace(' ', '_') + \
190 |                           self.info['author'].encode('ascii', 'ignore').replace(' ', '_')
191 |
192 |             directory = base_path + join(self.__config.get('path', 'path.ebooks'), folder_name)
193 |         else:
194 |             directory = base_path + self.__config.get('path', 'path.ebooks')
195 |
196 |         for download in downloads_info:
197 |             self.info['paths'].append(
198 |                 download_file(self.__session, download['url'], directory, download['filename'], self.__headers))
199 |
200 |     def download_extras(self, base_path):
201 |         """
202 |         """
203 |
204 |         # https://github.com/niqdev/packtpub-crawler/pull/27
205 |         if self.__config.has_option('path', 'path.group'):
206 |
207 |             folder_name = self.info['title'].encode('ascii', 'ignore').replace(' ', '_') + \
208 |                           self.info['author'].encode('ascii', 'ignore').replace(' ', '_')
209 |
210 |             directory = base_path + join(self.__config.get('path', 'path.ebooks'), folder_name, self.__config.get('path', 'path.extras'))
211 |         else:
212 |             directory = base_path + self.__config.get('path', 'path.extras')
213 |
214 |         url_image = self.info['url_image']
215 |         filename = self.info['filename'] + '_' + split(url_image)[1]
216 |         self.info['paths'].append(download_file(self.__session, url_image, directory, filename, self.__headers))
217 |
218 |         if 'url_source_code' in self.info:
219 |             self.info['paths'].append(download_file(self.__session, self.info['url_source_code'], directory,
220 |                                                     self.info['filename'] + '.zip', self.__headers))
221 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # packtpub-crawler
2 |
3 | ### Download FREE eBook every day from [www.packtpub.com](https://www.packtpub.com/packt/offers/free-learning)
4 |
5 | This crawler automates the following steps:
6 |
7 | * access your private account
8 | * claim the daily free eBook and the weekly newsletter eBook
9 | * parse title, description and other useful information
10 | * download your favorite format: *.pdf .epub .mobi*
11 | * download source code and book cover
12 | * upload files to Google Drive, OneDrive or via scp
13 | * store data on Firebase
14 | * notify via Gmail, IFTTT, Join or Pushover (on success and errors)
15 | * schedule a daily job on Heroku or with Docker
16 |
17 | ### Default command
18 | ```bash
19 | # upload pdf to googledrive, store data and notify via email
20 | python script/spider.py -c config/prod.cfg -u googledrive -s firebase -n gmail
21 | ```
22 |
23 | ### Other options
24 | ```bash
25 | # download all formats
26 | python script/spider.py --config config/prod.cfg --all
27 |
28 | # download only one format: pdf|epub|mobi
29 | python script/spider.py --config config/prod.cfg --type pdf
30 |
31 | # also download additional material: source code (if it exists) and the book cover
32 | python script/spider.py --config config/prod.cfg -t pdf --extras
33 | # equivalent (default is pdf)
34 | python script/spider.py -c config/prod.cfg -e
35 |
36 | # download and then upload to Google Drive (anyone with the download url can download it)
37 | python script/spider.py -c config/prod.cfg -t epub --upload googledrive
38 | python script/spider.py --config config/prod.cfg --all --extras --upload googledrive
39 |
40 | # download and then upload to OneDrive (anyone with the download url can download it)
41 | python script/spider.py -c config/prod.cfg -t epub --upload onedrive
42 | python script/spider.py --config config/prod.cfg --all --extras --upload onedrive
43 |
44 | # download and notify: gmail|ifttt|join|pushover
45 | python script/spider.py -c config/prod.cfg --notify gmail
46 |
47 | # only claim the book (no downloads):
48 | python script/spider.py -c config/prod.cfg --notify gmail --claimOnly
49 | ```
50 |
51 | ### Basic setup
52 |
53 | Before you start you should
54 |
55 | * Verify that your currently installed version of Python is **2.x** with `python --version`
56 | * Clone the repository `git clone https://github.com/niqdev/packtpub-crawler.git`
57 | * Install all the dependencies `pip install -r requirements.txt` (see also [virtualenv](https://github.com/niqdev/packtpub-crawler#virtualenv))
58 | * Create a [config](https://github.com/niqdev/packtpub-crawler/blob/master/config/prod_example.cfg) file `cp config/prod_example.cfg config/prod.cfg`
59 | * Change your Packtpub credentials in the config file
60 | ```
61 | [credential]
62 | credential.email=PACKTPUB_EMAIL
63 | credential.password=PACKTPUB_PASSWORD
64 | ```
65 |
66 | Now you should be able to claim and download your first eBook
67 | ```
68 | python script/spider.py --config config/prod.cfg
69 | ```
70 |
71 | ### Google Drive
72 |
73 | From the documentation, the Google Drive API requires OAuth2.0 for authentication, so to upload files you should:
74 |
75 | * Go to the [Google APIs Console](https://code.google.com/apis/console) and create a new [Google Drive](https://console.developers.google.com/apis/api/drive/overview) project named **PacktpubDrive**
76 | * On the *API manager > Overview* menu
77 |   * Enable the Google Drive API
78 | * On the *API manager > Credentials* menu
79 |   * In the *OAuth consent screen* tab, set **PacktpubDrive** as the product name shown to users
80 |   * In the *Credentials* tab, create credentials of type *OAuth client ID*, choose Application type *Other* and name them **PacktpubDriveCredentials**
81 | * Click *Download JSON* and save the file as `config/client_secrets.json`
82 | * Change your Google Drive credentials in the config file
83 |
84 | ```
85 | [googledrive]
86 | ...
87 | googledrive.client_secrets=config/client_secrets.json
88 | googledrive.gmail=GOOGLE_DRIVE@gmail.com
89 | ```
90 |
91 | Now you should be able to upload your eBook to Google Drive
92 | ```
93 | python script/spider.py --config config/prod.cfg --upload googledrive
94 | ```
95 |
96 | Only the first time, you will be prompted to log in in a browser with JavaScript enabled (no text-based browser) to generate `config/auth_token.json`.
97 | You should also copy the *FOLDER_ID* into the config, otherwise a new folder with the same name will be created every time.
98 | ```
99 | [googledrive]
100 | ...
101 | googledrive.default_folder=packtpub
102 | googledrive.upload_folder=FOLDER_ID
103 | ```
104 |
105 | Documentation: [OAuth](https://developers.google.com/api-client-library/python/guide/aaa_oauth), [Quickstart](https://developers.google.com/drive/v3/web/quickstart/python), [example](https://github.com/googledrive/python-quickstart) and [permissions](https://developers.google.com/drive/v2/reference/permissions)
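For reference, the browser prompt above is the standard oauth2client installed-app flow; this is essentially what `__save_credentials` in script/googledrive.py does on that first run (condensed, with the config values hard-coded to the paths shown above):

```python
from oauth2client.client import flow_from_clientsecrets, OOB_CALLBACK_URN
from oauth2client.file import Storage

flow = flow_from_clientsecrets('config/client_secrets.json',
                               'https://www.googleapis.com/auth/drive',
                               OOB_CALLBACK_URN)
authorize_url = flow.step1_get_authorize_url()  # open this in a JS-capable browser

code = raw_input('[*] Please, enter verification code: ').strip()
credentials = flow.step2_exchange(code)
Storage('config/auth_token.json').put(credentials)  # reused on every later run
```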
101 | googledrive.default_folder=packtpub 102 | googledrive.upload_folder=FOLDER_ID 103 | ``` 104 | 105 | Documentation: [OAuth](https://developers.google.com/api-client-library/python/guide/aaa_oauth), [Quickstart](https://developers.google.com/drive/v3/web/quickstart/python), [example](https://github.com/googledrive/python-quickstart) and [permissions](https://developers.google.com/drive/v2/reference/permissions) 106 | 107 | ### OneDrive 108 | 109 | As stated in the documentation, the OneDrive API requires OAuth 2.0 for authentication, so to upload files you should: 110 | 111 | 112 | * Go to the [Microsoft Application Registration Portal](https://apps.dev.microsoft.com/?referrer=https%3A%2F%2Fdev.onedrive.com%2Fapp-registration.htm). 113 | * When prompted, sign in with your Microsoft account credentials. 114 | * Find **My applications** and click **Add an app**. 115 | * Enter **PacktpubDrive** as the app's name and click **Create application**. 116 | * Scroll to the bottom of the page and check the **Live SDK support** box. 117 | * Change your OneDrive credentials in the config file 118 | * Copy your **Application Id** into the config file as **onedrive.client_id** 119 | * Click **Generate New Password** and copy the password shown into the config file as **onedrive.client_secret** 120 | * Click **Add Platform** and select **Web** 121 | * Enter **http://localhost:8080/** as the **Redirect URL** 122 | * Click **Save** at the bottom of the page 123 | 124 | ``` 125 | [onedrive] 126 | ... 127 | onedrive.client_id=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx 128 | onedrive.client_secret=XxXxXxXxXxXxXxXxXxXxXxX 129 | ``` 130 | 131 | Now you should be able to upload your eBook to OneDrive 132 | ``` 133 | python script/spider.py --config config/prod.cfg --upload onedrive 134 | ``` 135 | 136 | Only the first time, you will be prompted to log in via a browser with JavaScript enabled (no text-based browsers) to generate `config/session.onedrive.pickle`. 137 | ``` 138 | [onedrive] 139 | ... 
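# note (assumption, by analogy with googledrive.default_folder): the remote folder where eBooks are uploaded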
140 | onedrive.folder=packtpub 141 | ``` 142 | 143 | Documentation: [Registration](https://dev.onedrive.com/app-registration.htm), [Python API](https://github.com/OneDrive/onedrive-sdk-python) 144 | 145 | ### SCP 146 | 147 | To upload your eBook to a remote server via `scp`, update the config 148 | 149 | ``` 150 | [scp] 151 | scp.host=SCP_HOST 152 | scp.user=SCP_USER 153 | scp.password=SCP_PASSWORD 154 | scp.path=SCP_UPLOAD_PATH 155 | ``` 156 | 157 | Now you should be able to upload your eBook 158 | ``` 159 | python script/spider.py --config config/prod.cfg --upload scp 160 | ``` 161 | 162 | Note: 163 | * the destination folder `scp.path` on the remote server must exist in advance 164 | * the option `--upload scp` is incompatible with `--store` and `--notify` 165 | 166 | ### Firebase 167 | 168 | Create a new Firebase [project](https://console.firebase.google.com/) and copy the database secret from your settings 169 | ``` 170 | https://console.firebase.google.com/project/PROJECT_NAME/settings/database 171 | ``` 172 | and update the config 173 | ``` 174 | [firebase] 175 | firebase.database_secret=DATABASE_SECRET 176 | firebase.url=https://PROJECT_NAME.firebaseio.com 177 | ``` 178 | 179 | Now you should be able to store your eBook details on Firebase 180 | ``` 181 | python script/spider.py --config config/prod.cfg --upload googledrive --store firebase 182 | ``` 183 | 184 | ### Gmail notification 185 | 186 | To *send* a notification via email using Gmail you should: 187 | 188 | * Allow ["less secure apps"](https://www.google.com/settings/security/lesssecureapps) and ["DisplayUnlockCaptcha"](https://accounts.google.com/DisplayUnlockCaptcha) on your account 189 | * See [Troubleshoot](https://support.google.com/mail/answer/78754) for sign-in problems and these [examples](http://stackoverflow.com/questions/10147455/how-to-send-an-email-with-gmail-as-provider-using-python) 190 | * Change your Gmail credentials in the config file 191 | 192 | ``` 193 | [gmail] 194 | ... 
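# note (assumption): accounts with 2-step verification may need an app password
# (https://myaccount.google.com/apppasswords) in place of the account password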
195 | gmail.username=EMAIL_USERNAME@gmail.com 196 | gmail.password=EMAIL_PASSWORD 197 | gmail.from=FROM_EMAIL@gmail.com 198 | gmail.to=TO_EMAIL_1@gmail.com,TO_EMAIL_2@gmail.com 199 | ``` 200 | 201 | Now you should be able to notify your recipients 202 | ``` 203 | python script/spider.py --config config/prod.cfg --notify gmail 204 | ``` 205 | 206 | ### IFTTT notification 207 | 208 | * Get an account on [IFTTT](https://ifttt.com) 209 | * Go to your Maker [settings](https://ifttt.com/services/maker/settings) and activate the channel 210 | * [Create](https://ifttt.com/create) a new applet using the Maker service with the trigger "Receive a web request" and the event name "packtpub-crawler" 211 | * Change your IFTTT [key](https://internal-api.ifttt.com/maker) in the config file 212 | 213 | ``` 214 | [ifttt] 215 | ifttt.event_name=packtpub-crawler 216 | ifttt.key=IFTTT_MAKER_KEY 217 | ``` 218 | 219 | Now you should be able to trigger the applet 220 | ``` 221 | python script/spider.py --config config/prod.cfg --notify ifttt 222 | ``` 223 | 224 | Value mappings: 225 | * value1: title 226 | * value2: description 227 | * value3: landing page URL 228 | 229 | ### Join notification 230 | 231 | * Get the Join [Chrome extension](https://chrome.google.com/webstore/detail/join-by-joaoapps/flejfacjooompmliegamfbpjjdlhokhj) and/or [App](https://play.google.com/store/apps/details?id=com.joaomgcd.join) 232 | * You can find your device ids [here](https://joinjoaomgcd.appspot.com/) 233 | * (Optional) You can use multiple devices or groups (group.all, group.android, group.chrome, group.windows10, group.phone, group.tablet, group.pc) separated by commas 234 | * Change your Join credentials in the config file 235 | 236 | ``` 237 | [join] 238 | join.device_ids=DEVICE_IDS_COMMA_SEPARATED_OR_GROUP_NAME 239 | join.api_key=API_KEY 240 | ``` 241 | 242 | Now you should be able to trigger the event 243 | ``` 244 | python script/spider.py --config config/prod.cfg --notify join 245 | ``` 246 | 247 | ### Pushover notification 248 | 249 | * Get your [USER_KEY](https://pushover.net/) 250 | * Create a [new application](https://pushover.net/apps/build) 251 | * (Optional) Add an [icon](https://pushover.net/icons/9aqpv697p9g6wzo.png) 252 | * Change your Pushover credentials in the config file 253 | 254 | ``` 255 | [pushover] 256 | pushover.user_key=PUSHOVER_USER_KEY 257 | pushover.api_key=PUSHOVER_API_KEY 258 | ``` 259 | Now you should be able to send a notification with `--notify pushover`. 260 | ### Heroku 261 | 262 | Create a new branch 263 | ``` 264 | git checkout -b heroku-scheduler 265 | ``` 266 | 267 | Update the `.gitignore` and commit your changes 268 | ```bash 269 | # remove 270 | config/prod.cfg 271 | config/client_secrets.json 272 | config/auth_token.json 273 | # add 274 | dev/ 275 | config/dev.cfg 276 | config/prod_example.cfg 277 | ``` 278 | 279 | Create, configure and deploy the scheduler 280 | ```bash 281 | heroku login 282 | # create a new app 283 | heroku create APP_NAME --region eu 284 | # or if you already have an existing app 285 | heroku git:remote -a APP_NAME 286 | 287 | # deploy your app 288 | git push -u heroku heroku-scheduler:master 289 | heroku ps:scale clock=1 290 | 291 | # useful commands 292 | heroku ps 293 | heroku logs --ps clock.1 294 | heroku logs --tail 295 | heroku run bash 296 | ``` 297 | 298 | Update `script/scheduler.py` with your own preferences, for example as sketched below. 
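A minimal sketch of a custom trigger (the hour and minute here are placeholders, not the project defaults):

```python
# run the daily job at 07:30 instead; any APScheduler cron field can be overridden
@sched.scheduled_job('cron', day_of_week='mon-sun', hour=7, minute=30)
def scheduled_job():
    # keep the original job body here
    pass
```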
299 | 300 | More info about Heroku [Scheduler](https://devcenter.heroku.com/articles/scheduler), [Clock Processes](https://devcenter.heroku.com/articles/clock-processes-python), [Add-on](https://elements.heroku.com/addons/scheduler) and [APScheduler](http://apscheduler.readthedocs.io/en/latest/userguide.html) 301 | 302 | ### Docker 303 | 304 | Build your image 305 | ``` 306 | docker build -t niqdev/packtpub-crawler:2.4.0 . 307 | ``` 308 | 309 | Run manually 310 | ``` 311 | docker run \ 312 | --rm \ 313 | --name my-packtpub-crawler \ 314 | niqdev/packtpub-crawler:2.4.0 \ 315 | python script/spider.py --config config/prod.cfg 316 | ``` 317 | 318 | Run the scheduled crawler in the background 319 | ``` 320 | docker run \ 321 | --detach \ 322 | --name my-packtpub-crawler \ 323 | niqdev/packtpub-crawler:2.4.0 324 | 325 | # useful commands 326 | docker exec -i -t my-packtpub-crawler bash 327 | docker logs -f my-packtpub-crawler 328 | ``` 329 | 330 | Alternatively, you can pull this [fork](https://github.com/kuchy/packtpub-crawler/tree/docker_cron) from [Docker Hub](https://hub.docker.com/r/kuchy/packtpub-crawler/) 331 | ``` 332 | docker pull kuchy/packtpub-crawler 333 | ``` 334 | 335 | ### Cron job 336 | Add this to your crontab to run the job daily at 9 AM: 337 | ``` 338 | crontab -e 339 | 340 | 00 09 * * * cd PATH_TO_PROJECT/packtpub-crawler && /usr/bin/python script/spider.py --config config/prod.cfg >> /tmp/packtpub.log 2>&1 341 | ``` 342 | 343 | 344 | ### Systemd service 345 | Create two files in /etc/systemd/system: 346 | 347 | 1. packtpub-crawler.service 348 | ``` 349 | [Unit] 350 | Description=run packtpub-crawler 351 | 352 | [Service] 353 | User=USER_THAT_SHOULD_RUN_THE_SCRIPT 354 | WorkingDirectory=PATH_TO_PROJECT/packtpub-crawler 355 | ExecStart=/usr/bin/python2.7 PATH_TO_PROJECT/packtpub-crawler/script/spider.py -c config/prod.cfg 356 | [Install] 357 | WantedBy=multi-user.target 358 | ``` 359 | 360 | 2. packtpub-crawler.timer 361 | ``` 362 | [Unit] 363 | Description=Runs packtpub-crawler every day at 07:00 364 | 365 | [Timer] 366 | OnBootSec=10min 367 | OnActiveSec=1s 368 | OnCalendar=*-*-* 07:00:00 369 | Unit=packtpub-crawler.service 370 | Persistent=true 371 | 372 | [Install] 373 | WantedBy=timers.target 374 | ``` 375 | 376 | Enable the script with ```sudo systemctl enable packtpub-crawler.timer```. 377 | You can test the service with ```sudo systemctl start packtpub-crawler.timer``` and see the output with ```sudo journalctl -u packtpub-crawler.service -f```. 378 | 379 | 380 | ### Newsletter 381 | The script also downloads the free eBooks from the weekly Packtpub newsletter. 382 | The [URL](https://goo.gl/kUciut) is generated by a Google Apps Script which parses all the mails. 383 | You can get the code [here](https://gist.github.com/juzim/af0ef80f1233de51614d88551514b0ad). If you want to see the actual script, clone the [spreadsheet](https://docs.google.com/spreadsheets/d/1jN5gV45uVkE0EEF4Nb-yVNfIr3o8OoiVveUZJRMiLFw) and go to `Tools > Script editor...`. 384 | 385 | To use your own source, update the config 386 | ``` 387 | url.bookFromNewsletter=https://goo.gl/kUciut 388 | ``` 389 | 390 | The URL should point to a file containing only the URL (no semicolons, HTML, JSON, etc.), as in the example below. 391 | 392 | You can also clone the [spreadsheet](https://docs.google.com/spreadsheets/d/1jN5gV45uVkE0EEF4Nb-yVNfIr3o8OoiVveUZJRMiLFw) to use your own Gmail account. Subscribe to the [newsletter](https://www.packtpub.com) (at the bottom of the page) and create a filter to tag your mails accordingly. 
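A valid source file is a single plain-text line holding just the URL, e.g. (illustrative value only):

```
https://www.packtpub.com/packt/offers/free-learning
```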
393 | 394 | 395 | ### Troubleshooting 396 | * `ImportError: No module named paramiko` 397 | 398 | Install paramiko with `sudo -H pip install paramiko --ignore-installed` 399 | 400 | * `Failed building wheel for cryptography` 401 | 402 | Install the missing dependencies as described [here](https://cryptography.io/en/latest/installation/#building-cryptography-on-windows) 403 | 404 | ### virtualenv 405 | 406 | ``` 407 | # install pip + setuptools 408 | curl https://bootstrap.pypa.io/get-pip.py | python - 409 | 410 | # upgrade pip 411 | pip install -U pip 412 | 413 | # install virtualenv globally 414 | sudo pip install virtualenv 415 | 416 | # create virtualenv 417 | virtualenv env 418 | 419 | # activate virtualenv 420 | source env/bin/activate 421 | 422 | # verify virtualenv 423 | which python 424 | python --version 425 | 426 | # deactivate virtualenv 427 | deactivate 428 | ``` 429 | 430 | ### Development (only for spidering) 431 | Run a simple static server with 432 | ``` 433 | node dev/server.js 434 | ``` 435 | and test the crawler with 436 | ``` 437 | python script/spider.py --dev --config config/dev.cfg --all 438 | ``` 439 | 440 | ### Disclaimer 441 | 442 | This project is just a Proof of Concept and is not intended for any illegal usage. I'm not responsible for any damage or abuse; use it at your own risk. 443 | --------------------------------------------------------------------------------