├── script
│   ├── __init__.py
│   ├── notification
│   │   ├── __init__.py
│   │   ├── mypushover.py
│   │   ├── ifttt.py
│   │   ├── join.py
│   │   └── gmail.py
│   ├── noBookException.py
│   ├── alreadyClaimedException.py
│   ├── scheduler.py
│   ├── logs.py
│   ├── upload.py
│   ├── notify.py
│   ├── database.py
│   ├── scpUpload.py
│   ├── utils.py
│   ├── onedrive.py
│   ├── googledrive.py
│   ├── spider.py
│   └── packtpub.py
├── .dockerignore
├── Procfile
├── dev
│   ├── public
│   │   ├── urlFromNewsletter.html
│   │   ├── loginGet.html
│   │   ├── loginPost.html
│   │   ├── loginPostNewsletter.html
│   │   └── myEbooks.html
│   ├── package.json
│   └── server.js
├── .gitignore
├── Dockerfile
├── requirements.txt
├── config
│   ├── dev.cfg
│   └── prod_example.cfg
├── LICENSE
└── README.md
/script/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/script/notification/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | dev/
2 | ebooks/
3 |
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | clock: python ./script/scheduler.py
2 |
--------------------------------------------------------------------------------
/script/noBookException.py:
--------------------------------------------------------------------------------
1 | class NoBookException(Exception):
2 | pass
3 |
--------------------------------------------------------------------------------
/dev/public/urlFromNewsletter.html:
--------------------------------------------------------------------------------
1 | http://localhost:8080/loginPostNewsletter.html
2 |
--------------------------------------------------------------------------------
/script/alreadyClaimedException.py:
--------------------------------------------------------------------------------
1 | class AlreadyClaimedException(Exception):
2 | pass
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | .DS_Store
3 |
4 | .idea
5 | *.iml
6 |
7 | *.pyc
8 | env/
9 |
10 | config/prod.cfg
11 | config/client_secrets.json
12 | config/auth_token.json
13 | config/session.onedrive.pickle
14 | config/lastNewsletterUrl
15 |
16 | ebooks/
17 | node_modules/
18 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:2.7
2 |
3 | WORKDIR /packtpub-crawler
4 |
5 | COPY script script
6 | COPY config config
7 | COPY requirements.txt requirements.txt
8 |
9 | RUN pip install -r requirements.txt
10 |
11 | CMD ["python", "/packtpub-crawler/script/scheduler.py"]
12 |
--------------------------------------------------------------------------------
/dev/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "author": "niqdev",
3 | "name": "packtpub-crawler",
4 | "version": "1.0.0",
5 | "repository": {
6 | "type": "git",
7 | "url": "https://github.com/niqdev/packtpub-crawler.git"
8 | },
9 | "dependencies": {
10 | "express": "^4.12.4",
11 | "morgan": "^1.6.0",
12 | "serve-static": "^1.9.3"
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | APScheduler==3.1.0
2 | beautifulsoup4==4.4.1
3 | clint==0.5.1
4 | google-api-python-client==1.3.2
5 | html5lib==0.9999999
6 | oauth2client==1.4.11
7 | python-magic==0.4.11
8 | requests==2.10.0
9 | termcolor==1.1.0
10 | urllib3==1.15.1
11 | python-firebase==1.2
12 | paramiko==2.0.2
13 | cryptography==1.6
14 | scp==0.10.2
15 | onedrivesdk==1.1.8
16 | python-pushover==0.3
17 |
--------------------------------------------------------------------------------
/script/scheduler.py:
--------------------------------------------------------------------------------
1 | from apscheduler.schedulers.blocking import BlockingScheduler
2 | import os
3 | import shutil
4 |
5 | sched = BlockingScheduler()
6 |
7 | @sched.scheduled_job('cron', day_of_week='mon-sun', hour=9)
8 | def scheduled_job():
9 | print('New job: packtpub-crawler')
10 | shutil.rmtree('./ebooks', ignore_errors=True)
11 | os.system('python script/spider.py --config config/prod.cfg --upload googledrive --store firebase --notify gmail')
12 |
13 | sched.start()
14 |
--------------------------------------------------------------------------------
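
The cron trigger above fires once a day at 09:00 server time. APScheduler's cron trigger also accepts cron-style expressions, so the cadence is easy to change; a minimal sketch (the `hour='9,21'` value is a hypothetical example, not part of the repo):

```python
from apscheduler.schedulers.blocking import BlockingScheduler

sched = BlockingScheduler()

# hypothetical variant of the job above: run twice a day, at 09:00 and 21:00
@sched.scheduled_job('cron', hour='9,21')
def twice_daily_job():
    print('New job: packtpub-crawler')

sched.start()
```
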
/dev/server.js:
--------------------------------------------------------------------------------
1 | /*
2 | // setup project
3 | npm init
4 | // build
5 | npm install
6 |
7 | // run dev
8 | node server.js
9 | */
10 |
11 | var express = require('express');
12 | var morgan = require('morgan');
13 | var serveStatic = require('serve-static');
14 |
15 | var PATH = __dirname + '/public';
16 | var PORT = 8080;
17 |
18 | var app = express();
19 |
20 | // logger
21 | app.use(morgan('dev'));
22 |
23 | app.use(serveStatic(PATH)).listen(PORT, function() {
24 | console.log("listening on port " + PORT);
25 | });
26 |
--------------------------------------------------------------------------------
/config/dev.cfg:
--------------------------------------------------------------------------------
1 | [url]
2 | url.base=http://localhost:8080
3 | url.loginGet=/loginGet.html
4 | url.loginPost=/loginPost.html
5 | url.account=/myEbooks.html
6 | # params: 0=id, 1=format
7 | url.download=/ebook_download/{0}/{1}
8 | url.bookFromNewsletter=http://localhost:8080/urlFromNewsletter.html
9 |
10 | #time in seconds
11 | [delay]
12 | delay.requests=2
13 |
14 | [credential]
15 | credential.email=test@mail.com
16 | credential.password=testPwd
17 |
18 | [path]
19 | path.ebooks=ebooks
20 | path.extras=ebooks/extras
21 | #path.group=true
22 |
23 | [googledrive]
24 | googledrive.oauth2_scope=https://www.googleapis.com/auth/drive
25 | googledrive.client_secrets=config/client_secrets.json
26 | googledrive.auth_token=config/auth_token.json
27 | googledrive.gmail=g00gler@gmail.com
28 |
--------------------------------------------------------------------------------
/script/logs.py:
--------------------------------------------------------------------------------
1 | from termcolor import cprint
2 | import json
3 | import sys, os, traceback
4 |
5 | def log_error(message):
6 | cprint(message, 'red')
7 |
8 | def log_warn(message):
9 | cprint(message, 'yellow')
10 |
11 | def log_info(message):
12 | cprint(message, 'cyan')
13 |
14 | def log_success(message):
15 | cprint(message, 'green')
16 |
17 | def log_json(list_dict):
18 | print json.dumps(list_dict, indent=2)
19 |
20 | def log_dict(dict):
21 | for key, elem in dict.items():
22 | print '\t[{0}] {1}'.format(key, elem)
23 |
24 | def log_debug(e, stacktrace=True):
25 | exc_type, exc_value, exc_traceback = sys.exc_info()
26 | fname = os.path.split(exc_traceback.tb_frame.f_code.co_filename)[1]
27 |
28 | log_warn('[-] {0} {1} | {2}@{3}'.format(exc_type, e, fname, exc_traceback.tb_lineno))
29 |
30 | if stacktrace:
31 | traceback.print_exc()
32 |
--------------------------------------------------------------------------------
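
For reference, a minimal usage sketch of these helpers (the values are made up); note that `log_debug` reads `sys.exc_info()`, so it expects to be called from inside an `except` block:

```python
from logs import log_info, log_dict, log_debug

log_info('[*] starting')
log_dict({'title': 'Some Book', 'format': 'pdf'})

try:
    1 / 0
except Exception as e:
    log_debug(e, stacktrace=False)
```
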
/script/notification/mypushover.py:
--------------------------------------------------------------------------------
1 | from logs import *
2 | import requests
3 | from pushover import Client
4 |
5 | class Pushover(object):
6 | """
7 | """
8 |
9 | def __init__(self, config, packpub_info, upload_info):
10 | self.__config = config
11 | self.__packpub_info = packpub_info
12 | self.__client = Client(self.__config.get('pushover', 'pushover.user_key'), api_token=self.__config.get('pushover', 'pushover.api_key'))
13 |
14 |
15 | def send(self):
16 | self.__client.send_message(self.__packpub_info['description'].encode('utf-8'), title="New book downloaded from Packt: " + self.__packpub_info['title'].encode('utf-8'), url="https://www.packtpub.com/packt/offers/free-learning", url_title="See more")
17 | log_success('[+] notification sent to pushover')
18 |
19 | def sendError(self, exception, source):
20 | self.__client.send_message(repr(exception), title='packtpub-crawler {source}: Could not download ebook'.format(source=source))
21 | log_success('[+] error notification sent to pushover')
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 niqdev
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/script/notification/ifttt.py:
--------------------------------------------------------------------------------
1 | from logs import *
2 | import requests
3 |
4 | class Ifttt(object):
5 | """
6 | """
7 |
8 | def __init__(self, config, packpub_info, upload_info):
9 | self.__packpub_info = packpub_info
10 | self.__url = "https://maker.ifttt.com/trigger/{eventName}/with/key/{apiKey}".format(
11 | eventName=config.get('ifttt', 'ifttt.event_name'),
12 | apiKey=config.get('ifttt', 'ifttt.key')
13 | )
14 |
15 | def send(self):
16 | r = requests.post(self.__url, data = {
17 | 'value1':self.__packpub_info['title'].encode('utf-8'),
18 | 'value2':self.__packpub_info['description'].encode('utf-8'),
19 | 'value3':self.__packpub_info['url_image']
20 | })
21 | log_success('[+] notification sent to IFTTT')
22 |
23 | def sendError(self, exception, source):
24 | title = "packtpub-crawler [{source}]: Could not download ebook".format(source=source)
25 | r = requests.post(self.__url, data = {'value1':title, 'value2':repr(exception), 'value3':self.__packpub_info['landingPageUrl']})
26 |
27 | log_success('[+] error notification sent to IFTTT')
28 |
--------------------------------------------------------------------------------
/script/upload.py:
--------------------------------------------------------------------------------
1 | from googledrive import GoogleDrive
2 | from onedrive import OneDrive
3 | from scpUpload import ScpUpload
4 | from logs import *
5 |
6 | SERVICE_GOOGLE_DRIVE = 'googledrive'
7 | SERVICE_ONEDRIVE = 'onedrive'
8 | SERVICE_DROPBOX = 'dropbox'
9 | SERVICE_SCP = 'scp'
10 |
11 | class Upload(object):
12 | """
13 | TODO interface or abstract class for upload services
14 | """
15 |
16 | def __init__(self, config, service_type):
17 | self.__config = config
18 | self.info = {
19 | 'details': []
20 | }
21 | if service_type == SERVICE_GOOGLE_DRIVE:
22 | self.service = GoogleDrive(config)
23 | elif service_type == SERVICE_ONEDRIVE:
24 | self.service = OneDrive(config)
25 | elif service_type == SERVICE_DROPBOX:
26 | raise NotImplementedError('not implemented yet!')
27 | elif service_type == SERVICE_SCP:
28 | self.service = ScpUpload(config)
29 |
30 | def run(self, paths):
31 | """
32 | """
33 | for path in paths:
34 | self.service.upload(path)
35 | self.info['details'].append(self.service.info)
36 | log_dict(self.service.info)
37 |
--------------------------------------------------------------------------------
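
As wired up in spider.py, the class is used roughly like this (a sketch; the config path and ebook path below are hypothetical stand-ins, and `paths` normally comes from `packtpub.info['paths']`):

```python
from utils import config_file
from upload import Upload, SERVICE_GOOGLE_DRIVE

config = config_file('config/prod.cfg')
upload = Upload(config, SERVICE_GOOGLE_DRIVE)
upload.run(['ebooks/Some_Book.pdf'])  # hypothetical local file
print upload.info['details']
```
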
/script/notify.py:
--------------------------------------------------------------------------------
1 | from notification.gmail import Gmail
2 | from notification.ifttt import Ifttt
3 | from logs import *
4 | from notification.join import Join
5 | from notification.mypushover import Pushover
6 |
7 | SERVICE_GMAIL = 'gmail'
8 | SERVICE_IFTTT = 'ifttt'
9 | SERVICE_JOIN = 'join'
10 | SERVICE_PUSHOVER = 'pushover'
11 |
12 | class Notify(object):
13 | """
14 | TODO interface or abstract class for notification services
15 | """
16 |
17 | def __init__(self, config, packpub_info, upload_info, service_type):
18 | self.__config = config
19 | self.info = {
20 | 'details': []
21 | }
22 | if service_type == SERVICE_GMAIL:
23 | self.service = Gmail(config, packpub_info, upload_info)
24 | elif service_type == SERVICE_IFTTT:
25 | self.service = Ifttt(config, packpub_info, upload_info)
26 | elif service_type == SERVICE_JOIN:
27 | self.service = Join(config, packpub_info, upload_info)
28 | elif service_type == SERVICE_PUSHOVER:
29 | self.service = Pushover(config, packpub_info, upload_info)
30 |
31 | def run(self):
32 | """
33 | """
34 | self.service.send()
35 |
36 |
37 | def sendError(self, exception, source):
38 | """
39 | """
40 | self.service.sendError(exception, source)
41 |
--------------------------------------------------------------------------------
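
Mirroring spider.py, a notification is fired like this (a sketch; `packtpub_info` is a hypothetical stand-in for the dict built during a real run, and `upload_info` may be `None` when nothing was uploaded):

```python
from utils import config_file
from notify import Notify, SERVICE_GMAIL

config = config_file('config/prod.cfg')
packtpub_info = {'title': u'Some Book', 'description': u'...', 'url_image': 'https://example.com/cover.jpg'}  # hypothetical
Notify(config, packtpub_info, None, SERVICE_GMAIL).run()
```
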
/script/database.py:
--------------------------------------------------------------------------------
1 | from logs import *
2 | from firebase.firebase import FirebaseApplication, FirebaseAuthentication
3 | import datetime
4 |
5 | DB_FIREBASE = 'firebase'
6 |
7 | class Database(object):
8 | """
9 | Store to database
10 | """
11 |
12 | def __init__(self, config, database_type, packpub_info, upload_info):
13 | self.__config = config
14 | self.__database_type = database_type
15 |
16 | data = packpub_info.copy()
17 | data['datetime'] = datetime.datetime.utcnow().isoformat()
18 | data.pop('paths', None)
19 | data.update(upload_info)
20 | self.__data = data
21 |
22 | def store(self):
23 | """
24 | """
25 | #log_json(self.__data)
26 |
27 | if self.__database_type == DB_FIREBASE:
28 | self.__store_firebase()
29 |
30 | def __store_firebase(self):
31 | """
32 | """
33 |
34 | authentication = FirebaseAuthentication(self.__config.get('firebase', 'firebase.database_secret'), None)
35 | #user = authentication.get_user()
36 | #print authentication.extra
37 | #print user.firebase_auth_token
38 |
39 | firebase = FirebaseApplication(self.__config.get('firebase', 'firebase.url'), authentication)
40 | result = firebase.post(self.__config.get('firebase', 'firebase.path'), self.__data)
41 |
42 | log_success('[+] Stored on firebase: {0}'.format(result['name']))
43 |
--------------------------------------------------------------------------------
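
spider.py only stores after a successful Google Drive or OneDrive upload; the call looks like this (a sketch with hypothetical stand-in dicts for `packtpub.info` and `upload.info`):

```python
from utils import config_file
from database import Database, DB_FIREBASE

config = config_file('config/prod.cfg')
packtpub_info = {'title': u'Some Book', 'paths': []}  # hypothetical
upload_info = {'details': []}                         # hypothetical
Database(config, DB_FIREBASE, packtpub_info, upload_info).store()
```
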
/dev/public/loginGet.html:
--------------------------------------------------------------------------------
(markup stripped in this dump: dev mock of the Packtpub login page, including the packt-user-login-form parsed by packtpub.py)
--------------------------------------------------------------------------------
/script/notification/join.py:
--------------------------------------------------------------------------------
1 | from logs import *
2 | import requests
3 |
4 | class Join(object):
5 | """
6 | """
7 |
8 | def __init__(self, config, packpub_info, upload_info):
9 | self.__config = config
10 | self.__packpub_info = packpub_info
11 |
12 | def send(self):
13 | url = "https://joinjoaomgcd.appspot.com/_ah/api/messaging/v1/sendPush?apikey={apiKey}&deviceId={deviceIds}&title={title}&text={description}".format(
14 | apiKey=self.__config.get('join', 'join.api_key'),
15 | deviceIds=self.__config.get('join', 'join.device_ids'),
16 | title="New book downloaded from Packt: " + self.__packpub_info['title'].encode('utf-8'),
17 | description=self.__packpub_info['description'].encode('utf-8')
18 | )
19 |
20 | r = requests.post(url)
21 |
22 | log_success('[+] notification sent to Join')
23 |
24 | def sendError(self, exception, source):
25 | url = "https://joinjoaomgcd.appspot.com/_ah/api/messaging/v1/sendPush?apikey={apiKey}&deviceId={deviceIds}&title={title}&text={description}&url={url}".format(
26 | apiKey=self.__config.get('join', 'join.api_key'),
27 | deviceIds=self.__config.get('join', 'join.device_ids'),
28 | title='packtpub-crawler {source}: Could not download ebook: {title}'.format(source=source, title=self.__packpub_info['title']),
29 | description=repr(exception),
30 | url=self.__packpub_info['landingPageUrl']
31 | )
32 |
33 | r = requests.post(url)
34 |
35 | log_success('[+] error notification sent to Join')
36 |
--------------------------------------------------------------------------------
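
Note that the title and description above are interpolated into the query string unencoded, so characters like `&` or spaces in a book title can break the request. A safer variant (a sketch against the same endpoint, with hypothetical values) lets requests do the encoding:

```python
import requests

# hypothetical values; same endpoint as above, but with encoded query parameters
r = requests.post('https://joinjoaomgcd.appspot.com/_ah/api/messaging/v1/sendPush', params={
    'apikey': 'API_KEY',
    'deviceId': 'DEVICE_IDS',
    'title': 'New book downloaded from Packt: Some Book',
    'text': 'Some description & more',
})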
/script/scpUpload.py:
--------------------------------------------------------------------------------
1 | from os.path import exists
2 | import magic
3 | from utils import thread_loader
4 | from logs import *
5 | import paramiko
6 | from scp import SCPClient
7 |
8 | class ScpUpload(object):
9 | """
10 | """
11 |
12 | def __init__(self, config):
13 | self.__config = config
14 | self.info = {}
15 |
16 | def __guess_info(self, file_path):
17 | if not exists(file_path):
18 | raise IOError('file not found!')
19 |
20 | self.info = {
21 | 'path': file_path,
22 | 'name': file_path.split('/')[-1],
23 | 'mime_type': magic.from_file(file_path, mime=True),
24 | }
25 | log_info('[+] new file upload via scp:')
26 | # log_dict(self.file_info)
27 |
28 | def __insert_file(self):
29 | print '[+] uploading file...'
30 | ssh = paramiko.SSHClient()
31 | ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
32 |
33 | # get config settings
34 | host = self.__config.get('scp', 'scp.host')
35 | user = self.__config.get('scp', 'scp.user')
36 | password = self.__config.get('scp', 'scp.password')
37 | timeout = self.__config.get('scp', 'scp.timeout')
38 | self.info['upload_path'] = self.__config.get('scp', 'scp.path')
39 |
40 | ssh.connect(host, username=user, password=password)
41 | scpclient = SCPClient(ssh.get_transport(), socket_timeout=float(timeout))
42 | scpclient.put(self.info['path'], self.info['upload_path'] + self.info['name'])
43 |
44 | def upload(self, file_path):
45 | self.__guess_info(file_path)
46 | thread_loader(self.__insert_file)
47 |
--------------------------------------------------------------------------------
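
Note: the remote target is built by plain string concatenation (`scp.path + name`), so the configured `scp.path` should end with a trailing slash, e.g. (hypothetical value)

```
[scp]
scp.path=/home/user/ebooks/
```
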
/config/prod_example.cfg:
--------------------------------------------------------------------------------
1 | [url]
2 | url.base=https://www.packtpub.com
3 | url.login=/packt/offers/free-learning
4 | # params: 0=id, 1=format
5 | url.download=/ebook_download/{0}/{1}
6 | url.bookFromNewsletter=https://goo.gl/kUciut
7 |
8 | #time in seconds
9 | [delay]
10 | delay.requests=2
11 |
12 | [credential]
13 | credential.email=PACKTPUB_EMAIL
14 | credential.password=PACKTPUB_PASSWORD
15 |
16 | [path]
17 | path.ebooks=ebooks
18 | path.extras=ebooks/extras
19 | #path.group=true
20 |
21 | [googledrive]
22 | googledrive.oauth2_scope=https://www.googleapis.com/auth/drive
23 | googledrive.client_secrets=config/client_secrets.json
24 | googledrive.auth_token=config/auth_token.json
25 | googledrive.gmail=GOOGLE_DRIVE@gmail.com
26 | googledrive.default_folder=packtpub
27 | #googledrive.upload_folder=FOLDER_ID
28 |
29 | [onedrive]
30 | onedrive.api_base_url=https://api.onedrive.com/v1.0/
31 | onedrive.client_id=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
32 | onedrive.client_secret=XxXxXxXxXxXxXxXxXxXxXxX
33 | onedrive.session_file=config/session.onedrive.pickle
34 | onedrive.folder=packtpub
35 |
36 | [scp]
37 | scp.host=SCP_HOST
38 | scp.user=SCP_USER
39 | scp.password=SCP_PASSWORD
40 | scp.timeout=15
41 | scp.path=SCP_UPLOAD_PATH
42 |
43 | [gmail]
44 | gmail.host=smtp.gmail.com
45 | gmail.port=587
46 | gmail.username=EMAIL_USERNAME@gmail.com
47 | gmail.password=EMAIL_PASSWORD
48 | gmail.from=FROM_EMAIL@gmail.com
49 | gmail.to=TO_EMAIL_1@gmail.com,TO_EMAIL_2@gmail.com
50 |
51 | [ifttt]
52 | ifttt.event_name=packtpub-crawler
53 | ifttt.key=IFTTT_MAKER_KEY
54 |
55 | [join]
56 | join.device_ids=DEVICE_IDS_COMMA_SEPARATED_OR_GROUP_NAME
57 | join.api_key=API_KEY
58 |
59 | [firebase]
60 | firebase.database_secret=FIREBASE_DATABASE_SECRET
61 | firebase.url=FIREBASE_URL
62 | firebase.path=/books
63 |
64 | [pushover]
65 | pushover.user_key=PUSHOVER_USER_KEY
66 | pushover.api_key=PUSHOVER_API_KEY
67 |
--------------------------------------------------------------------------------
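
All of these keys are read through Python's stock ConfigParser (see `utils.config_file`); a quick sketch of how a value is looked up:

```python
import ConfigParser

config = ConfigParser.ConfigParser()
config.read('config/prod_example.cfg')
print config.get('url', 'url.base')          # https://www.packtpub.com
print config.get('delay', 'delay.requests')  # 2
```
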
/script/utils.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import ConfigParser
3 | from bs4 import BeautifulSoup
4 | from time import sleep
5 | from clint.textui import progress
6 | import os, sys, itertools
7 | from threading import Thread
8 | from logs import *
9 |
10 | def ip_address():
11 | """
12 | Gets current IP address
13 | """
14 |
15 | response = requests.get('http://www.ip-addr.es')
16 | print '[-] GET {0} | {1}'.format(response.status_code, response.url)
17 | log_info('[+] ip address is: {0}'.format(response.text.strip()))
18 |
19 | def config_file(path):
20 | """
21 | Reads configuration file
22 | """
23 | if not os.path.exists(path):
24 | raise IOError('file not found!')
25 |
26 | log_info('[*] configuration file: {0}'.format(path))
27 | config = ConfigParser.ConfigParser()
28 | config.read(path)
29 | return config
30 |
31 | def make_soup(response, debug=False):
32 | """
33 | Makes soup from response
34 | """
35 |
36 | print '[*] fetching url... {0} | {1}'.format(response.status_code, response.url)
37 | soup = BeautifulSoup(response.text, 'html5lib')
38 | if debug:
39 | print soup.prettify().encode('utf-8')
40 | return soup
41 |
42 | def wait(delay, isDev):
43 | if delay > 0:
44 | if isDev:
45 | print '[-] going to sleep {0} seconds'.format(delay)
46 | sleep(delay)
47 |
48 | def download_file(r, url, directory, filename, headers):
49 | """
50 | Downloads file with progress bar
51 | """
52 | if not os.path.exists(directory):
53 | # creates directories recursively
54 | os.makedirs(directory)
55 | log_info('[+] created new directory: ' + directory)
56 |
57 | filename = filename.replace(':', '-')
58 | path = os.path.join(directory, filename)
59 |
60 | print '[-] downloading file from url: {0}'.format(url)
61 | response = r.get(url, headers=headers, stream=True)
62 | #log_dict(response.headers)
63 | total_length = 0
64 | test_length = response.headers.get('content-length')
65 | if test_length is not None:
66 | total_length = int(test_length)
67 |
68 | with open(path, 'wb') as f:
69 | for chunk in progress.bar(response.iter_content(chunk_size=1024), expected_size=(total_length/1024) + 1):
70 | if chunk:
71 | f.write(chunk)
72 | f.flush()
73 | log_success('[+] new download: {0}'.format(path))
74 | return path
75 |
76 | def thread_loader(function):
77 | """
78 | Starts a thread with loading bar
79 | """
80 |
81 | thread = Thread(target=function)
82 | thread.start()
83 | spinner = itertools.cycle(['-', '/', '|', '\\'])
84 | while thread.is_alive():
85 | sys.stdout.write(spinner.next())
86 | sys.stdout.flush()
87 | # erase the last written char
88 | sys.stdout.write('\b')
89 |
--------------------------------------------------------------------------------
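
A minimal sketch of `thread_loader` in action: the spinner runs on the main thread until the target function returns (the `slow_task` below is a hypothetical stand-in for an upload):

```python
from time import sleep
from utils import thread_loader

def slow_task():
    sleep(3)  # stand-in for a slow upload

thread_loader(slow_task)
```
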
/dev/public/loginPost.html:
--------------------------------------------------------------------------------
(markup stripped in this dump: dev mock page used by spider.py --dev; recoverable text below)

Time is running out to claim this free ebook

Scaling Big Data with Hadoop and Solr

Create an enterprise ready search engine by combining the power of Hadoop and flexibility of Solr thanks to this free ebook! Discover the basics of Hadoop and Solr that can be taken to any iteration, before learning how to index your data and fine-tune your big data search as you gain everything you need to make a distributed search platform optimized to your needs.
--------------------------------------------------------------------------------
/dev/public/loginPostNewsletter.html:
--------------------------------------------------------------------------------
(markup stripped in this dump: dev mock of the newsletter landing page used by spider.py --dev)
--------------------------------------------------------------------------------
/script/notification/gmail.py:
--------------------------------------------------------------------------------
1 | import smtplib
2 | from email.mime.multipart import MIMEMultipart
3 | from email.mime.text import MIMEText
4 | from logs import *
5 |
6 | class Gmail(object):
7 | """
8 | """
9 |
10 | def __init__(self, config, packpub_info, upload_info):
11 | self.__config = config
12 | self.__packpub_info = packpub_info
13 | self.__upload_info = upload_info
14 |
15 | def __prepare_message(self):
16 | """
17 | """
18 | #log_json(self.__packpub_info)
19 | #log_json(self.__upload_info)
20 |
21 | msg = MIMEMultipart('alternative')
22 | msg['Subject'] = "[packtpub-crawler]"
23 | msg['From'] = self.__config.get('gmail', 'gmail.from')
24 | msg['To'] = self.__config.get('gmail', 'gmail.to')
25 |
26 | text = "Enjoy your daily FREE eBook!"
27 | html = """\
28 |
29 |
30 |
31 | {title}
32 | {description}
33 | """.format(title=self.__packpub_info['title'].encode('utf-8'),
34 | description=self.__packpub_info['description'].encode('utf-8'))
35 |
36 | if self.__upload_info is not None:
37 | html += ""
38 | for detail in self.__upload_info['details']:
39 | html += """- {mime_type} - {name}
"""\
40 | .format(mime_type=detail['mime_type'], download_url=detail['download_url'], name=detail['name'])
41 | html += "
"
42 |
43 | html += """\
44 |
45 |
46 |
47 |
48 | """.format(image=self.__packpub_info['url_image'])
49 |
50 | part1 = MIMEText(text, 'plain')
51 | part2 = MIMEText(html, 'html')
52 |
53 | msg.attach(part1)
54 | msg.attach(part2)
55 |
56 | return msg
57 |
58 | def __prepare_error_message(self, exception, source):
59 | """
60 | """
61 | #log_json(self.__packpub_info)
62 | #log_json(self.__upload_info)
63 |
64 | msg = MIMEMultipart('alternative')
65 | msg['Subject'] = "[packtpub-crawler]"
66 | msg['From'] = self.__config.get('gmail', 'gmail.from')
67 | msg['To'] = self.__config.get('gmail', 'gmail.to')
68 |
69 | text = "Error downloading today's ebook [{source}]".format(source=source)
70 | html = """\
71 |
72 |
73 |
74 | {title}
75 |
76 | {description}
77 | """.format(title=text,
78 | description=repr(exception),
79 | url=self.__packpub_info['landingPageUrl'])
80 |
81 | html += """\
82 |
83 |
84 |
85 | """
86 |
87 | part1 = MIMEText(text, 'plain')
88 | part2 = MIMEText(html, 'html')
89 |
90 | msg.attach(part1)
91 | msg.attach(part2)
92 |
93 | return msg
94 |
95 | def send(self):
96 | server = smtplib.SMTP(self.__config.get('gmail', 'gmail.host'), self.__config.get('gmail', 'gmail.port'))
97 | server.starttls()
98 | server.login(self.__config.get('gmail', 'gmail.username'), self.__config.get('gmail', 'gmail.password'))
99 |
100 | message = self.__prepare_message()
101 | receivers = message['To'].split(",")
102 | server.sendmail(message['From'], receivers, message.as_string())
103 | server.quit()
104 |
105 | log_success('[+] notified to: {0}'.format(receivers))
106 |
107 | def sendError(self, exception, source):
108 | server = smtplib.SMTP(self.__config.get('gmail', 'gmail.host'), self.__config.get('gmail', 'gmail.port'))
109 | server.starttls()
110 | server.login(self.__config.get('gmail', 'gmail.username'), self.__config.get('gmail', 'gmail.password'))
111 |
112 | message = self.__prepare_error_message(exception, source)
113 | receivers = message['To'].split(",")
114 | server.sendmail(message['From'], receivers, message.as_string())
115 | server.quit()
116 |
117 | log_success('[+] error notification sent to: {0}'.format(receivers))
118 |
--------------------------------------------------------------------------------
/script/onedrive.py:
--------------------------------------------------------------------------------
1 | from os.path import exists
2 |
3 | import magic
4 | import onedrivesdk
5 | from onedrivesdk.helpers import GetAuthCodeServer
6 |
7 | from logs import *
8 | from utils import thread_loader
9 |
10 |
11 | class OneDrive(object):
12 | """
13 | """
14 |
15 | def __init__(self, config):
16 | self.__config = config
17 | self.__onedrive_service = None
18 | self.__scopes = ['offline_access', 'onedrive.readwrite']
19 | self.info = {}
20 |
21 | def __guess_info(self, file_path):
22 | if not exists(file_path):
23 | raise IOError('file not found!')
24 |
25 | self.info = {
26 | 'path': file_path,
27 | 'name': file_path.split('/')[-1],
28 | 'mime_type': magic.from_file(file_path, mime=True),
29 | }
30 | log_info('[+] new file upload on OneDrive:')
31 | log_info(self.info['name'])
32 |
33 | def __init_service(self):
34 | api_base_url = self.__config.get('onedrive', 'onedrive.api_base_url')
35 | client_id = self.__config.get('onedrive', 'onedrive.client_id')
36 | session_file = self.__config.get('onedrive', 'onedrive.session_file')
37 |
38 | if not exists(session_file):
39 | self.__save_credentials(session_file)
40 |
41 | http_provider = onedrivesdk.HttpProvider()
42 | auth_provider = onedrivesdk.AuthProvider(http_provider,
43 | client_id,
44 | self.__scopes)
45 |
46 | # Load the session
47 | auth_provider.load_session(path=session_file)
48 | auth_provider.refresh_token()
49 | self.__onedrive_service = onedrivesdk.OneDriveClient(api_base_url, auth_provider, http_provider)
50 |
51 | def __save_credentials(self, session_file):
52 | # api_base_url = self.__config.get('onedrive', 'onedrive.api_base_url')
53 | redirect_uri = 'http://localhost:8080/'
54 | client_id = self.__config.get('onedrive', 'onedrive.client_id')
55 | client_secret = self.__config.get('onedrive', 'onedrive.client_secret')
56 |
57 | client = onedrivesdk.get_default_client(client_id=client_id, scopes=self.__scopes)
58 |
59 | auth_url = client.auth_provider.get_auth_url(redirect_uri)
60 |
61 | # this will block until we have the code
62 | code = GetAuthCodeServer.get_auth_code(auth_url, redirect_uri)
63 |
64 | client.auth_provider.authenticate(code, redirect_uri, client_secret)
65 |
66 | # Save the session for later
67 | client.auth_provider.save_session(path=session_file)
68 | log_info('[+] new credentials saved')
69 |
70 | def __create_folder(self, item_id, folder_name): #Create folder with provided name
71 | f = onedrivesdk.Folder()
72 | i = onedrivesdk.Item()
73 | i.name = folder_name
74 | i.folder = f
75 |
76 | folder = self.__onedrive_service.item(drive='me', id=item_id).children.add(i)
77 |
78 | log_success('[+] creating new directory...')
79 |
80 | return folder.id #Return folder object ID
81 |
82 | def __get_folder(self): #Get folder name settings
83 | try: #Check folder name
84 | folder_name = self.__config.get('onedrive', 'onedrive.folder')
85 | except:
86 | folder_name = 'packtpub'
87 |
88 | item_id = 'root'
89 | directories = folder_name.split('/')
90 | for d in directories:
91 | if d == '.':
92 | continue
93 | try: # get folder if exists
94 | parent = self.__onedrive_service.item(drive='me', id=item_id)
95 | item = parent.children[d].get()
96 | item_id = item.id
97 | except:
98 | item_id = self.__create_folder(item_id, d)
99 |
100 | return item_id
101 |
102 | def __insert_file(self):
103 | print '[+] uploading file...'
104 | file = None
105 | tries = 5
106 | while tries > 0 and file is None:
107 | try:
108 | tries -= 1
109 | item = self.__onedrive_service.item(drive='me', id=self.__get_folder())
110 | file = item.children[self.info['name']].upload(self.info['path'])
111 | except:
112 | print '[x] upload failed'
113 | if tries > 0:
114 | print '[x] retrying ...'
115 |
116 | if file is None:
117 | raise IOError('upload failed after 5 attempts')
118 | self.info['id'] = file.id
119 | self.info['download_url'] = file.web_url
120 |
121 | def upload(self, file_path):
122 | self.__guess_info(file_path)
123 | self.__init_service()
124 | thread_loader(self.__insert_file)
125 |
--------------------------------------------------------------------------------
/script/googledrive.py:
--------------------------------------------------------------------------------
1 | from os.path import exists
2 | import webbrowser
3 | from oauth2client.client import flow_from_clientsecrets, OOB_CALLBACK_URN
4 | from oauth2client.file import Storage
5 | import httplib2
6 | import magic
7 | from googleapiclient.discovery import build
8 | from googleapiclient.http import MediaFileUpload
9 | from utils import thread_loader
10 | from logs import *
11 |
12 | class GoogleDrive(object):
13 | """
14 | """
15 |
16 | def __init__(self, config):
17 | self.__config = config
18 | self.__googledrive_service = None
19 | self.info = {}
20 |
21 | def __guess_info(self, file_path):
22 | if not exists(file_path):
23 | raise IOError('file not found!')
24 |
25 | self.info = {
26 | 'path': file_path,
27 | 'name': file_path.split('/')[-1],
28 | 'mime_type': magic.from_file(file_path, mime=True),
29 | }
30 | log_info('[+] new file upload on Google Drive:')
31 | # log_dict(self.file_info)
32 |
33 | def __init_service(self):
34 | auth_token = self.__config.get('googledrive', 'googledrive.auth_token')
35 |
36 | if not exists(auth_token):
37 | self.__save_credentials(auth_token)
38 |
39 | storage = Storage(auth_token)
40 | credentials = storage.get()
41 |
42 | http = httplib2.Http()
43 | http = credentials.authorize(http)
44 | self.__googledrive_service = build('drive', 'v2', http=http)
45 |
46 | def __save_credentials(self, auth_token):
47 | flow = flow_from_clientsecrets(
48 | self.__config.get('googledrive', 'googledrive.client_secrets'),
49 | self.__config.get('googledrive', 'googledrive.oauth2_scope'),
50 | OOB_CALLBACK_URN)
51 |
52 | authorize_url = flow.step1_get_authorize_url()
53 |
54 | print '[-] open browser...'
55 | webbrowser.open(authorize_url)
56 |
57 | code = raw_input('[*] Please, enter verification code: ').strip()
58 | credentials = flow.step2_exchange(code)
59 |
60 | storage = Storage(auth_token)
61 | storage.put(credentials)
62 | log_info('[+] new credentials saved')
63 |
64 | def __create_folder(self): #Create folder with provided name
65 | try: #Check default folder name
66 | default_folder_name = self.__config.get('googledrive', 'googledrive.default_folder')
67 | except:
68 | default_folder_name = 'packtpub'
69 |
70 | metadata = {
71 | 'title': default_folder_name,
72 | 'mimeType' : 'application/vnd.google-apps.folder'
73 | }
74 | folder = self.__googledrive_service.files().insert(body = metadata).execute()
75 | self.__config.set('googledrive', 'googledrive.upload_folder', folder['id'])
76 | log_success('[+] creating new directory...')
77 | print '[+] updating folder permissions...'
78 | permissions = {
79 | 'role': 'reader',
80 | 'type': 'anyone',
81 | 'value': self.__config.get('googledrive', 'googledrive.gmail')
82 | }
83 | self.__googledrive_service.permissions().insert(fileId=folder['id'], body=permissions).execute()
84 | log_dict({'folder_name': default_folder_name,
85 | 'id': folder['id'],})
86 | #'share_link': folder['webContentLink']}) #TODO Fix
87 | log_success('[+] Please add this line after [googledrive] in your configuration file:')
88 | log_info('googledrive.upload_folder=' + folder.get('id'))
89 |
90 | return folder.get('id') #Return folder object ID
91 |
92 | def __get_folder(self): #Get folder name settings
93 | try:
94 | return self.__config.get('googledrive', 'googledrive.upload_folder')
95 | except:
96 | return self.__create_folder() #new folder ID
97 |
98 | def __insert_file(self):
99 | print '[+] uploading file...'
100 | media_body = MediaFileUpload(
101 | self.info['path'], mimetype=self.info['mime_type'], resumable=True)
102 | body = {
103 | 'title': self.info['name'],
104 | 'description': 'uploaded with packtpub-crawler',
105 | 'mimeType': self.info['mime_type'],
106 | 'parents': [{'id': self.__get_folder()}]
107 | }
108 | file = self.__googledrive_service.files().insert(body=body, media_body=media_body).execute()
109 | # log_dict(file)
110 |
111 | print '[+] updating file permissions...'
112 | permissions = {
113 | 'role': 'reader',
114 | 'type': 'anyone',
115 | 'value': self.__config.get('googledrive', 'googledrive.gmail')
116 | }
117 | self.__googledrive_service.permissions().insert(fileId=file['id'], body=permissions).execute()
118 |
119 | # self.__googledrive_service.files().get(fileId=file['id']).execute()
120 |
121 | self.info['id'] = file['id']
122 | self.info['download_url'] = file['webContentLink']
123 |
124 | def upload(self, file_path):
125 | self.__guess_info(file_path)
126 | self.__init_service()
127 | thread_loader(self.__insert_file)
128 |
--------------------------------------------------------------------------------
/script/spider.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import datetime
5 | import requests
6 | import os
7 | from utils import ip_address, config_file
8 | from packtpub import Packtpub
9 | from upload import Upload, SERVICE_GOOGLE_DRIVE, SERVICE_ONEDRIVE, SERVICE_DROPBOX, SERVICE_SCP
10 | from database import Database, DB_FIREBASE
11 | from logs import *
12 | from notify import Notify, SERVICE_GMAIL, SERVICE_IFTTT, SERVICE_JOIN, SERVICE_PUSHOVER
13 | from noBookException import NoBookException
14 | from alreadyClaimedException import AlreadyClaimedException
15 |
16 | def parse_types(args):
17 | if args.types is None:
18 | return [args.type]
19 | else:
20 | return args.types
21 |
22 | def handleClaim(packtpub, args, config, dir_path):
23 | if args.dev:
24 | log_json(packtpub.info)
25 |
26 | log_success('[+] book successfully claimed')
27 |
28 | upload = None
29 | upload_info = None
30 |
31 | if not args.claimOnly:
32 | types = parse_types(args)
33 |
34 | packtpub.download_ebooks(types, dir_path)
35 |
36 | if args.extras:
37 | packtpub.download_extras(dir_path)
38 |
39 | if args.archive:
40 | raise NotImplementedError('not implemented yet!')
41 |
42 | if args.upload is not None:
43 | upload = Upload(config, args.upload)
44 | upload.run(packtpub.info['paths'])
45 |
46 | if args.store is not None:
47 | if args.upload == SERVICE_GOOGLE_DRIVE or args.upload == SERVICE_ONEDRIVE:
48 | Database(config, args.store, packtpub.info, upload.info).store()
49 | else:
50 | log_warn('[-] skip store info: missing upload info')
51 |
52 |
53 | if args.notify:
54 | if upload is not None:
55 | upload_info = upload.info
56 |
57 | Notify(config, packtpub.info, upload_info, args.notify).run()
58 |
59 | def main():
60 | parser = argparse.ArgumentParser(
61 | description='Download FREE eBook every day from www.packtpub.com',
62 | formatter_class=argparse.ArgumentDefaultsHelpFormatter,
63 | version='2.4.0')
64 |
65 | parser.add_argument('-c', '--config', required=True, help='configuration file')
66 | parser.add_argument('-d', '--dev', action='store_true', help='only for development')
67 | parser.add_argument('-e', '--extras', action='store_true', help='download source code (if exists) and book cover')
68 | parser.add_argument('-u', '--upload', choices=[SERVICE_GOOGLE_DRIVE, SERVICE_ONEDRIVE, SERVICE_DROPBOX, SERVICE_SCP], help='upload to cloud')
69 | parser.add_argument('-a', '--archive', action='store_true', help='compress all file')
70 | parser.add_argument('-n', '--notify', choices=[SERVICE_GMAIL, SERVICE_IFTTT, SERVICE_JOIN, SERVICE_PUSHOVER], help='notify after claim/download')
71 | parser.add_argument('-s', '--store', choices=[DB_FIREBASE], help='store info')
72 | parser.add_argument('-o', '--claimOnly', action='store_true', help='only claim books (no downloads/uploads)')
73 |
74 | group = parser.add_mutually_exclusive_group()
75 | group.add_argument('-t', '--type', choices=['pdf', 'epub', 'mobi'],
76 | default='pdf', help='specify eBook type')
77 | group.add_argument('--all', dest='types', action='store_const',
78 | const=['pdf', 'epub', 'mobi'], help='all eBook types')
79 |
80 | args = parser.parse_args()
81 |
82 | now = datetime.datetime.now()
83 | log_info('[*] {date} - fetching today\'s eBooks'.format(date=now.strftime("%Y-%m-%d %H:%M")))
84 |
85 | packtpub = None
86 |
87 | try:
88 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + os.path.sep
89 |
90 | config = config_file(dir_path + args.config)
91 | packtpub = Packtpub(config, args.dev)
92 |
93 | #ip_address()
94 | log_info('[*] getting daily free eBook')
95 |
96 | try:
97 | packtpub.runDaily()
98 | handleClaim(packtpub, args, config, dir_path)
99 | except NoBookException as e:
100 | log_info('[*] ' + e.message)
101 | except Exception as e:
102 | log_debug(e)
103 | if args.notify:
104 | Notify(config, packtpub.info, None, args.notify).sendError(e, 'daily')
105 |
106 | lastNewsletterUrlPath = dir_path + 'config/lastNewsletterUrl'
107 | lastNewsletterUrl = None
108 |
109 | if os.path.isfile(lastNewsletterUrlPath):
110 | with open(lastNewsletterUrlPath, 'r+') as f:
111 | lastNewsletterUrl = f.read().strip()
112 |
113 | # the default URL is generated by a Google Apps Script, see the README for details and self-hosting
114 | currentNewsletterUrl = requests.get(config.get('url', 'url.bookFromNewsletter')).text.strip()
115 |
116 | if currentNewsletterUrl == '':
117 | log_info('[*] no free eBook from newsletter right now')
118 | elif not currentNewsletterUrl.startswith('https://www.packtpub.com'):
119 | log_warn('[-] invalid URL from newsletter: ' + currentNewsletterUrl)
120 | elif lastNewsletterUrl != currentNewsletterUrl:
121 | log_info('[*] getting free eBook from newsletter')
122 | try:
123 | packtpub.resetInfo()
124 | packtpub.runNewsletter(currentNewsletterUrl)
125 | handleClaim(packtpub, args, config, dir_path)
126 |
127 | with open(lastNewsletterUrlPath, 'w+') as f:
128 | f.write(currentNewsletterUrl)
129 |
130 | except AlreadyClaimedException as a:
131 | log_info('[*] book was already claimed, skipping')
132 | with open(lastNewsletterUrlPath, 'w+') as f:
133 | f.write(currentNewsletterUrl)
134 | except Exception as e:
135 | log_debug(e)
136 | if args.notify:
137 | Notify(config, packtpub.info, None, args.notify).sendError(e, 'newsletter')
138 | else:
139 | log_info('[*] already got latest ebook from newsletter, skipping')
140 |
141 | except KeyboardInterrupt:
142 | log_error('[-] interrupted manually')
143 |
144 | except Exception as e:
145 | log_debug(e)
146 | if args.notify:
147 | Notify(config, None, None, args.notify).sendError(e, 'global')
148 |
149 | log_info('[*] done')
150 |
151 | if __name__ == '__main__':
152 | print ("""
153 | __ __ __ __
154 | ____ ____ ______/ /__/ /_____ __ __/ /_ ______________ __ __/ /__ _____
155 | / __ \/ __ `/ ___/ //_/ __/ __ \/ / / / __ \______/ ___/ ___/ __ `/ | /| / / / _ \/ ___/
156 | / /_/ / /_/ / /__/ ,< / /_/ /_/ / /_/ / /_/ /_____/ /__/ / / /_/ /| |/ |/ / / __/ /
157 | / .___/\__,_/\___/_/|_|\__/ .___/\__,_/_.___/ \___/_/ \__,_/ |__/|__/_/\___/_/
158 | /_/ /_/
159 |
160 | Download FREE eBook every day from www.packtpub.com
161 | @see github.com/niqdev/packtpub-crawler
162 | """)
163 | main()
164 |
--------------------------------------------------------------------------------
/dev/public/myEbooks.html:
--------------------------------------------------------------------------------
(markup stripped in this dump: dev mock of the "My eBooks" account page used by spider.py --dev; recoverable content below)

Title                                         | Author                       | Price | Order Reference        | Order Date
Instant Handlebars.js [eBook]                 | Gabriel Manricks             | €0.00 | PAC-15-3292046-1289610 | 17/06/15
Scaling Big Data with Hadoop and Solr [eBook] | Hrishikesh Vijay Karambelkar | €0.00 | PAC-15-3282005-1289610 | 16/06/15
--------------------------------------------------------------------------------
/script/packtpub.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 | from os.path import split, join
4 | from utils import make_soup, wait, download_file
5 | from logs import *
6 | from noBookException import NoBookException
7 | from alreadyClaimedException import AlreadyClaimedException
8 |
9 | class Packtpub(object):
10 | """
11 | """
12 |
13 | def __init__(self, config, dev):
14 | self.__config = config
15 | self.__dev = dev
16 | self.__delay = float(self.__config.get('delay', 'delay.requests'))
17 | self.__url_base = self.__config.get('url', 'url.base')
18 | self.__headers = self.__init_headers()
19 | self.__session = requests.Session()
20 | self.resetInfo()
21 |
22 | def resetInfo(self):
23 | self.info = {
24 | 'paths': []
25 | }
26 |
27 | def __init_headers(self):
28 | # improvement: random user agent
29 | return {
30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
31 | 'Accept-Encoding': 'gzip, deflate',
32 | 'Connection': 'keep-alive',
33 | 'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0.1; SM-G920V Build/MMB29K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Mobile Safari/537.36'
34 | }
35 |
36 | def __log_response(self, response, method='GET', detail=False):
37 | if detail:
38 | print '[-] {0} {1} | {2}'.format(method, response.url, response.status_code)
39 | print '[-] cookies:'
40 | log_dict(requests.utils.dict_from_cookiejar(self.__session.cookies))
41 | print '[-] headers:'
42 | log_dict(response.headers)
43 |
44 | def __GET_login(self, url):
45 | response = self.__session.get(url, headers=self.__headers)
46 | self.__log_response(response, 'GET', self.__dev)
47 |
48 | soup = make_soup(response)
49 |
50 | form = soup.find('form', {'id': 'packt-user-login-form'})
51 |
52 | if form is None:
53 | raise Exception('Could not find login form')
54 |
55 | self.info['form_build_id'] = form.find('input', attrs={'name': 'form_build_id'})['value']
56 | self.info['form_id'] = form.find('input', attrs={'name': 'form_id'})['value']
57 |
58 | return soup
59 |
60 | def __POST_login(self, url):
61 | data = self.info.copy()
62 | data['email'] = self.__config.get('credential', 'credential.email')
63 | data['password'] = self.__config.get('credential', 'credential.password')
64 | data['op'] = 'Login'
65 | # print '[-] data: {0}'.format(urllib.urlencode(data))
66 |
67 | response = None
68 | if self.__dev:
69 | response = self.__session.get(url, headers=self.__headers, data=data)
70 | self.__log_response(response, 'GET', self.__dev)
71 | else:
72 | response = self.__session.post(url, headers=self.__headers, data=data)
73 | self.__log_response(response, 'POST', self.__dev)
74 |
75 | soup = make_soup(response)
76 |
77 | error_node = soup.find('div', {'class': 'messages error'})
78 |
79 | if error_node is not None:
80 | raise Exception(error_node.text.strip())
81 |
82 | def __parseDailyBookInfo(self, soup):
83 | div_target = soup.find('div', {'id': 'deal-of-the-day'})
84 |
85 | if div_target is None:
86 | raise NoBookException('no free eBook today')
87 |
88 | title = div_target.select('div.dotd-title > h2')[0].text.strip()
89 | self.info['title'] = title
90 | self.info['filename'] = title.encode('ascii', 'ignore').replace(' ', '_')
91 | self.info['description'] = div_target.select('div.dotd-main-book-summary > div')[2].text.strip()
92 | self.info['url_image'] = 'https:' + div_target.select('div.dotd-main-book-image img')[0]['data-original']
93 | self.info['url_claim'] = self.__url_base + div_target.select('a.twelve-days-claim')[0]['href']
94 | # remove useless info
95 | self.info.pop('form_build_id', None)
96 | self.info.pop('form_id', None)
97 |
98 | def __parseNewsletterBookInfo(self, soup):
99 | div_target = soup.find('div', {'id': 'main-book'})
100 |
101 | urlWithTitle = div_target.select('div.promo-landing-book-picture a')[0]['href']
102 | title = urlWithTitle.split('/')[-1].replace('-', ' ').title()
103 | claimNode = div_target.select('div.promo-landing-book-info a')
104 |
105 | self.info['title'] = title
106 | self.info['filename'] = title.replace(' ', '_').encode('ascii', 'ignore')
107 | self.info['description'] = div_target.select('div.promo-landing-book-body > div')[0].text.strip()
108 | self.info['url_image'] = 'https:' + div_target.select('div.promo-landing-book-picture img')[0]['src']
109 | self.info['url_claim'] = self.__url_base + claimNode[0]['href']
110 | # remove useless info
111 | self.info.pop('form_build_id', None)
112 | self.info.pop('form_id', None)
113 |
114 | def __GET_claim(self):
115 | if self.__dev:
116 | url = self.__url_base + self.__config.get('url', 'url.account')
117 | else:
118 | url = self.info['url_claim']
119 |
120 | response = self.__session.get(url, headers=self.__headers)
121 | self.__log_response(response, 'GET', self.__dev)
122 |
123 | soup = make_soup(response)
124 | div_target = soup.find('div', {'id': 'product-account-list'})
125 |
126 | if div_target is None:
127 | raise Exception('Could not access claim page. This is most likely caused by invalid credentials')
128 |
129 | errorMessage = soup.find(id='messages-container')
130 |
131 | if errorMessage is not None and errorMessage.text.strip() == 'You have already claimed this promotion.':
132 | raise AlreadyClaimedException()
133 |
134 | # take only the book just claimed (first in the list)
135 | div_claimed_book = div_target.select('.product-line')[0]
136 | self.info['book_id'] = div_claimed_book['nid']
137 | self.info['author'] = div_claimed_book.find(class_='author').text.strip()
138 |
139 | source_code = div_claimed_book.find(href=re.compile('/code_download/*'))
140 | if source_code is not None:
141 | self.info['url_source_code'] = self.__url_base + source_code['href']
142 |
143 | def runDaily(self):
144 | """
145 | """
146 | if self.__dev:
147 | loginUrl = self.__url_base + self.__config.get('url', 'url.loginGet')
148 | else:
149 | loginUrl = self.__url_base + self.__config.get('url', 'url.login')
150 |
151 | self.info['landingPageUrl'] = loginUrl
152 |
153 | soup = self.__GET_login(loginUrl)
154 | wait(self.__delay, self.__dev)
155 |
156 | if self.__dev:
157 | loginUrl = self.__url_base + self.__config.get('url', 'url.loginPost')
158 |
159 | self.__POST_login(loginUrl)
160 | wait(self.__delay, self.__dev)
161 | self.__parseDailyBookInfo(soup)
162 | wait(self.__delay, self.__dev)
163 | self.__GET_claim()
164 | wait(self.__delay, self.__dev)
165 |
166 | def runNewsletter(self, currentNewsletterUrl):
167 | """
168 | """
169 |
170 | soup = self.__GET_login(currentNewsletterUrl)
171 | self.info['landingPageUrl'] = currentNewsletterUrl
172 |
173 | self.__parseNewsletterBookInfo(soup)
174 | wait(self.__delay, self.__dev)
175 | self.__GET_claim()
176 | wait(self.__delay, self.__dev)
177 |
178 | def download_ebooks(self, types, base_path):
179 | """
180 | """
181 | downloads_info = [dict(type=type,
182 | url=self.__url_base + self.__config.get('url', 'url.download').format(self.info['book_id'], type),
183 | filename=self.info['filename'] + '.' + type)
184 | for type in types]
185 |
186 | # https://github.com/niqdev/packtpub-crawler/pull/27
187 | if self.__config.has_option('path', 'path.group'):
188 |
189 | folder_name = self.info['title'].encode('ascii', 'ignore').replace(' ', '_') + \
190 | self.info['author'].encode('ascii', 'ignore').replace(' ', '_')
191 |
192 | directory = base_path + join(self.__config.get('path', 'path.ebooks'), folder_name)
193 | else:
194 | directory = base_path + self.__config.get('path', 'path.ebooks')
195 |
196 | for download in downloads_info:
197 | self.info['paths'].append(
198 | download_file(self.__session, download['url'], directory, download['filename'], self.__headers))
199 |
200 | def download_extras(self, base_path):
201 | """
202 | """
203 |
204 | # https://github.com/niqdev/packtpub-crawler/pull/27
205 | if self.__config.has_option('path', 'path.group'):
206 |
207 | folder_name = self.info['title'].encode('ascii', 'ignore').replace(' ', '_') + \
208 | self.info['author'].encode('ascii', 'ignore').replace(' ', '_')
209 |
210 | directory = base_path + join(self.__config.get('path', 'path.ebooks'), folder_name, self.__config.get('path', 'path.extras'))
211 | else:
212 | directory = base_path + self.__config.get('path', 'path.extras')
213 |
214 | url_image = self.info['url_image']
215 | filename = self.info['filename'] + '_' + split(url_image)[1]
216 | self.info['paths'].append(download_file(self.__session, url_image, directory, filename, self.__headers))
217 |
218 | if 'url_source_code' in self.info:
219 | self.info['paths'].append(download_file(self.__session, self.info['url_source_code'], directory,
220 | self.info['filename'] + '.zip', self.__headers))
221 |
--------------------------------------------------------------------------------
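
Putting the pieces together, spider.py drives this class roughly as follows (a sketch; dev mode points at the local mock server from `dev/`, see dev/server.js):

```python
from utils import config_file
from packtpub import Packtpub

config = config_file('config/dev.cfg')
packtpub = Packtpub(config, True)  # dev=True targets http://localhost:8080
packtpub.runDaily()
print packtpub.info['title']
```
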
/README.md:
--------------------------------------------------------------------------------
1 | # packtpub-crawler
2 |
3 | ### Download FREE eBook every day from [www.packtpub.com](https://www.packtpub.com/packt/offers/free-learning)
4 |
5 | This crawler automates the following steps:
6 |
7 | * access your private account
8 | * claim the daily free eBook and the weekly one from the newsletter
9 | * parse title, description and other useful information
10 | * download your favorite formats *.pdf .epub .mobi*
11 | * download source code and book cover
12 | * upload files to Google Drive, OneDrive or via scp
13 | * store data on Firebase
14 | * notify via Gmail, IFTTT, Join or Pushover (on success and errors)
15 | * schedule daily job on Heroku or with Docker
16 |
17 | ### Default command
18 | ```bash
19 | # upload pdf to googledrive, store data and notify via email
20 | python script/spider.py -c config/prod.cfg -u googledrive -s firebase -n gmail
21 | ```
22 |
23 | ### Other options
24 | ```bash
25 | # download all formats
26 | python script/spider.py --config config/prod.cfg --all
27 |
28 | # download only one format: pdf|epub|mobi
29 | python script/spider.py --config config/prod.cfg --type pdf
30 |
31 | # also download additional material: source code (if it exists) and the book cover
32 | python script/spider.py --config config/prod.cfg -t pdf --extras
33 | # equivalent (default is pdf)
34 | python script/spider.py -c config/prod.cfg -e
35 |
36 | # download and then upload to Google Drive (given the download url anyone can download it)
37 | python script/spider.py -c config/prod.cfg -t epub --upload googledrive
38 | python script/spider.py --config config/prod.cfg --all --extras --upload googledrive
39 |
40 | # download and then upload to OneDrive (given the download url anyone can download it)
41 | python script/spider.py -c config/prod.cfg -t epub --upload onedrive
42 | python script/spider.py --config config/prod.cfg --all --extras --upload onedrive
43 |
44 | # download and notify: gmail|ifttt|join|pushover
45 | python script/spider.py -c config/prod.cfg --notify gmail
46 |
47 | # only claim book (no downloads):
48 | python script/spider.py -c config/prod.cfg --notify gmail --claimOnly
49 | ```
50 |
51 | ### Basic setup
52 |
53 | Before you start you should
54 |
55 | * Verify that your currently installed version of Python is **2.x** with `python --version`
56 | * Clone the repository `git clone https://github.com/niqdev/packtpub-crawler.git`
57 | * Install all the dependencies `pip install -r requirements.txt` (see also [virtualenv](https://github.com/niqdev/packtpub-crawler#virtualenv))
58 | * Create a [config](https://github.com/niqdev/packtpub-crawler/blob/master/config/prod_example.cfg) file `cp config/prod_example.cfg config/prod.cfg`
59 | * Change your Packtpub credentials in the config file
60 | ```
61 | [credential]
62 | credential.email=PACKTPUB_EMAIL
63 | credential.password=PACKTPUB_PASSWORD
64 | ```
65 |
66 | Now you should be able to claim and download your first eBook
67 | ```
68 | python script/spider.py --config config/prod.cfg
69 | ```
70 |
71 | ### Google Drive
72 |
73 | As described in the documentation, the Google Drive API requires OAuth 2.0 for authentication, so to upload files you should:
74 |
75 | * Go to [Google APIs Console](https://code.google.com/apis/console) and create a new [Google Drive](https://console.developers.google.com/apis/api/drive/overview) project named **PacktpubDrive**
76 | * On *API manager > Overview* menu
77 | * Enable Google Drive API
78 | * On *API manager > Credentials* menu
79 | * In *OAuth consent screen* tab set **PacktpubDrive** as the product name shown to users
80 | * In *Credentials* tab create credentials of type *OAuth client ID* and choose Application type *Other* named **PacktpubDriveCredentials**
81 | * Click *Download JSON* and save the file `config/client_secrets.json`
82 | * Change your Google Drive credentials in the config file
83 |
84 | ```
85 | [googledrive]
86 | ...
87 | googledrive.client_secrets=config/client_secrets.json
88 | googledrive.gmail=GOOGLE_DRIVE@gmail.com
89 | ```
90 |
91 | Now you should be able to upload your eBook to Google Drive
92 | ```
93 | python script/spider.py --config config/prod.cfg --upload googledrive
94 | ```
95 |
96 | On the first run only, you will be prompted to log in from a browser with JavaScript enabled (no text-based browsers) to generate `config/auth_token.json`.
97 | You should also copy the *FOLDER_ID* into the config, otherwise a new folder with the same name will be created on every run.
98 | ```
99 | [googledrive]
100 | ...
101 | googledrive.default_folder=packtpub
102 | googledrive.upload_folder=FOLDER_ID
103 | ```
104 |
105 | Documentation: [OAuth](https://developers.google.com/api-client-library/python/guide/aaa_oauth), [Quickstart](https://developers.google.com/drive/v3/web/quickstart/python), [example](https://github.com/googledrive/python-quickstart) and [permissions](https://developers.google.com/drive/v2/reference/permissions)
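
As a rough illustration, the first-run OAuth flow with `oauth2client` boils down to something like this (a sketch assuming the file paths from the config above and the Drive scope; not the project's exact code):

```python
# Hypothetical sketch of the first-run OAuth 2.0 flow with oauth2client;
# file paths taken from the config above, scope assumed.
from oauth2client import tools
from oauth2client.client import flow_from_clientsecrets
from oauth2client.file import Storage

flow = flow_from_clientsecrets('config/client_secrets.json',
                               scope='https://www.googleapis.com/auth/drive')
storage = Storage('config/auth_token.json')
credentials = storage.get()
if credentials is None or credentials.invalid:
    # opens a browser on the first run, then caches the token on disk
    credentials = tools.run_flow(flow, storage)
```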
106 |
107 | ### OneDrive
108 |
109 | According to the documentation, the OneDrive API requires OAuth 2.0 for authentication, so to upload files you should:
110 | 
112 | * Go to the [Microsoft Application Registration Portal](https://apps.dev.microsoft.com/?referrer=https%3A%2F%2Fdev.onedrive.com%2Fapp-registration.htm).
113 | * When prompted, sign in with your Microsoft account credentials.
114 | * Find **My applications** and click **Add an app**.
115 | * Enter **PacktpubDrive** as the app's name and click **Create application**.
116 | * Scroll to the bottom of the page and check the **Live SDK support** box.
117 | * Change your OneDrive credentials in the config file
118 | * Copy your **Application Id** into the config file as **onedrive.client_id**
119 | * Click **Generate New Password** and copy the password shown into the config file as **onedrive.client_secret**
120 | * Click **Add Platform** and select **Web**
121 | * Enter **http://localhost:8080/** as the **Redirect URL**
122 | * Click **Save** at the bottom of the page
123 |
124 | ```
125 | [onedrive]
126 | ...
127 | onedrive.client_id=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
128 | onedrive.client_secret=XxXxXxXxXxXxXxXxXxXxXxX
129 | ```
130 |
131 | Now you should be able to upload your eBook to OneDrive
132 | ```
133 | python script/spider.py --config config/prod.cfg --upload onedrive
134 | ```
135 |
136 | On the first run only, you will be prompted to log in from a browser with JavaScript enabled (no text-based browsers) to generate `config/session.onedrive.pickle`. You can also change the default upload folder in the config.
137 | ```
138 | [onedrive]
139 | ...
140 | onedrive.folder=packtpub
141 | ```
142 |
143 | Documentation: [Registration](https://dev.onedrive.com/app-registration.htm), [Python API](https://github.com/OneDrive/onedrive-sdk-python)
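
For reference, the first-run authentication with the OneDrive Python SDK might look roughly like this (client id, secret and redirect URL as configured above; treat the exact calls as an assumption, not the project's code):

```python
# Hypothetical first-run auth sketch with onedrivesdk; the session is
# pickled for later runs. Not the project's actual code.
import onedrivesdk

redirect_uri = 'http://localhost:8080/'
client = onedrivesdk.get_default_client(
    client_id='xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx',
    scopes=['wl.signin', 'wl.offline_access', 'onedrive.readwrite'])

auth_url = client.auth_provider.get_auth_url(redirect_uri)
print(auth_url)  # open this URL in a browser and copy the ?code=... value
code = raw_input('Paste the code here: ')
client.auth_provider.authenticate(code, redirect_uri, 'CLIENT_SECRET')
client.auth_provider.save_session()
```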
144 |
145 | ### Scp
146 |
147 | To upload your eBook via `scp` to a remote server, update the configs
148 |
149 | ```
150 | [scp]
151 | scp.host=SCP_HOST
152 | scp.user=SCP_USER
153 | scp.password=SCP_PASSWORD
154 | scp.path=SCP_UPLOAD_PATH
155 | ```
156 |
157 | Now you should be able to upload your eBook
158 | ```
159 | python script/spider.py --config config/prod.cfg --upload scp
160 | ```
161 |
162 | Note:
163 | * the destination folder `scp.path` on the remote server must exist in advance
164 | * the option `--upload scp` is incompatible with `--store` and `--notify`
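
Under the hood the upload relies on `paramiko` (see Troubleshooting); a rough sketch of what the upload step boils down to, using the `[scp]` values above (hypothetical local file path; not the project's exact code):

```python
# Rough sketch of an SFTP upload with paramiko, using the [scp] values
# above; the local file path is hypothetical, not the project's actual code.
import os
import paramiko

ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect('SCP_HOST', username='SCP_USER', password='SCP_PASSWORD')

sftp = ssh.open_sftp()
local_file = 'ebooks/example.pdf'
sftp.put(local_file, os.path.join('SCP_UPLOAD_PATH', os.path.basename(local_file)))

sftp.close()
ssh.close()
```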
165 |
166 | ### Firebase
167 |
168 | Create a new Firebase [project](https://console.firebase.google.com/), copy the database secret from your settings
169 | ```
170 | https://console.firebase.google.com/project/PROJECT_NAME/settings/database
171 | ```
172 | and update the configs
173 | ```
174 | [firebase]
175 | firebase.database_secret=DATABASE_SECRET
176 | firebase.url=https://PROJECT_NAME.firebaseio.com
177 | ```
178 |
179 | Now you should be able to store your eBook details on Firebase
180 | ```
181 | python script/spider.py --config config/prod.cfg --upload googledrive --store firebase
182 | ```
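
As an illustration, storing a record with the `python-firebase` library could look like this (a sketch assuming the `[firebase]` values above and a hypothetical `/books` node; not the project's exact code):

```python
# Hypothetical sketch with the python-firebase library; database secret
# and URL from the [firebase] section above, the /books node is assumed.
from firebase import firebase

auth = firebase.FirebaseAuthentication('DATABASE_SECRET', 'PACKTPUB_EMAIL')
app = firebase.FirebaseApplication('https://PROJECT_NAME.firebaseio.com', auth)

result = app.post('/books', {'title': 'Example Book', 'format': 'pdf'})
print(result)  # e.g. {u'name': u'<generated-key>'}
```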
183 |
184 | ### Gmail notification
185 |
186 | To *send* a notification via email using Gmail, you should:
187 |
188 | * Allow ["less secure apps"](https://www.google.com/settings/security/lesssecureapps) and ["DisplayUnlockCaptcha"](https://accounts.google.com/DisplayUnlockCaptcha) on your account
189 | * See the sign-in [troubleshooting guide](https://support.google.com/mail/answer/78754) and these [examples](http://stackoverflow.com/questions/10147455/how-to-send-an-email-with-gmail-as-provider-using-python)
190 | * Change your Gmail credentials in the config file
191 |
192 | ```
193 | [gmail]
194 | ...
195 | gmail.username=EMAIL_USERNAME@gmail.com
196 | gmail.password=EMAIL_PASSWORD
197 | gmail.from=FROM_EMAIL@gmail.com
198 | gmail.to=TO_EMAIL_1@gmail.com,TO_EMAIL_2@gmail.com
199 | ```
200 |
201 | Now you should be able to notify the configured recipients
202 | ```
203 | python script/spider.py --config config/prod.cfg --notify gmail
204 | ```
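
The notification itself boils down to a plain SMTP call; a minimal sketch with the stdlib `smtplib` (credentials from the `[gmail]` section above, subject and body assumed; not the project's exact code):

```python
# Minimal Gmail SMTP sketch with the Python stdlib; credentials from the
# [gmail] section above, subject/body assumed. Illustrative only.
import smtplib
from email.mime.text import MIMEText

msg = MIMEText('Claimed eBook: Example Book')
msg['Subject'] = '[packtpub-crawler] new eBook'
msg['From'] = 'FROM_EMAIL@gmail.com'
msg['To'] = 'TO_EMAIL_1@gmail.com'

server = smtplib.SMTP('smtp.gmail.com', 587)
server.starttls()
server.login('EMAIL_USERNAME@gmail.com', 'EMAIL_PASSWORD')
server.sendmail(msg['From'], [msg['To']], msg.as_string())
server.quit()
```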
205 |
206 | ### IFTTT notification
207 |
208 | * Get an account on [IFTTT](https://ifttt.com)
209 | * Go to your Maker [settings](https://ifttt.com/services/maker/settings) and activate the channel
210 | * [Create](https://ifttt.com/create) a new applet using the Maker service with the trigger "Receive a web request" and the event name "packtpub-crawler"
211 | * Change your IFTTT [key](https://internal-api.ifttt.com/maker) in the config file
212 |
213 | ```
214 | [ifttt]
215 | ifttt.event_name=packtpub-crawler
216 | ifttt.key=IFTTT_MAKER_KEY
217 | ```
218 |
219 | Now you should be able to trigger the applet
220 | ```
221 | python script/spider.py --config config/prod.cfg --notify ifttt
222 | ```
223 |
224 | Value mappings:
225 | * value1: title
226 | * value2: description
227 | * value3: landing page URL
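
The trigger itself is a single web request to the Maker endpoint; a sketch of the call with `requests` (event name and key from the config above, values mapped as listed; not the project's exact code):

```python
# Sketch of the IFTTT Maker web request behind --notify ifttt;
# illustrative only, not the project's actual code.
import requests

event = 'packtpub-crawler'
key = 'IFTTT_MAKER_KEY'
payload = {
    'value1': 'Example Book',               # title
    'value2': 'A short description',        # description
    'value3': 'https://www.packtpub.com/',  # landing page URL
}
requests.post('https://maker.ifttt.com/trigger/%s/with/key/%s' % (event, key),
              json=payload)
```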
228 |
229 | ### Join notification
230 |
231 | * Get the Join [Chrome extension](https://chrome.google.com/webstore/detail/join-by-joaoapps/flejfacjooompmliegamfbpjjdlhokhj) and/or [App](https://play.google.com/store/apps/details?id=com.joaomgcd.join)
232 | * You can find your device ids [here](https://joinjoaomgcd.appspot.com/)
233 | * (Optional) You can use multiple devices or groups (group.all, group.android, group.chrome, group.windows10, group.phone, group.tablet, group.pc) separated by commas
234 | * Change your Join credentials in the config file
235 |
236 | ```
237 | [join]
238 | join.device_ids=DEVICE_IDS_COMMA_SEPARATED_OR_GROUP_NAME
239 | join.api_key=API_KEY
240 | ```
241 |
242 | Now you should be able to trigger the event
243 | ```
244 | python script/spider.py --config config/prod.cfg --notify join
245 | ```
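
For reference, a Join push is a single HTTP call; a rough sketch against Join's public push endpoint (the endpoint pattern and parameter names are assumptions based on Join's API, not taken from the project):

```python
# Hypothetical sketch of a Join push notification; the endpoint and
# parameter names are assumptions, not the project's actual code.
import requests

requests.get(
    'https://joinjoaomgcd.appspot.com/_ah/api/messaging/v1/sendPush',
    params={
        'apikey': 'API_KEY',
        'deviceIds': 'DEVICE_IDS_COMMA_SEPARATED_OR_GROUP_NAME',
        'title': 'packtpub-crawler',
        'text': 'Claimed eBook: Example Book',
    })
```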
246 |
247 | ### Pushover notification
248 |
249 | * Get your [USER_KEY](https://pushover.net/)
250 | * Create a [new application](https://pushover.net/apps/build)
251 | * (Optional) Add an [icon](https://pushover.net/icons/9aqpv697p9g6wzo.png)
252 | * Change your Pushover credentials in the config file
253 |
254 | ```
255 | [pushover]
256 | pushover.user_key=PUSHOVER_USER_KEY
257 | pushover.api_key=PUSHOVER_API_KEY
258 | ```
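
Now you should be able to notify via Pushover
```
python script/spider.py --config config/prod.cfg --notify pushover
```

The notification itself is one POST to Pushover's public messages API; a sketch (keys from the config above, message text assumed; not the project's exact code):

```python
# Sketch of a Pushover notification via the public messages API;
# illustrative only, not the project's actual code.
import requests

requests.post('https://api.pushover.net/1/messages.json', data={
    'token': 'PUSHOVER_API_KEY',
    'user': 'PUSHOVER_USER_KEY',
    'message': 'Claimed eBook: Example Book',
})
```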
259 |
260 | ### Heroku
261 |
262 | Create a new branch
263 | ```
264 | git checkout -b heroku-scheduler
265 | ```
266 |
267 | Update the `.gitignore` and commit your changes
268 | ```bash
269 | # remove
270 | config/prod.cfg
271 | config/client_secrets.json
272 | config/auth_token.json
273 | # add
274 | dev/
275 | config/dev.cfg
276 | config/prod_example.cfg
277 | ```
278 |
279 | Create, configure and deploy the scheduler
280 | ```bash
281 | heroku login
282 | # create a new app
283 | heroku create APP_NAME --region eu
284 | # or if you already have an existing app
285 | heroku git:remote -a APP_NAME
286 |
287 | # deploy your app
288 | git push -u heroku heroku-scheduler:master
289 | heroku ps:scale clock=1
290 |
291 | # useful commands
292 | heroku ps
293 | heroku logs --ps clock.1
294 | heroku logs --tail
295 | heroku run bash
296 | ```
297 |
298 | Update `script/scheduler.py` with your own preferences.
299 |
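The clock process is a small APScheduler loop; a minimal sketch of what `script/scheduler.py` might look like (the schedule and spider arguments are assumptions):

```python
# Minimal clock-process sketch with APScheduler (cf. the Heroku Clock
# Processes article linked below); schedule and arguments are assumed.
from subprocess import call
from apscheduler.schedulers.blocking import BlockingScheduler

sched = BlockingScheduler()

@sched.scheduled_job('cron', hour=9)
def claim_daily_ebook():
    call(['python', 'script/spider.py',
          '--config', 'config/prod.cfg',
          '--upload', 'googledrive',
          '--store', 'firebase',
          '--notify', 'gmail'])

sched.start()
```
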
300 | More info about Heroku [Scheduler](https://devcenter.heroku.com/articles/scheduler), [Clock Processes](https://devcenter.heroku.com/articles/clock-processes-python), [Add-on](https://elements.heroku.com/addons/scheduler) and [APScheduler](http://apscheduler.readthedocs.io/en/latest/userguide.html)
301 |
302 | ### Docker
303 |
304 | Build your image
305 | ```
306 | docker build -t niqdev/packtpub-crawler:2.4.0 .
307 | ```
308 |
309 | Run manually
310 | ```
311 | docker run \
312 | --rm \
313 | --name my-packtpub-crawler \
314 | niqdev/packtpub-crawler:2.4.0 \
315 | python script/spider.py --config config/prod.cfg
316 | ```
317 |
318 | Run the scheduled crawler in the background
319 | ```
320 | docker run \
321 | --detach \
322 | --name my-packtpub-crawler \
323 | niqdev/packtpub-crawler:2.4.0
324 |
325 | # useful commands
326 | docker exec -i -t my-packtpub-crawler bash
327 | docker logs -f my-packtpub-crawler
328 | ```
329 |
330 | Alternatively, you can pull this [fork](https://github.com/kuchy/packtpub-crawler/tree/docker_cron) from [Docker Hub](https://hub.docker.com/r/kuchy/packtpub-crawler/)
331 | ```
332 | docker pull kuchy/packtpub-crawler
333 | ```
334 |
335 | ### Cron job
336 | Add this to your crontab to run the job daily at 9 AM:
337 | ```
338 | crontab -e
339 |
340 | 00 09 * * * cd PATH_TO_PROJECT/packtpub-crawler && /usr/bin/python script/spider.py --config config/prod.cfg >> /tmp/packtpub.log 2>&1
341 | ```
342 |
343 |
344 | ### Systemd service
345 | Create two files in `/etc/systemd/system`:
346 |
347 | 1. packtpub-crawler.service
348 | ```
349 | [Unit]
350 | Description=run packtpub-crawler
351 |
352 | [Service]
353 | User=USER_THAT_SHOULD_RUN_THE_SCRIPT
354 | # run from the project root so relative paths like config/prod.cfg resolve
355 | WorkingDirectory=PATH_TO_PROJECT/packtpub-crawler
356 | ExecStart=/usr/bin/python2.7 PATH_TO_PROJECT/packtpub-crawler/script/spider.py -c config/prod.cfg
355 |
356 | [Install]
357 | WantedBy=multi-user.target
358 | ```
359 |
360 | 2. packtpub-crawler.timer
361 | ```
362 | [Unit]
363 | Description=Runs packtpub-crawler every day at 7:00
364 |
365 | [Timer]
366 | OnBootSec=10min
367 | OnActiveSec=1s
368 | OnCalendar=*-*-* 07:00:00
369 | Unit=packtpub-crawler.service
370 | Persistent=true
371 |
372 | [Install]
373 | WantedBy=multi-user.target
374 | ```
375 |
376 | Enable the timer with `sudo systemctl enable packtpub-crawler.timer`.
377 | You can test the service with `sudo systemctl start packtpub-crawler.timer` and follow the output with `sudo journalctl -u packtpub-crawler.service -f`.
378 |
379 |
380 | ### Newsletter
381 | The script also downloads the free eBooks from the weekly Packtpub newsletter.
382 | The [URL](https://goo.gl/kUciut) is generated by a Google Apps Script which parses all the mails.
383 | You can get the code [here](https://gist.github.com/juzim/af0ef80f1233de51614d88551514b0ad); if you want to see the actual script, clone the [spreadsheet](https://docs.google.com/spreadsheets/d/1jN5gV45uVkE0EEF4Nb-yVNfIr3o8OoiVveUZJRMiLFw) and go to `Tools > Script editor...`.
384 |
385 | To use your own source, update the config
386 | ```
387 | url.bookFromNewsletter=https://goo.gl/kUciut
388 | ```
389 |
390 | The URL should point to a plain-text file containing only the newsletter URL (no semicolons, HTML, JSON, etc.).
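
For example, consuming such a source takes a single request (a sketch, not the project's actual code):

```python
# Sketch: the newsletter source must yield a bare URL when fetched;
# illustrative only, not the project's actual code.
import requests

newsletter_url = requests.get('https://goo.gl/kUciut').text.strip()
print(newsletter_url)
```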
391 |
392 | You can also clone the [spreadsheet](https://docs.google.com/spreadsheets/d/1jN5gV45uVkE0EEF4Nb-yVNfIr3o8OoiVveUZJRMiLFw) to use your own Gmail account. Subscribe to the [newsletter](https://www.packtpub.com) (at the bottom of the page) and create a filter to tag your mails accordingly.
393 |
394 |
395 | ### Troubleshooting
396 | * ImportError: No module named paramiko
397 |
398 | Install paramiko with `sudo -H pip install paramiko --ignore-installed`
399 |
400 | * Failed building wheel for cryptography
401 |
402 | Install missing dependencies as described [here](https://cryptography.io/en/latest/installation/#building-cryptography-on-windows)
403 |
404 | ### virtualenv
405 |
406 | ```
407 | # install pip + setuptools
408 | curl https://bootstrap.pypa.io/get-pip.py | python -
409 |
410 | # upgrade pip
411 | pip install -U pip
412 |
413 | # install virtualenv globally
414 | sudo pip install virtualenv
415 |
416 | # create virtualenv
417 | virtualenv env
418 |
419 | # activate virtualenv
420 | source env/bin/activate
421 |
422 | # verify virtualenv
423 | which python
424 | python --version
425 |
426 | # deactivate virtualenv
427 | deactivate
428 | ```
429 |
430 | ### Development (only for spidering)
431 | Install the Node dependencies with `npm install` inside `dev/`, then run a simple static server with
432 | ```
433 | node dev/server.js
434 | ```
435 | and test the crawler with
436 | ```
437 | python script/spider.py --dev --config config/dev.cfg --all
438 | ```
439 |
440 | ### Disclaimer
441 |
442 | This project is just a proof of concept and is not intended for any illegal use. I'm not responsible for any damage or abuse; use it at your own risk.
443 |
--------------------------------------------------------------------------------