├── .gitignore ├── .travis.yml ├── LICENSE ├── README.rst ├── requirements.txt ├── scribdl ├── __init__.py └── scribdl.py ├── setup.py └── test ├── expect_text.txt ├── test_images.py └── test_text.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | .cache/ 4 | scribd_downloader.egg-info/ 5 | build/ 6 | dist/ 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.4" 5 | - "3.5" 6 | - "3.6" 7 | install: pip install -r requirements.txt 8 | script: python -m pytest test 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Ritiek Malhotra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Scribd-Downloader 2 | ================= 3 | 4 | |Build Status| 5 | 6 | - This python script allows downloading of Scribd documents. 7 | 8 | - It does not matter if the pages are blurred or require 9 | authentication, this script will still do the job. 10 | 11 | - There are two types of documents on Scribd: 12 | 13 | - Documents made up using a collection of images and 14 | - Actual documents where the text can be selected, copied etc. 15 | 16 | This script takes a different approach to both of them: 17 | 18 | - Documents consisting of a collection of images is straightforward and 19 | this script will simply download the induvidual images which can 20 | later be combined into a PDF using a suitable software. Simple. 21 | 22 | - Actual documents where the text can be selected are hard to tackle. 23 | If you feed such a document in this script, only the text present in 24 | document will be downloaded. I do not know much about JS and since 25 | Scribd uses JS to combine text and images for each induvidual page, I 26 | do not yet know how they do it. Ideas welcome on combining images and 27 | text! 28 | 29 | Installation 30 | ------------ 31 | 32 | :: 33 | 34 | pip install scribd-downloader 35 | 36 | or if you like to live on the bleeding edge: 37 | 38 | :: 39 | 40 | python setup.py install 41 | 42 | Usage 43 | ----- 44 | 45 | :: 46 | 47 | usage: scribdl [-h] [-i] DOC 48 | 49 | A Scribd-Downloader that actually works 50 | 51 | positional arguments: 52 | DOC scribd document to download 53 | 54 | optional arguments: 55 | -h, --help show this help message and exit 56 | -i, --images download document made up of images 57 | 58 | - To download text from document containing selectable text: 59 | - example: 60 | ``scribdl https://www.scribd.com/document/55949937/33-Strategies-of-War`` 61 | 62 | (Text will be saved side by side in a ``.txt`` file in your current 63 | working directory) 64 | 65 | - To download document containing images; use the ``--images`` option (the tool cannot figure out this on its own): 66 | - example: 67 | ``scribdl -i http://scribd.com/doc/17142797/Case-in-Point`` 68 | 69 | (Images will be saved in your current working directory) 70 | 71 | Disclaimer 72 | ---------- 73 | 74 | Downloading books from Scribd for free maybe prohibited. This tool is 75 | meant for educational purposes only. Please support the authors by buying 76 | their titles. 77 | 78 | License 79 | ------- 80 | 81 | ``The MIT License`` 82 | 83 | .. |Build Status| image:: https://travis-ci.org/ritiek/scribd-downloader.svg?branch=master 84 | :target: https://travis-ci.org/ritiek/scribd-downloader 85 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | BeautifulSoup4 2 | requests 3 | -------------------------------------------------------------------------------- /scribdl/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.3' 2 | 3 | from .scribdl import * 4 | -------------------------------------------------------------------------------- /scribdl/scribdl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from bs4 import BeautifulSoup 4 | import requests 5 | import shutil 6 | import sys 7 | import argparse 8 | 9 | 10 | def get_arguments(): 11 | parser = argparse.ArgumentParser( 12 | description='A Scribd-Downloader that actually works') 13 | 14 | parser.add_argument( 15 | 'doc', 16 | metavar='DOC', 17 | type=str, 18 | help='scribd document to download') 19 | parser.add_argument( 20 | '-i', 21 | '--images', 22 | help="download document made up of images", 23 | action='store_true', 24 | default=False) 25 | 26 | return parser.parse_args() 27 | 28 | 29 | # fix encoding issues in python2 30 | def fix_encoding(query): 31 | if sys.version_info > (3, 0): 32 | return query 33 | else: 34 | return query.encode('utf-8') 35 | 36 | 37 | def save_image(jsonp, imagename): 38 | replacement = jsonp.replace('/pages/', '/images/').replace('jsonp', 'jpg') 39 | response = requests.get(replacement, stream=True) 40 | 41 | with open(imagename, 'wb') as out_file: 42 | shutil.copyfileobj(response.raw, out_file) 43 | 44 | 45 | def save_text(jsonp, filename): 46 | response = requests.get(url=jsonp).text 47 | page_no = response[11:12] 48 | 49 | response_head = ( 50 | response).replace('window.page' + page_no + '_callback(["', 51 | '').replace('\\n', '').replace('\\', '').replace( 52 | '"]);', '') 53 | soup_content = BeautifulSoup(response_head, 'html.parser') 54 | 55 | for x in soup_content.find_all('span', {'class': 'a'}): 56 | xtext = fix_encoding(x.get_text()) 57 | print(xtext) 58 | 59 | extraction = xtext + '\n' 60 | with open(filename, 'a') as feed: 61 | feed.write(extraction) 62 | 63 | 64 | # detect image and text 65 | def save_content(jsonp, images, train, title): 66 | if not jsonp == '': 67 | if images: 68 | imagename = title + '_' + str(train) + '.jpg' 69 | print('Downloading image to ' + imagename) 70 | save_image(jsonp, imagename) 71 | else: 72 | save_text(jsonp, (title + '.txt')) 73 | train += 1 74 | 75 | return train 76 | 77 | 78 | def sanitize_title(title): 79 | '''Remove forbidden characters from title that will prevent OS from creating directory. (For Windows at least.) 80 | 81 | Also change ' ' to '_' to preserve previous behavior.''' 82 | 83 | forbidden_chars = " *\"/\<>:|" 84 | replace_char = "_" 85 | 86 | for ch in forbidden_chars: 87 | title = title.replace(ch, replace_char) 88 | 89 | return title 90 | 91 | 92 | # the main function 93 | def get_scribd_document(url, images): 94 | response = requests.get(url=url).text 95 | soup = BeautifulSoup(response, 'html.parser') 96 | 97 | title = soup.find('title').get_text()#.replace(' ', '_') 98 | title = sanitize_title(title) # a bit more thorough 99 | 100 | if not images: 101 | print('Extracting text to ' + title + '.txt\n') 102 | 103 | print(title + '\n') 104 | 105 | js_text = soup.find_all('script', type='text/javascript') 106 | train = 1 107 | 108 | for opening in js_text: 109 | 110 | for inner_opening in opening: 111 | portion1 = inner_opening.find('https://') 112 | 113 | if not portion1 == -1: 114 | portion2 = inner_opening.find('.jsonp') 115 | jsonp = inner_opening[portion1:portion2+6] 116 | 117 | train = save_content(jsonp, images, train, title) 118 | 119 | 120 | def command_line(): 121 | args = get_arguments() 122 | url = args.doc 123 | images = args.images 124 | get_scribd_document(url, images) 125 | 126 | 127 | if __name__ == '__main__': 128 | 129 | command_line() 130 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import scribdl 5 | 6 | with open("README.rst", "r") as f: 7 | long_description = f.read() 8 | 9 | setup(name='scribd-downloader', 10 | version=scribdl.__version__, 11 | description=' A scribd-downloader that actually works', 12 | long_description=long_description, 13 | author='Ritiek Malhotra', 14 | author_email='ritiekmalhotra123@gmail.com', 15 | packages = find_packages(), 16 | entry_points={ 17 | 'console_scripts': [ 18 | 'scribdl = scribdl.scribdl:command_line', 19 | ] 20 | }, 21 | url='https://www.github.com/ritiek/scribd-downloader', 22 | keywords=['scribd-downloader', 'documents', 'command-line', 'python'], 23 | license='MIT', 24 | download_url='https://github.com/ritiek/scribd-downloader/archive/v' + scribdl.__version__ + '.tar.gz', 25 | classifiers=[], 26 | install_requires=[ 27 | 'requests', 28 | 'BeautifulSoup4', 29 | ] 30 | ) 31 | -------------------------------------------------------------------------------- /test/expect_text.txt: -------------------------------------------------------------------------------- 1 | The 33 Strategies of War  2 | 1 3 | The 33 Strategies of War 4 | The 33 Strategies of War 5 | Author 6 | Robert Greene (author) 7 | Country 8 | United States 9 | Language 10 | English 11 | Genre(s) 12 | Business, Management, Military History, Psychology, self-improvement 13 | Publisher 14 | Penguin Group (HC); HighBridge Audio (CD) 15 | Publication date 16 | January and April 2006 17 | Media type 18 | Print (Hardcover) and CD 19 | Pages 20 | 496 21 | ISBN 22 | ISBN 0-670-03457-6 (HC); 978-1-59887-091-6 (CD) 23 | The 33 Strategies of War 24 | by Robert Greene is a "guide to the subtle social game of everyday life informed by the ... 25 | military principles in war.". 26 | [1] 27 | It is composed of discussions and examples on offensive and defensive strategies 28 | from a wide variety of people and conditions such as Napoleon Bonaparte, Lawrence of Arabia, Alexander the Great, 29 | and the Tet Offensive. 30 | [2] 31 | The scope of the book is broad, applying not only to violent conflicts but also social 32 | conflicts such as family quarrels and business negotiations. 33 | The book is di 34 | vided into 35 | five parts: Self-Directed Warfare, Organizational (Team) Warfare, Defensive Warfare, 36 | Offensive Warfare and Unconventional (Dirty) Warfare. 37 | [1] 38 | Each part contains a differing number of strategies, each 39 | in a chapter. Each chapter has a similar layout. Descriptions of battles, political and business situations are 40 | accompanied by Greene's interpretation. There are occasional instructional sections followed by examples. All 41 | chapters end with a "Reversal" to give a brief discussion of where the strategy may not apply, a contrary view or 42 | defense. Throughout the book Mr. Greene includes quotes from a variety of sources. These are incorporated in the 43 | margins and between sections. 44 | Although one reviewer has called the book "an indispensable book, [which] provides all the psychological 45 | ammunition you need to overcome patterns of failure and forever gain the upper hand," 46 | [3] 47 | another one found it 48 | "perplexing 49 |  — 50 | if not downright unhealthy 51 |  — 52 | [to publish] a book on the lessons of war for everything but war at a 53 | time when we [Canada] are, er, at war." 54 | [4] 55 | Yet another reviewer found the book's coverage of military history 56 | informative, but the political tales "mostly foolish or just plain wrong". 57 | [5] 58 | On Amazon.com, the majority of the 59 | reviews on the book are positive. 60 | The 33 Strategies of War  61 | was part of the reading list for youths attending the Indigenous Leadership Forum 62 | organised by the University of Victoria, which aimed to redesign radical Indigenous politics and the Indigenist 63 | movement. 64 | [6] 65 | It is also read by students attending a Southwestern Baptist Theological Seminary course in Christian 66 | apologetics. 67 | [7] 68 | -------------------------------------------------------------------------------- /test/test_images.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import scribdl 4 | import os 5 | 6 | 7 | URL = 'http://scribd.com/doc/17142797/Case-in-Point' 8 | images = True 9 | 10 | 11 | def test_jsonp(): 12 | expect_jsonp = 'https://html1-f.scribdassets.com/6uj1tnfc00bk97m/pages/4-8e53969a8b.jsonp' 13 | 14 | response = requests.get(url=URL).text 15 | soup = BeautifulSoup(response, 'html.parser') 16 | 17 | js_text = soup.find_all('script', type='text/javascript') 18 | inner_opening = js_text[22].get_text() 19 | 20 | portion1 = inner_opening.find('https://') 21 | portion2 = inner_opening.find('.jsonp') 22 | global jsonp 23 | jsonp = inner_opening[portion1:portion2+6] 24 | 25 | assert jsonp == expect_jsonp 26 | 27 | 28 | def test_image(): 29 | expect_image = True 30 | scribdl.save_image(jsonp, 'testimage.jpg') 31 | image = os.path.isfile('testimage.jpg') 32 | 33 | if image: 34 | os.remove('testimage.jpg') 35 | 36 | assert image == expect_image 37 | -------------------------------------------------------------------------------- /test/test_text.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import scribdl 4 | import os 5 | 6 | 7 | URL = 'https://www.scribd.com/document/55949937/33-Strategies-of-War' 8 | images = False 9 | 10 | 11 | def test_jsonp(): 12 | expect_jsonp = 'https://html2-f.scribdassets.com/8u7q15n1q8z07to/pages/1-a9de44b065.jsonp' 13 | 14 | response = requests.get(url=URL).text 15 | soup = BeautifulSoup(response, 'html.parser') 16 | 17 | js_text = soup.find_all('script', type='text/javascript') 18 | inner_opening = js_text[16].get_text() 19 | 20 | portion1 = inner_opening.find('https://') 21 | portion2 = inner_opening.find('.jsonp') 22 | global jsonp 23 | jsonp = inner_opening[portion1:portion2+6] 24 | 25 | assert jsonp == expect_jsonp 26 | 27 | 28 | def test_text(): 29 | with open('test/expect_text.txt', 'r') as textin: 30 | expect_text = textin.read() 31 | 32 | scribdl.save_text(jsonp, 'text.txt') 33 | 34 | with open('text.txt', 'r') as textin: 35 | text = textin.read() 36 | 37 | if os.path.isfile('text.txt'): 38 | os.remove('text.txt') 39 | 40 | assert text == expect_text 41 | --------------------------------------------------------------------------------