├── LICENSE ├── README.md ├── requirements.txt ├── setup.py └── slideshare_dl └── __main__.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Neel Basak 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | slideshare-dl logo 3 |

4 | 5 |

slideshare-dl

6 | 7 |

8 | A simple, multi-threaded, CLI slideshare presentation downloader 🚀 9 |
10 | No login required! 11 |

"""Packaging script for slideshare-dl."""

import os

from setuptools import setup

# Directory containing this setup.py, so that data files are found no matter
# which working directory the build is started from.
HERE = os.path.abspath(os.path.dirname(__file__))


def read_contents(fname):
    """Return the text content of a file.

    Args:
        fname (string): file path; relative paths are resolved against the
            directory that contains this setup.py

    Returns:
        string: decoded (utf-8) content of the file
    """
    with open(os.path.join(HERE, fname), encoding="utf-8") as f:
        return f.read()


def read_requirements(fname):
    """Return the requirement specifiers listed in a requirements file.

    Blank lines and "#" comment lines are dropped, because setuptools rejects
    them as invalid requirement strings.

    Args:
        fname (string): path of the requirements file

    Returns:
        list of string: one entry per requirement
    """
    stripped = (line.strip() for line in read_contents(fname).splitlines())
    return [line for line in stripped if line and not line.startswith("#")]


setup(
    name="slideshare-dl",
    version="1.0",
    description="A simple, multi-threaded, CLI slideshare presentation downloader.",
    author="Neel Basak",
    author_email="neelfrost@gmail.com",
    # The "license" field is a short identifier, not the full license text.
    license="MIT",
    packages=["slideshare_dl"],
    install_requires=read_requirements("requirements.txt"),
    entry_points={"console_scripts": ["slideshare-dl = slideshare_dl.__main__:main"]},
    classifiers=[
        "Environment :: Console",
        # Must be an exact entry from the official trove classifier list.
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
"""Command-line tool that downloads a slideshare presentation.

Slides are saved as images inside a "slides" folder in the current working
directory and, unless disabled, combined into a single pdf.
"""

import argparse
import os
import shutil
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import freeze_support
from sys import exit
from urllib.parse import urlparse

import img2pdf
import requests
from bs4 import BeautifulSoup

SLIDES_FOLDER = os.path.join(os.getcwd(), "slides")

# Seconds to wait for any single HTTP request before giving up, so that a
# stalled connection cannot hang the program forever.
REQUEST_TIMEOUT = 30

# "\x1b[1K" erases from the cursor back to the start of the line and "\r"
# returns the cursor there, so progress messages overwrite each other.
CLEAR_LINE = "\x1b[1K\r"


def create_parser():
    """Create CLI parser using argparse

    Returns:
        args namespace with "url" (string) and "nopdf" (bool)
    """

    # Init parser
    parser = argparse.ArgumentParser(
        description="Download a slideshare presentation.",
    )

    # Add args
    parser.add_argument(
        "url",
        type=str,
        help="Slideshare presentation url.",
    )
    parser.add_argument(
        "--nopdf",
        default=False,
        help='Do not combine slides into a pdf. (Individual slides are saved in "slides" folder)',
        action="store_true",
    )

    return parser.parse_args()


def _is_slideshare_url(url):
    """Return True if the host of url is slideshare.net or one of its subdomains."""
    host = urlparse(url).hostname or ""
    return host == "slideshare.net" or host.endswith(".slideshare.net")


def _highest_res_url(srcset):
    """Return the url of the last candidate of a srcset value, or None.

    A srcset is a csv of "<url> <descriptor>" candidates, with the last one
    being the highest resolution. Surrounding whitespace, the descriptor and
    any query string are removed from the returned url.
    """
    candidates = [part.strip() for part in (srcset or "").split(",") if part.strip()]
    if not candidates:
        return None
    return candidates[-1].split()[0].split("?")[0]


def _pdf_name_from_url(url):
    """Return a pdf base name taken from the last path segment of url.

    The query string and trailing slashes are ignored; "presentation" is used
    when the url has no usable path segment.
    """
    path = urlparse(url).path.rstrip("/")
    return path.split("/")[-1] or "presentation"


def download_slide(idx, image_url, image_path):
    """Use requests module to download a slide (image)

    Args:
        idx (int): index of slide
        image_url (string): url of slide
        image_path (string): save path of slide

    Raises:
        requests.RequestException: if the download fails or the server
            answers with an error status
    """

    # Print slide being downloaded
    print(CLEAR_LINE + f"Downloading slide: {idx}", end="")

    # Fetch first and only then create the file: a failed request must not
    # leave an empty file behind that a later run would treat as downloaded.
    response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    # Save slide in "slides" folder
    with open(image_path, "wb") as image:
        image.write(response.content)


def download_presentation(url):
    """Download a slideshare presentation

    Exits the program with an error message if the url is invalid, the page
    cannot be fetched, no slides are found or any slide fails to download.

    Args:
        url (string): url of slideshare presentation
    """

    # Exit if url does not belong to slideshare
    if not _is_slideshare_url(url):
        exit("Invalid link...")

    # Scrape url for slide images
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as err:
        exit(f"Could not fetch presentation: {err}")
    soup = BeautifulSoup(page.content, "html.parser")
    images = soup.find_all("img", class_="slide-image")

    # Exit if presentation not found
    if not images:
        exit("No slides were found...")
    print(f"Number of slides to be downloaded: {len(images)}")

    # Make "slides" dir in cwd
    os.makedirs(SLIDES_FOLDER, exist_ok=True)

    # Width of the zero padded slide index, so file names sort in slide order
    index_width = len(str(len(images)))
    pending = {}
    failed = []

    # Parallelize slide downloading
    with ThreadPoolExecutor() as executor:
        for idx, image in enumerate(images, start=1):
            # Prefer the highest resolution srcset candidate, fall back to src
            image_url = _highest_res_url(image.get("srcset") or image.get("src"))
            if image_url is None:
                failed.append(idx)
                continue

            # Format image name to include slide index (with leading zeros)
            image_name = f"{str(idx).zfill(index_width)}-{image_url.split('/')[-1]}"
            # Save path of image (cwd/slides/image_name)
            image_path = os.path.join(SLIDES_FOLDER, image_name)

            # Check if slide is already downloaded
            if os.path.isfile(image_path):
                print(CLEAR_LINE + f"Slide: {idx} exists", end="")
            else:
                future = executor.submit(download_slide, idx, image_url, image_path)
                pending[future] = idx

    # Leaving the "with" block waited for all workers; collect their failures
    # instead of silently dropping exceptions raised inside the threads.
    failed.extend(idx for future, idx in pending.items() if future.exception() is not None)

    if failed:
        print(CLEAR_LINE, end="")
        exit("Failed to download slide(s): " + ", ".join(str(idx) for idx in sorted(failed)))

    print(CLEAR_LINE + "Slides downloaded")


def convert_to_pdf(pdf_name, no_pdf=False):
    """Combine set of images within 'slides' folder into a pdf using img2pdf

    The "slides" folder is removed once the pdf has been written.

    Args:
        pdf_name (string): name of the final pdf (without extension)
        no_pdf (bool): True to skip pdf generation, False to generate the pdf
    """

    if no_pdf:
        return

    # Get all slides sorted by name. os.listdir returns entries in arbitrary
    # order, so sort explicitly; zero padded indices make name order equal
    # slide order.
    slides = sorted(
        path
        for path in (os.path.join(SLIDES_FOLDER, name) for name in os.listdir(SLIDES_FOLDER))
        if os.path.isfile(path)
    )
    if not slides:
        exit("No slides available to combine into a pdf...")

    print(CLEAR_LINE + "Generating pdf...", end="")

    # Convert before opening the output file, so that a conversion error does
    # not leave an empty pdf behind.
    pdf_bytes = img2pdf.convert(slides)
    with open(f"{pdf_name}.pdf", "wb") as pdf:
        pdf.write(pdf_bytes)

    print(CLEAR_LINE + f"Generated: {pdf_name}.pdf")

    # Remove "slides" folder
    shutil.rmtree(SLIDES_FOLDER)


def main():
    """Entry point: parse CLI args, download the slides, optionally build the pdf."""
    freeze_support()
    args = create_parser()
    download_presentation(args.url)
    convert_to_pdf(_pdf_name_from_url(args.url), no_pdf=args.nopdf)


if __name__ == "__main__":
    main()