slideshare-dl

├── LICENSE
├── README.md
├── requirements.txt
├── setup.py
└── slideshare_dl
    └── __main__.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Neel Basak
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <p align="center">
 2 |     <img src="https://raw.githubusercontent.com/Neelfrost/github-assets/main/slideshare/logo.png" alt="slideshare-dl logo" width="192">
 3 | </p>
 4 | 
 5 | <h1 align="center">slideshare-dl</h1>
 6 | 
 7 | <p align="center">
 8 |   <b>A simple, multi-threaded, CLI slideshare presentation downloader 🚀</b>
 9 |   <br>
10 |   <b>No login required!</b>
11 | </p>
12 | 
13 | <img src="https://raw.githubusercontent.com/Neelfrost/github-assets/main/slideshare/demo.gif" width="100%">
14 | 
15 | ## Installation
16 | 
17 | Clone repo:
18 | 
19 | ```powershell
20 | git clone https://github.com/Neelfrost/slideshare-dl.git; cd .\slideshare-dl
21 | ```
22 | 
23 | Install using pip:
24 | 
25 | ```powershell
26 | pip install .
27 | ```
28 | 
29 | ## Usage
30 | 
31 | ```powershell
32 | slideshare-dl.exe --help
33 | ```
34 | 
35 | ```powershell
36 | usage: slideshare-dl [-h] [--nopdf] url
37 | 
38 | Download a slideshare presentation.
39 | 
40 | positional arguments:
41 |   url         Slideshare presentation url.
42 | 
43 | options:
44 |   -h, --help  show this help message and exit
45 |   --nopdf     Do not combine slides into a pdf. (Individual slides are saved in "slides" folder)
46 | ```
47 | 
48 | ## Todo
49 | 
50 | -   [x] Speed up download using multi-threading
51 | -   [ ] Implement OCR
52 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.11.1
2 | img2pdf>=0.4.4
3 | requests>=2.27.1
4 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | 
 4 | def read_contents(fname):
 5 |     with open(fname, encoding="utf-8") as f:
 6 |         return f.read()
 7 | 
 8 | 
 9 | setup(
10 |     name="slideshare-dl",
11 |     version="1.0",
12 |     description="A simple, multi-threaded, CLI slideshare presentation downloader.",
13 |     author="Neel Basak",
14 |     author_email="neelfrost@gmail.com",
15 |     license=read_contents("LICENSE"),
16 |     packages=["slideshare_dl"],
17 |     install_requires=read_contents("requirements.txt").splitlines(),
18 |     entry_points={"console_scripts": ["slideshare-dl = slideshare_dl.__main__:main"]},
19 |     classifiers=[
20 |         "Environment :: Console",
21 |         "License :: MIT",
22 |         "Operating System :: OS Independent",
23 |     ],
24 | )
25 | 


--------------------------------------------------------------------------------
/slideshare_dl/__main__.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import shutil
  4 | from concurrent.futures import ThreadPoolExecutor
  5 | from multiprocessing import freeze_support
  6 | from sys import exit
  7 | 
  8 | import img2pdf
  9 | import requests
 10 | from bs4 import BeautifulSoup
 11 | 
# Absolute path of the "slides" folder, resolved against the working directory at import time
SLIDES_FOLDER = os.path.join(os.getcwd(), "slides")
 13 | 
 14 | 
 15 | def create_parser():
 16 |     """Create CLI parser using argparse
 17 | 
 18 |     Returns:
 19 |             args namespace
 20 |     """
 21 | 
 22 |     # Init parser
 23 |     parser = argparse.ArgumentParser(
 24 |         description="Download a slideshare presentation.",
 25 |     )
 26 | 
 27 |     # Add args
 28 |     parser.add_argument(
 29 |         "url",
 30 |         type=str,
 31 |         help="Slideshare presentation url.",
 32 |     )
 33 |     parser.add_argument(
 34 |         "--nopdf",
 35 |         default=False,
 36 |         help='Do not combine slides into a pdf. (Individual slides are saved in "slides" folder)',
 37 |         action="store_true",
 38 |     )
 39 | 
 40 |     return parser.parse_args()
 41 | 
 42 | 
 43 | def download_slide(idx, image_url, image_path):
 44 |     """Use requests module to download a slide (image)
 45 | 
 46 |     Args:
 47 |         idx (string): index of slide
 48 |         image_url (string): url of slide
 49 |         image_path (string): save path of slide
 50 |     """
 51 | 
 52 |     # Print slide being downloaded
 53 |     print("\x1b[1K\r" + f"Downloading slide: {idx}", end="")
 54 |     # Download slide, save it in "slides" folder
 55 |     with open(image_path, "wb") as image:
 56 |         image.write(requests.get(image_url).content)
 57 | 
 58 | 
 59 | def download_presentation(url):
 60 |     """Download a slideshare presentation
 61 | 
 62 |     Args:
 63 |         url (string): url of slideshare presentation
 64 |     """
 65 | 
 66 |     # Exit if url does not belong to slideshare
 67 |     if r"www.slideshare.net" not in url:
 68 |         exit("Invalid link...")
 69 | 
 70 |     # Scrape url for slide images
 71 |     page = requests.get(url)
 72 |     soup = BeautifulSoup(page.content, "html.parser")
 73 |     images = soup.find_all("img", class_="slide-image")
 74 |     no_of_images = len(images)
 75 | 
 76 |     # Exit if presentation not found
 77 |     if not images:
 78 |         exit("No slides were found...")
 79 |     print(f"Number of slides to be downloaded: {len(images)}")
 80 | 
 81 |     # Make "slides" dir in cwd
 82 |     if not os.path.isdir(SLIDES_FOLDER):
 83 |         os.mkdir("slides")
 84 | 
 85 |     # Parallelize slide downloading
 86 |     with ThreadPoolExecutor() as executor:
 87 |         for idx, image in enumerate(images, start=1):
 88 |             # Get image url from srcset attribute (csv of image urls, with last value being the highest res)
 89 |             image_url = image.get("srcset").split(",")[-1].split("?")[0]
 90 | 
 91 |             # Format image name to include slide index (with leading zeros)
 92 |             image_name = (
 93 |                 f"{str(idx).zfill(len(str(no_of_images)))}-{image_url.split('/')[-1]}"
 94 |             )
 95 |             # Save path of image (cwd/slides/image_name)
 96 |             image_path = os.path.join("slides", image_name)
 97 | 
 98 |             # Check if slide is already downloaded
 99 |             if os.path.isfile(image_path):
100 |                 print("\x1b[1K\r" + f"Slide: {idx} exists", end="")
101 |             else:
102 |                 executor.submit(download_slide, idx, image_url, image_path)
103 | 
104 |     # "\x1b[1K" clear to end of line
105 |     print("\x1b[1K\r" + "Slides downloaded")
106 | 
107 | 
def convert_to_pdf(pdf_name, no_pdf=False):
    """Combine set of images within 'slides' folder into a pdf using img2pdf

    The "slides" folder is removed once the pdf has been written.

    Args:
        pdf_name (string): name of the final pdf (without the ".pdf" extension)
        no_pdf (bool): True to skip generation (slides are kept), False to generate a pdf
    """

    if no_pdf:
        return

    # Get all slides sorted by name: os.listdir gives no ordering guarantee, and the
    # zero padded index at the start of each file name makes name order the slide order
    slides = [
        os.path.join(SLIDES_FOLDER, slide) for slide in sorted(os.listdir(SLIDES_FOLDER))
    ]

    print("\x1b[1K\r" + "Generating pdf...", end="")

    # Combine slides into a pdf using img2pdf; convert before opening the output
    # so a failed conversion does not leave an empty pdf behind
    pdf_bytes = img2pdf.convert(slides)
    with open(f"{pdf_name}.pdf", "wb") as pdf:
        pdf.write(pdf_bytes)

    print("\x1b[1K\r" + f"Generated: {pdf_name}.pdf")

    # Remove "slides" folder
    shutil.rmtree(SLIDES_FOLDER)
132 | 
133 | 
def _pdf_name_from_url(url):
    """Derive the pdf name (without extension) from a presentation url

    Args:
        url (string): url of slideshare presentation

    Returns:
        last path segment of the url (string), or "presentation" if there is none
    """

    # Drop fragment, query string and trailing slashes so the last segment is the presentation slug
    path = url.split("#")[0].split("?")[0].rstrip("/")
    return path.split("/")[-1] or "presentation"


def main():
    """Entry point of the CLI: download the slides, then optionally combine them into a pdf"""

    # Support for frozen (bundled) executables; see the multiprocessing docs
    freeze_support()
    args = create_parser()
    download_presentation(args.url)
    convert_to_pdf(_pdf_name_from_url(args.url), no_pdf=args.nopdf)
139 | 
140 | 
# Run the CLI only when this module is executed directly, not when it is imported
if __name__ == "__main__":
    main()
143 | 


--------------------------------------------------------------------------------