├── LICENSE
├── README.md
├── requirements.txt
├── setup.py
└── slideshare_dl
└── __main__.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Neel Basak
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | slideshare-dl
6 |
7 |
8 | A simple, multi-threaded, CLI slideshare presentation downloader 🚀
9 |
10 | No login required!
11 |
12 |
13 |
14 |
15 | ## Installation
16 |
17 | Clone repo:
18 |
19 | ```powershell
20 | git clone https://github.com/Neelfrost/slideshare-dl.git; cd .\slideshare-dl
21 | ```
22 |
23 | Install using pip:
24 |
25 | ```powershell
26 | pip install .
27 | ```
28 |
29 | ## Usage
30 |
31 | ```powershell
32 | slideshare-dl.exe --help
33 | ```
34 |
35 | ```powershell
36 | usage: slideshare-dl [-h] [--nopdf] url
37 |
38 | Download a slideshare presentation.
39 |
40 | positional arguments:
41 | url Slideshare presentation url.
42 |
43 | options:
44 | -h, --help show this help message and exit
45 | --nopdf Do not combine slides into a pdf. (Individual slides are saved in "slides" folder)
46 | ```
47 |
48 | ## Todo
49 |
50 | - [x] Speed up download using multi-threading
51 | - [ ] Implement OCR
52 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.11.1
2 | img2pdf>=0.4.4
3 | requests>=2.27.1
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 |
def read_contents(fname):
    """Return the complete text of a file, decoded as UTF-8.

    Args:
        fname (string): path of the file to read

    Returns:
        string: everything the file contains
    """
    with open(fname, encoding="utf-8") as source:
        text = source.read()
    return text
7 |
8 |
# Package metadata and install configuration for slideshare-dl.
setup(
    name="slideshare-dl",
    version="1.0",
    description="A simple, multi-threaded, CLI slideshare presentation downloader.",
    author="Neel Basak",
    author_email="neelfrost@gmail.com",
    # The license field takes a short name; the full text stays in the LICENSE file
    license="MIT",
    packages=["slideshare_dl"],
    # One requirement per line of requirements.txt, ignoring blank and comment lines
    install_requires=[
        line.strip()
        for line in read_contents("requirements.txt").splitlines()
        if line.strip() and not line.strip().startswith("#")
    ],
    # Installs the "slideshare-dl" command, which calls slideshare_dl.__main__.main
    entry_points={"console_scripts": ["slideshare-dl = slideshare_dl.__main__:main"]},
    classifiers=[
        "Environment :: Console",
        # "License :: MIT" is not a registered trove classifier
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
25 |
--------------------------------------------------------------------------------
/slideshare_dl/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import shutil
4 | from concurrent.futures import ThreadPoolExecutor
5 | from multiprocessing import freeze_support
6 | from sys import exit
7 |
8 | import img2pdf
9 | import requests
10 | from bs4 import BeautifulSoup
11 |
# Absolute path of the "slides" working folder; resolved against the current
# working directory once, when this module is imported.
SLIDES_FOLDER = os.path.join(os.getcwd(), "slides")
13 |
14 |
def create_parser():
    """Build the argparse CLI and parse the process arguments with it

    Returns:
        argparse namespace holding "url" (string) and "nopdf" (bool)
    """

    cli = argparse.ArgumentParser(description="Download a slideshare presentation.")

    # Required positional argument: the presentation to fetch
    cli.add_argument("url", type=str, help="Slideshare presentation url.")

    # Optional switch: keep the individual slide images instead of merging them
    cli.add_argument(
        "--nopdf",
        action="store_true",
        default=False,
        help='Do not combine slides into a pdf. (Individual slides are saved in "slides" folder)',
    )

    namespace = cli.parse_args()
    return namespace
41 |
42 |
def download_slide(idx, image_url, image_path, timeout=30):
    """Use requests module to download a slide (image)

    Args:
        idx (int): index of slide, only used in the progress message
        image_url (string): url of slide
        image_path (string): save path of slide
        timeout (float): seconds to wait on the server before giving up

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status
    """

    # "\x1b[1K" erases the line up to the cursor and "\r" returns to column 0,
    # so every progress message overwrites the previous one
    print("\x1b[1K\r" + f"Downloading slide: {idx}", end="")

    # Fetch BEFORE opening the target file: a failed request must not leave an
    # empty file behind, because an existing file is treated as "already downloaded"
    response = requests.get(image_url, timeout=timeout)
    # Do not save an HTTP error page as if it were the slide image
    response.raise_for_status()

    # Save slide in "slides" folder
    with open(image_path, "wb") as image:
        image.write(response.content)
57 |
58 |
def _highest_res_url(srcset):
    """Return the url of the last srcset candidate, without descriptor or query string."""
    # srcset is a comma separated list of "url descriptor" entries; skip empty entries
    candidates = [entry.strip() for entry in srcset.split(",") if entry.strip()]
    if not candidates:
        return ""
    # Last candidate is taken as the highest resolution; drop "2048w" and "?query"
    return candidates[-1].split()[0].split("?")[0]


def download_presentation(url):
    """Download every slide image of a slideshare presentation into the "slides" folder

    Slides already present in the folder are not downloaded again. The program
    exits with an error message if the url is not a slideshare url, the page
    cannot be fetched, no slides are found, or any slide fails to download.

    Args:
        url (string): url of slideshare presentation
    """

    # Exit if url does not belong to slideshare
    if r"www.slideshare.net" not in url:
        exit("Invalid link...")

    # Scrape url for slide images
    try:
        page = requests.get(url, timeout=30)
    except requests.RequestException as err:
        exit(f"Could not fetch presentation: {err}")
    soup = BeautifulSoup(page.content, "html.parser")
    images = soup.find_all("img", class_="slide-image")

    # Exit if presentation not found
    if not images:
        exit("No slides were found...")
    no_of_images = len(images)
    print(f"Number of slides to be downloaded: {no_of_images}")

    # Make "slides" dir; use the same absolute path the pdf step reads from
    os.makedirs(SLIDES_FOLDER, exist_ok=True)

    # Width of the zero padded index, so file names sort in slide order
    index_width = len(str(no_of_images))
    pending = {}  # future -> (slide index, save path)
    failed = []  # indices of slides that could not be downloaded

    # Parallelize slide downloading
    with ThreadPoolExecutor() as executor:
        for idx, image in enumerate(images, start=1):
            image_url = _highest_res_url(image.get("srcset") or "")
            if not image_url:
                # No usable srcset on this <img>: nothing to download
                failed.append(idx)
                continue

            # Format image name to include slide index (with leading zeros)
            image_name = f"{str(idx).zfill(index_width)}-{image_url.split('/')[-1]}"
            image_path = os.path.join(SLIDES_FOLDER, image_name)

            # Check if slide is already downloaded
            if os.path.isfile(image_path):
                print("\x1b[1K\r" + f"Slide: {idx} exists", end="")
            else:
                future = executor.submit(download_slide, idx, image_url, image_path)
                pending[future] = (idx, image_path)

    # Leaving the "with" block waits for all downloads; exceptions raised in
    # worker threads are only visible through their futures, so inspect them
    for future, (idx, image_path) in pending.items():
        if future.exception() is not None:
            failed.append(idx)
            # Remove a partial file so the next run retries this slide
            if os.path.isfile(image_path):
                os.remove(image_path)

    if failed:
        # Clear the progress line before reporting the error
        print("\x1b[1K\r", end="")
        exit(f"Failed to download slide(s): {', '.join(str(i) for i in sorted(failed))}")

    # "\x1b[1K" erases the line up to the cursor, "\r" moves back to column 0
    print("\x1b[1K\r" + "Slides downloaded")
106 |
107 |
def convert_to_pdf(pdf_name, no_pdf=False):
    """Combine set of images within 'slides' folder into a pdf using img2pdf

    The pdf is written to the current working directory and the "slides"
    folder is deleted afterwards.

    Args:
        pdf_name (string): name of the final pdf, without the ".pdf" extension
        no_pdf (bool): True to skip pdf generation (slides are kept),
            False to generate the pdf
    """

    if no_pdf:
        return

    # os.listdir returns entries in arbitrary order; sort by name so the pages
    # follow the zero padded slide index at the start of each file name
    slides = [
        os.path.join(SLIDES_FOLDER, slide) for slide in sorted(os.listdir(SLIDES_FOLDER))
    ]

    print("\x1b[1K\r" + "Generating pdf...", end="")

    # Convert before opening the output file, so a failed conversion does not
    # leave an empty pdf behind
    pdf_bytes = img2pdf.convert(slides)
    with open(f"{pdf_name}.pdf", "wb") as pdf:
        pdf.write(pdf_bytes)

    print("\x1b[1K\r" + f"Generated: {pdf_name}.pdf")

    # Remove "slides" folder
    shutil.rmtree(SLIDES_FOLDER)
132 |
133 |
def _pdf_name_from_url(url):
    """Return the last path segment of url, ignoring query string, fragment and trailing slash."""
    path = url.split("#")[0].split("?")[0].rstrip("/")
    # Fall back to a fixed name if nothing usable is left
    return path.split("/")[-1] or "presentation"


def main():
    """CLI entry point: parse arguments, download the slides, then build the pdf"""
    # multiprocessing hook for frozen executables; kept as the first call
    freeze_support()
    args = create_parser()
    download_presentation(args.url)
    # Name the pdf after the presentation slug; "?" and similar characters from a
    # query string must not end up in the file name
    convert_to_pdf(_pdf_name_from_url(args.url), no_pdf=args.nopdf)
139 |
140 |
# Run the CLI only when this file is executed directly (e.g. "python -m slideshare_dl"),
# not when it is imported
if __name__ == "__main__":
    main()
143 |
--------------------------------------------------------------------------------