├── requirements.txt
├── README.md
└── downloader.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
bs4
tqdm

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Download all images from a web page using Python

[watch demo](https://vimeo.com/723951707)

### Requirements:

1. requests
2. bs4
3. tqdm

### Install required dependencies:

```
pip install -r requirements.txt
```

### Usage:

```
python downloader.py --help
```

```
usage: downloader.py [-h] [-p PATH] url

a Python script that downloads all images from a web page

positional arguments:
  url                   the URL of the web page you want to download images
                        from

options:
  -h, --help            show this help message and exit
  -p PATH, --path PATH  the directory you want to store your images in;
                        defaults to the domain of the URL

```

### Example:

```
python downloader.py -p ermias https://ermiasbahru.vercel.app/
```

--------------------------------------------------------------------------------
/downloader.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm


def is_valid(url):
    """Check whether `url` is a valid URL with a scheme and a host."""
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_images(url):
    """Return the URLs of all images found on the page at `url`."""
    soup = bs(requests.get(url, timeout=30).content, "html.parser")
    urls = []
    for img in tqdm(soup.find_all("img"), "extracting images..."):
        img_url = img.attrs.get("src")
        if not img_url:
            # skip <img> tags that have no src attribute
            continue

        # resolve relative paths against the page URL
        img_url = urljoin(url, img_url)

        # strip any query string so the file name is clean
        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass

        if is_valid(img_url):
            urls.append(img_url)
    return urls


def download(url, pathname):
    """Download the file at `url` into the directory `pathname`."""
    # create the target directory (including parents) if it does not exist
    os.makedirs(pathname, exist_ok=True)

    resp = requests.get(url, stream=True, timeout=30)

    # total file size in bytes, 0 if the server does not report it
    file_size = int(resp.headers.get("Content-Length", 0))

    # derive the local file name from the last path segment of the URL
    filename = os.path.join(pathname, url.split("/")[-1])

    progress_bar = tqdm(resp.iter_content(1024), f"downloading {filename}",
                        total=file_size, unit="B", unit_scale=True,
                        unit_divisor=1024)

    with open(filename, "wb") as f:
        # iterate over the raw chunks and advance the bar by bytes written,
        # not by chunk count, so the percentage tracks the file size
        for data in progress_bar.iterable:
            f.write(data)
            progress_bar.update(len(data))


def main(url, path):
    imgs = get_all_images(url)
    for img in imgs:
        download(img, path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="a Python script that downloads all images from a web page")
    parser.add_argument(
        "url", help="the URL of the web page you want to download images from")
    parser.add_argument(
        "-p", "--path",
        help="the directory you want to store your images in; defaults to the domain of the URL")

    args = parser.parse_args()
    url = args.url
    path = args.path

    if not path:
        # if the path is not specified, use the domain name of the URL as the folder name
        path = urlparse(url).netloc

    main(url, path)
--------------------------------------------------------------------------------
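
The `urljoin` call in `get_all_images` is what turns relative `src` values (such as `/img/logo.png`) into absolute, downloadable URLs. A minimal illustration of that one step, using placeholder URLs:

```
from urllib.parse import urljoin

print(urljoin("https://example.com/blog/post", "/img/logo.png"))
# https://example.com/img/logo.png

print(urljoin("https://example.com/blog/post", "pic.jpg?w=300"))
# https://example.com/blog/pic.jpg?w=300  (the script strips "?w=300" afterwards)
```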
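
The script can also be reused as a module: `get_all_images` and `download` have no CLI dependencies, so they can be imported directly. A minimal sketch, assuming `downloader.py` is on the import path and using a placeholder page URL and output directory:

```
from downloader import get_all_images, download

page_url = "https://example.com/"  # placeholder page
for img_url in get_all_images(page_url):
    download(img_url, "photos")    # placeholder output directory
```

This mirrors what `main` does internally when the script is run from the command line.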