├── requirements.txt
├── README.md
└── downloader.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
bs4
tqdm

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Download all images from a web page using Python

[watch demo](https://vimeo.com/723951707)

### Requirements:

1. requests
2. bs4
3. tqdm

### Install required dependencies:

```
pip install -r requirements.txt
```

### Usage:

```
python downloader.py --help
```

```
usage: downloader.py [-h] [-p PATH] url

a Python script that downloads all images from a web page

positional arguments:
  url                   the URL of the web page you want to download images
                        from

options:
  -h, --help            show this help message and exit
  -p PATH, --path PATH  the directory you want to store your images in;
                        defaults to the domain of the URL

```

### Example:

```
python downloader.py -p ermias https://ermiasbahru.vercel.app/
```

--------------------------------------------------------------------------------
/downloader.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm


def is_valid(url):
    """Check whether `url` is a valid URL with a scheme and a host."""
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_images(url):
    """Return the URLs of all images found on the page at `url`."""
    soup = bs(requests.get(url, timeout=30).content, "html.parser")
    urls = []
    for img in tqdm(soup.find_all("img"), "extracting images..."):
        img_url = img.attrs.get("src")
        if not img_url:
            # skip <img> tags that have no src attribute
            continue

        # resolve relative paths against the page URL
        img_url = urljoin(url, img_url)

        # strip any query string so the file name is clean
        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass

        if is_valid(img_url):
            urls.append(img_url)
    return urls


def download(url, pathname):
    """Download the file at `url` into the directory `pathname`."""
    # create the target directory (including parents) if it does not exist
    os.makedirs(pathname, exist_ok=True)

    resp = requests.get(url, stream=True, timeout=30)

    # total file size in bytes, 0 if the server does not report it
    file_size = int(resp.headers.get("Content-Length", 0))

    # derive the local file name from the last path segment of the URL
    filename = os.path.join(pathname, url.split("/")[-1])

    progress_bar = tqdm(resp.iter_content(1024), f"downloading {filename}",
                        total=file_size, unit="B", unit_scale=True,
                        unit_divisor=1024)

    with open(filename, "wb") as f:
        # iterate over the raw chunks and advance the bar by bytes written,
        # not by chunk count, so the percentage tracks the file size
        for data in progress_bar.iterable:
            f.write(data)
            progress_bar.update(len(data))


def main(url, path):
    imgs = get_all_images(url)
    for img in imgs:
        download(img, path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="a Python script that downloads all images from a web page")
    parser.add_argument(
        "url", help="the URL of the web page you want to download images from")
    parser.add_argument(
        "-p", "--path",
        help="the directory you want to store your images in; defaults to the domain of the URL")

    args = parser.parse_args()
    url = args.url
    path = args.path

    if not path:
        # if the path is not specified, use the domain name of the URL as the folder name
        path = urlparse(url).netloc

    main(url, path)
--------------------------------------------------------------------------------
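
The `urljoin` call in `get_all_images` is what turns relative `src` values (such as `/img/logo.png`) into absolute, downloadable URLs. A minimal illustration of that one step, using placeholder URLs:

```
from urllib.parse import urljoin

print(urljoin("https://example.com/blog/post", "/img/logo.png"))
# https://example.com/img/logo.png

print(urljoin("https://example.com/blog/post", "pic.jpg?w=300"))
# https://example.com/blog/pic.jpg?w=300  (the script strips "?w=300" afterwards)
```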
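
The script can also be reused as a module: `get_all_images` and `download` have no CLI dependencies, so they can be imported directly. A minimal sketch, assuming `downloader.py` is on the import path and using a placeholder page URL and output directory:

```
from downloader import get_all_images, download

page_url = "https://example.com/"  # placeholder page
for img_url in get_all_images(page_url):
    download(img_url, "photos")    # placeholder output directory
```

This mirrors what `main` does internally when the script is run from the command line.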