├── DatasetScraper └── scraper.py ├── LICENSE ├── README.md ├── build └── lib │ └── datasetscraper │ ├── Scraper.py │ ├── __init__.py │ ├── downloader.py │ └── pageagent.py ├── datasetscraper.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── requires.txt └── top_level.txt ├── datasetscraper ├── Scraper.py ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── downloader.cpython-36.pyc │ ├── pageagent.cpython-36.pyc │ └── scraper.cpython-36.pyc ├── downloader.py └── pageagent.py ├── dist ├── datasetscraper-0.0.4-py3-none-any.whl └── datasetscraper-0.0.4.tar.gz └── setup.py /DatasetScraper/scraper.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from pyppeteer import launch 3 | import time 4 | from pathlib import Path 5 | import urllib 6 | import urllib.request as request 7 | import os 8 | import logging 9 | import concurrent.futures 10 | import sys 11 | from .downloader import Downloader 12 | from .pageagent import PageAgent 13 | 14 | class Scraper(): 15 | def __init__(self, logLevel = logging.WARNING, headless=True): 16 | self.configure_logging(logLevel) 17 | self.patch_pyppeteer() 18 | self.headless = headless 19 | 20 | def patch_pyppeteer(self): 21 | import pyppeteer.connection 22 | original_method = pyppeteer.connection.websockets.client.connect 23 | def new_method(*args, **kwargs): 24 | kwargs['ping_interval'] = None 25 | kwargs['ping_timeout'] = None 26 | return original_method(*args, **kwargs) 27 | 28 | pyppeteer.connection.websockets.client.connect = new_method 29 | 30 | def configure_logging(self, logLevel): 31 | logger = logging.getLogger() 32 | pyppeteer_logger = logging.getLogger('pyppeteer') 33 | logger.setLevel(logLevel) 34 | handler = logging.StreamHandler() 35 | handler.setFormatter( 36 | logging.Formatter('[%(asctime)s %(levelname)s \ 37 | %(module)s]: %(message)s')) 38 | logger.addHandler(handler) 39 | return logger 40 | 41 | def fetch_urls(self, query, engine='google', maxlist=[200], format='jpg'): 42 | logger = logging.getLogger() 43 | print("Fetching URLs..") 44 | if type(engine) is not list: engine = [ engine ] 45 | if type(format) is not list: format = [ format ] 46 | if type(maxlist) is not list: maxlist = [ maxlist for e in engine ] 47 | assert len(maxlist) == len(engine), f"Length of max ({len(maxlist)}) not same as engine ({len(engine)})" 48 | self.engine, self.format = engine, format 49 | urlDict, totalLen = {}, 0 50 | for e, max in zip(engine, maxlist): 51 | logger.info(f"Fetching URLs from {e}") 52 | urlDict[e] = asyncio.get_event_loop().run_until_complete( 53 | self.launch_engine(e, query, max)) 54 | totalLen += len(urlDict[e]) 55 | logger.info(f"Fetched {len(urlDict[e])} URLs from {e}") 56 | print(f"Fetched {len(urlDict[e])} URLs from {e}") 57 | logger = logging.getLogger() 58 | logger.info(f"Total Number of URLs fetched: {totalLen}") 59 | urlSaved = self.mixUrls(urlDict, maxlist) 60 | logger.info(f"Number of URLs saving: {len(urlSaved)}") 61 | print(f"{len(urlSaved)} URLs fetched") 62 | return urlSaved 63 | 64 | def mixUrls(self, urlDict, maxlist): 65 | urls = [] 66 | for key, max in zip(urlDict, maxlist): 67 | urlDict[key] = urlDict[key][:max] 68 | for url in urlDict[key]: 69 | if url not in urls: 70 | urls.append(url) 71 | return urls 72 | 73 | def download(self, urls, directory='images/', 74 | formats=['jpg', 'png', 'jpeg'], default='jpg', 75 | nworkers=10, timeout=30): 76 | downloader = Downloader(directory, formats, default, 77 | nworkers, timeout) 78 | 
downloader.download(urls) 79 | 80 | async def launch_engine(self, engine, query, max=200): 81 | agent = PageAgent(engine, query, max, self.headless) 82 | return await agent.get_list() 83 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. 
This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DatasetScraper 2 | Tool to create image datasets for machine learning problems by scraping search engines like Google, Bing and Baidu. 3 | 4 | # Features: 5 | - **Search engine support**: Google, Bing, Baidu. 
(in-production): Yahoo, Yandex, Duckduckgo
6 | - **Image format support**: jpg, png, svg, gif, jpeg
7 | - Fast multiprocessing enabled scraper
8 | - Very fast multithreaded downloader
9 | - Data verification after download to delete corrupt and non-image files
10 | 
11 | # Installation
12 | `pip install datasetscraper`
13 | Alternatively, you can clone this repository and install it from source.
14 | 
15 | 
16 | # About
17 | - `Scraper.Scraper()`
18 | - creates a scraper object which can be used to scrape URLs from search engines and later download them.
19 | - `Scraper.Scraper(logLevel=logging.WARNING, headless=True)` are the defaults.
20 | - headless: If False, you can see the actual pyppeteer browser
21 | - logLevel: Standard Python logging level
22 | 
23 | - `fetch_urls()`
24 | - To fetch image URLs
25 | - `fetch_urls(query, engine='google', maxlist=[200])`
26 | - query: Search query
27 | - engine: Search engine or list of search engines
28 | - maxlist: Maximum number of image URLs to fetch. If multiple search engines are specified, this must be a list giving the maximum number of images to fetch from each engine.
29 | 
30 | - `download()`
31 | - To download the fetched URLs into a directory
32 | - download(urls, directory='images/',
33 | formats=['jpg', 'png', 'jpeg'], default='jpg',
34 | nworkers=10, timeout=30)
35 | - urls: List of URLs
36 | - directory: Place on disk where downloaded images are stored
37 | - formats: List of file formats to download
38 | - nworkers: Number of threads for multithreaded download
39 | - timeout: Maximum time (in seconds) after which a download is cancelled
40 | 
41 | # Usage:
42 | - Import
43 | `from datasetscraper import Scraper`
44 | 
45 | - Defaults
46 | ```python
47 | obj = Scraper.Scraper()
48 | urls = obj.fetch_urls('kiniro mosaic')
49 | obj.download(urls, directory='kiniro_mosaic/')
50 | ```
51 | 
52 | - Specify a search engine
53 | ```python
54 | obj = Scraper.Scraper()
55 | urls = obj.fetch_urls('kiniro mosaic', engine=['google'])
56 | obj.download(urls, directory='kiniro_mosaic/')
57 | ```
58 | 
59 | - Specify a list of search engines
60 | ```python
61 | obj = Scraper.Scraper()
62 | urls = obj.fetch_urls('kiniro mosaic', engine=['google', 'bing'])
63 | obj.download(urls, directory='kiniro_mosaic/')
64 | ```
65 | 
66 | - Specify max images (default is 200)
67 | ```python
68 | obj = Scraper.Scraper()
69 | urls = obj.fetch_urls('kiniro mosaic', engine=['google', 'bing'], maxlist=[500, 300])
70 | obj.download(urls, directory='kiniro_mosaic/')
71 | ```
72 | 
73 | # FAQs
74 | - Why aren't yandex, yahoo, duckduckgo and other search engines supported?
75 | They are hard to scrape; I am working on them and will update as soon as I can.
76 | 
77 | - I set maxlist=[500]; why were fewer than 500 images downloaded?
78 | There can be several reasons for this:
79 | - Search ran out: This happens very often; Google/Bing might not have enough images for your query
80 | - Slow internet: Increase the timeout (default is 60 seconds) as follows: ```obj.download(urls, directory='kiniro_mosaic/', timeout=100)```
81 | 
82 | - How to debug?
83 | You can change the logging level when creating the scraper object: `obj = Scraper.Scraper(logLevel=logging.INFO)`
84 | 
85 | # TODO:
86 | - More search engines
87 | - Better debugging
88 | - Write documentation
89 | - Text data? Audio data?
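
# Full example
A minimal end-to-end sketch combining the options documented above. It only uses the constructor and methods shown in this README; the query string, output directory, and the `maxlist`, `nworkers` and `timeout` numbers are placeholder values, not recommended settings.

```python
import logging
from datasetscraper import Scraper

# Verbose logging and a visible browser window make it easier to see what the
# scraper is doing while it collects URLs.
obj = Scraper.Scraper(logLevel=logging.INFO, headless=False)

# Fetch up to 300 URLs from Google and 200 from Bing for a placeholder query.
urls = obj.fetch_urls('red panda', engine=['google', 'bing'], maxlist=[300, 200])

# Download with more worker threads and a longer per-image timeout.
obj.download(urls, directory='red_panda/',
             formats=['jpg', 'png', 'jpeg'],
             nworkers=20, timeout=60)
```
Raising `nworkers` speeds up the multithreaded downloader at the cost of more concurrent connections, and a larger `timeout` helps on slow connections (see the FAQs above).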
90 | 91 | # Black hole chan kawaii 92 | ![](https://i.imgur.com/6OIxoup.jpg) 93 | -------------------------------------------------------------------------------- /build/lib/datasetscraper/Scraper.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from pyppeteer import launch 3 | import time 4 | from pathlib import Path 5 | import urllib 6 | import urllib.request as request 7 | import os 8 | import logging 9 | import concurrent.futures 10 | import sys 11 | from .downloader import Downloader 12 | from .pageagent import PageAgent 13 | 14 | class Scraper(): 15 | def __init__(self, logLevel = logging.WARNING, headless=True): 16 | self.configure_logging(logLevel) 17 | self.patch_pyppeteer() 18 | self.headless = headless 19 | 20 | def patch_pyppeteer(self): 21 | import pyppeteer.connection 22 | original_method = pyppeteer.connection.websockets.client.connect 23 | def new_method(*args, **kwargs): 24 | kwargs['ping_interval'] = None 25 | kwargs['ping_timeout'] = None 26 | return original_method(*args, **kwargs) 27 | 28 | pyppeteer.connection.websockets.client.connect = new_method 29 | 30 | def configure_logging(self, logLevel): 31 | logger = logging.getLogger() 32 | pyppeteer_logger = logging.getLogger('pyppeteer') 33 | logger.setLevel(logLevel) 34 | handler = logging.StreamHandler() 35 | handler.setFormatter( 36 | logging.Formatter('[%(asctime)s %(levelname)s \ 37 | %(module)s]: %(message)s')) 38 | logger.addHandler(handler) 39 | return logger 40 | 41 | def fetch_urls(self, query, engine='google', maxlist=[200], format='jpg'): 42 | logger = logging.getLogger() 43 | print("Fetching URLs..") 44 | if type(engine) is not list: engine = [ engine ] 45 | if type(format) is not list: format = [ format ] 46 | if type(maxlist) is not list: maxlist = [ maxlist for e in engine ] 47 | assert len(maxlist) == len(engine), f"Length of max ({len(maxlist)}) not same as engine ({len(engine)})" 48 | self.engine, self.format = engine, format 49 | urlDict, totalLen = {}, 0 50 | for e, max in zip(engine, maxlist): 51 | logger.info(f"Fetching URLs from {e}") 52 | urlDict[e] = asyncio.get_event_loop().run_until_complete( 53 | self.launch_engine(e, query, max)) 54 | totalLen += len(urlDict[e]) 55 | logger.info(f"Fetched {len(urlDict[e])} URLs from {e}") 56 | print(f"Fetched {len(urlDict[e])} URLs from {e}") 57 | logger = logging.getLogger() 58 | logger.info(f"Total Number of URLs fetched: {totalLen}") 59 | urlSaved = self.mixUrls(urlDict, maxlist) 60 | logger.info(f"Number of URLs saving: {len(urlSaved)}") 61 | print(f"{len(urlSaved)} URLs fetched") 62 | return urlSaved 63 | 64 | def mixUrls(self, urlDict, maxlist): 65 | urls = [] 66 | for key, max in zip(urlDict, maxlist): 67 | urlDict[key] = urlDict[key][:max] 68 | for url in urlDict[key]: 69 | if url not in urls: 70 | urls.append(url) 71 | return urls 72 | 73 | def download(self, urls, directory='images/', 74 | formats=['jpg', 'png', 'jpeg'], default='jpg', 75 | nworkers=10, timeout=30): 76 | downloader = Downloader(directory, formats, default, 77 | nworkers, timeout) 78 | downloader.download(urls) 79 | 80 | async def launch_engine(self, engine, query, max=200): 81 | agent = PageAgent(engine, query, max, self.headless) 82 | return await agent.get_list() 83 | -------------------------------------------------------------------------------- /build/lib/datasetscraper/__init__.py: -------------------------------------------------------------------------------- 1 | name = "datasetscraper" 2 | 
-------------------------------------------------------------------------------- /build/lib/datasetscraper/downloader.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from pyppeteer import launch 3 | import time 4 | from pathlib import Path 5 | import urllib 6 | import urllib.request as request 7 | import os 8 | import logging 9 | import concurrent.futures 10 | import sys 11 | import requests as req 12 | import traceback 13 | from multiprocessing.pool import ThreadPool 14 | from fastprogress import progress_bar 15 | import imghdr 16 | 17 | class Downloader(): 18 | def __init__(self, directory='images/', formats=['jpg', 'png', 'jpeg'], 19 | default='jpg', nworkers=10, timeout=60): 20 | self.formats, self.default = formats, default 21 | self.directory = Path(directory) 22 | self.MBFACTOR = float(1 << 20) 23 | self.nworkers, self.timeout = nworkers, timeout 24 | if not os.path.exists(directory): 25 | os.makedirs(directory) 26 | 27 | def download(self, urls, timeout=200): 28 | self.timeout = timeout 29 | logger = logging.getLogger() 30 | urls = self.clean_urls(urls) 31 | with concurrent.futures.ThreadPoolExecutor( 32 | max_workers = self.nworkers) as ex: 33 | futures = [ex.submit(self.save_image, url['url'], 34 | self.directory/f"{str(i) + url['format']}") for i,\ 35 | url in enumerate(urls)] 36 | for f in progress_bar(concurrent.futures.as_completed(futures), 37 | total=len(urls)): pass 38 | self.verify() 39 | 40 | def save_file(self, file_name, response): 41 | with open(file_name, 'wb') as fh: 42 | for chunk in response.iter_content(1024 * 1024): 43 | fh.write(chunk) 44 | 45 | def save_image(self, url, file_name): 46 | logger = logging.getLogger() 47 | opener = request.build_opener() 48 | user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; \ 49 | rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7' 50 | try: 51 | opener.addheaders = [('User-agent', user_agent)] 52 | request.install_opener(opener) 53 | logger.info(f"[Downloading] {url} [AS FILE] {file_name}") 54 | response = req.get(url, timeout=self.timeout) 55 | self.save_file(file_name, response) 56 | try: 57 | size = int(response.headers['Content-Length'])/self.MBFACTOR 58 | except: 59 | size = 0 60 | except Exception as e: 61 | # logger.info(f"[FAILED] {file_name} - {url} because: \n{e}") 62 | # logger.info(f"[EXCEPTION]:{traceback.print_tb(e.__traceback__)}") 63 | if os.path.isfile(file_name): #Delete if urlretrieve fails 64 | os.remove(file_name) 65 | return #exit function 66 | # logger.info(f"[Downloading] {file_name}") 67 | logger.info(f"[Done] {file_name} with size: {round(size, 3)} MB") 68 | 69 | def clean_urls(self, urls): 70 | formats = [f'.{format}' for format in self.formats] 71 | default = f".{self.default}" 72 | furls = [] 73 | for url in urls: 74 | found = False 75 | for format in formats: 76 | if format in url: 77 | found = True 78 | break 79 | if not found: 80 | continue 81 | furls.append({ 82 | 'url':url, 83 | 'format':format, 84 | }) 85 | return furls 86 | 87 | def verify(self): 88 | print("Verifying download...") 89 | logger = logging.getLogger("Verifying download...") 90 | deleted = 0 91 | for file in os.listdir(self.directory): 92 | if not imghdr.what(self.directory/file): 93 | os.remove(self.directory/file) 94 | deleted += 1 95 | print(f"Deleted {deleted} corrupt/non-image files") 96 | logger = logging.getLogger(f"Deleted {deleted} corrupt/non-image files") 97 | -------------------------------------------------------------------------------- 
/build/lib/datasetscraper/pageagent.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from pyppeteer import launch 3 | import time 4 | from pathlib import Path 5 | import urllib 6 | import urllib.request as request 7 | import os 8 | import logging 9 | import concurrent.futures 10 | import sys 11 | 12 | SANDBOX_ERROR = """No usable sandbox for chrome.\nFor a quick fix try \ 13 | running: `sudo sysctl -w kernel.unprivileged_userns_clone=1` on your linux \ 14 | shell. You need to set up chrome sandbox, for more info check out: \nhttps\ 15 | ://github.com/GoogleChrome/\ 16 | puppeteer/blob/master/docs/troubleshooting.md""" 17 | 18 | class PageAgent(): 19 | def __init__(self, engine, query, max=200, headless=True): 20 | self.engine = engine 21 | self.query = query 22 | self.max = max 23 | self.headless = headless 24 | # More search engine support coming soon, please update the INFO 25 | # dict below if you can. 26 | self.INFO = { 27 | "google":{ 28 | "nclick": 400, 29 | "nscroll": 5, 30 | "urlStart": "https://www.google.com/search?q=", 31 | "urlEnd": "&source=lnms&tbm=isch", 32 | "jClick": "document.getElementById(\"smb\").click();", 33 | "jsFunc": "Array.from(document.querySelectorAll('.rg_di .rg_meta')).\ 34 | map(el=>JSON.parse(el.textContent).ou);" 35 | }, 36 | "yahoo":{ 37 | "nclick": 500, 38 | "nscroll": 10, 39 | "urlStart": "https://images.search.yahoo.com/search/images;?&p=", 40 | "urlEnd": "&ei=UTF-8&iscqry=&fr=sfp", 41 | "jClick": "document.getElementsByClassName(\"ygbt more-res\")[0].click();", 42 | #jsFunc is pretty hard to write here 43 | "jsFunc": "Array.from(document.querySelectorAll('a.iusc')).map(x=>x.attributes.m).map(x=>JSON.parse(x.nodeValue)[\"murl\"]);", 44 | }, 45 | "bing":{ 46 | "nclick": 400, 47 | "nscroll": 10, 48 | "urlStart": "https://www.bing.com/images/search?q=", 49 | "urlEnd": "&source=lnms&tbm=isch", 50 | "jClick": "document.getElementsByClassName(\"btn_seemore cbtn mBtn\")[0].click();", 51 | "jsFunc": "Array.from(document.querySelectorAll('a.iusc')).map(x=>x.attributes.m).map(x=>JSON.parse(x.nodeValue)[\"murl\"]);", 52 | }, 53 | "duckduckgo":{ 54 | "nclick": 500, 55 | "nscroll": 10, 56 | "urlStart": "https://duckduckgo.com/?q=", 57 | "urlEnd": "&t=h_&ia=images&iax=images", 58 | "jClick": "document.getElementsByClassName(\"btn_seemore cbtn mBtn\")[0].click();", 59 | "jsFunc": "Array.from(document.querySelectorAll('img.tile--img__img')).map(x=>x.src).map(x=>JSON.parse(x.nodeValue)[\"murl\"]);", 60 | }, 61 | "baidu":{ 62 | "nclick": 500, 63 | "nscroll": 25, 64 | "urlStart": "https://image.baidu.com/search/index?tn=baiduimage&word=", 65 | "urlEnd": "", 66 | "jClick": "document.getElementsByClassName(\"btn_seemore cbtn mBtn\")[0].click();", 67 | "jsFunc": "Array.from(document.querySelectorAll('li.imgitem')).map(x=>x.dataset.objurl);", 68 | }, 69 | } 70 | 71 | async def scroll(self, page, nscroll): 72 | logger = logging.getLogger() 73 | jscroll = "window.scrollBy(0, document.body.scrollHeight);" 74 | logger.info(f"[SCROLLING PAGE..]") 75 | for x in range(nscroll): 76 | try: 77 | await page.evaluate(jscroll) 78 | await page.waitFor(2000) 79 | except: 80 | logger.info(f"Can't scroll") 81 | return 82 | 83 | async def click_more(self, page, nscroll, jClick, nclicks = 1): 84 | logger = logging.getLogger() 85 | logger.info(f"[NClicks: ] {nclicks}") 86 | if nclicks == 0: 87 | await self.scroll(page, nscroll) 88 | return 89 | 90 | for _ in range(nclicks): 91 | try: 92 | await self.scroll(page, nscroll) 93 | await 
page.evaluate(jClick, force_expr=True), 94 | await self.scroll(page, nscroll) 95 | except: 96 | return 97 | 98 | async def get_list(self): 99 | engine, query, max = self.engine, self.query, self.max 100 | info = self.INFO[engine] 101 | nclick = max // info['nclick'] + 1 102 | if nclick < 2: 103 | nclick = 0 104 | try: 105 | browser = await launch(headless=self.headless) 106 | except Exception as exception: 107 | print(SANDBOX_ERROR) 108 | sys.exit(0) 109 | if self.engine == 'baidu': 110 | nclick = 0 111 | page = await browser.newPage() 112 | await page.setUserAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64)\ 113 | AppleWebKit/537.36 (KHTML, like Gecko) \ 114 | Chrome/66.0.3359.181 Safari/537.36") 115 | url = info['urlStart'] + str(query) + info['urlEnd'] 116 | await page.goto(url, timeout=0) 117 | await self.click_more(page, info['nscroll'], info['jClick'], nclick) 118 | urlList = await page.evaluate(info["jsFunc"], force_expr=True) 119 | await browser.close() 120 | return urlList 121 | -------------------------------------------------------------------------------- /datasetscraper.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: datasetscraper 3 | Version: 0.0.4 4 | Summary: Tool to create image datasets for machine learning problemsby scraping search engines like Google, Bing and Baidu. 5 | Home-page: https://github.com/TimeTraveller-San/datasetscraper 6 | Author: Time Traveller 7 | Author-email: time.traveller.san@gmail.com 8 | License: GPLv2 9 | Description: # DatasetScraper 10 | Tool to create image datasets for machine learning problems by scraping search engines like Google, Bing and Baidu. 11 | 12 | # Features: 13 | - **Search engine support**: Google, Bing, Baidu. (in-production): Yahoo, Yandex, Duckduckgo 14 | - **Image format support**: jpg, png, svg, gif, jpeg 15 | - Fast multiprocessing enabled scraper 16 | - Very fast multithreaded downloader 17 | - Data verification after download for assertion of image files 18 | 19 | # Installation 20 | - COMING SOON on pypi 21 | 22 | # Usage: 23 | - Import 24 | `from datasetscraper import Scraper` 25 | 26 | - Defaults 27 | ```python 28 | obj = Scraper() 29 | urls = obj.fetch_urls('kiniro mosaic') 30 | obj.download(urls, directory='kiniro_mosaic/') 31 | ``` 32 | 33 | - Specify a search engine 34 | ```python 35 | obj = Scraper() 36 | urls = obj.fetch_urls('kiniro mosaic', engine=['google']) 37 | obj.download(urls, directory='kiniro_mosaic/') 38 | ``` 39 | 40 | - Specify a list of search engines 41 | ```python 42 | obj = Scraper() 43 | urls = obj.fetch_urls('kiniro mosaic', engine=['google', 'bing']) 44 | obj.download(urls, directory='kiniro_mosaic/') 45 | ``` 46 | 47 | - Specify max images (default was 200) 48 | ```python 49 | obj = Scraper() 50 | urls = obj.fetch_urls('kiniro mosaic', engine=['google', 'bing'], maxlist=[500, 300]) 51 | obj.download(urls, directory='kiniro_mosaic/') 52 | ``` 53 | 54 | # FAQs 55 | - Why aren't yandex, yahoo, duckduckgo and other search engines supported? 56 | They are hard to scrape, I am working on them and will update as soon as I can. 57 | 58 | - I set maxlist=[500] why are only (x<500) images downloaded? 59 | There can be several reasons for this: 60 | - Search ran out: This happens very often, google/bing might not have enough images for your query 61 | - Slow internet: Increase the timeout (default is 60 seconds) as follows: ```obj.download(urls, directory='kiniro_mosaic/', timeout=100)``` 62 | 63 | - How to debug? 
64 | You can change the logging level while making the scraper object : `obj = Scraper(logger.INFO)` 65 | 66 | # TODO: 67 | - More search engines 68 | - Better debug 69 | - Write documentation 70 | - Text data? Audio data? 71 | 72 | Keywords: dataset_scraper,machine learning,dataset,images,scrape,yandex,google,bing,baidu 73 | Platform: UNKNOWN 74 | Classifier: Programming Language :: Python :: 3 75 | Classifier: Operating System :: POSIX :: Linux 76 | Requires-Python: >=3 77 | Description-Content-Type: text/markdown 78 | -------------------------------------------------------------------------------- /datasetscraper.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | datasetscraper/Scraper.py 4 | datasetscraper/__init__.py 5 | datasetscraper/downloader.py 6 | datasetscraper/pageagent.py 7 | datasetscraper.egg-info/PKG-INFO 8 | datasetscraper.egg-info/SOURCES.txt 9 | datasetscraper.egg-info/dependency_links.txt 10 | datasetscraper.egg-info/requires.txt 11 | datasetscraper.egg-info/top_level.txt -------------------------------------------------------------------------------- /datasetscraper.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /datasetscraper.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | pyppeteer>=0.0.25 2 | fastprogress>=0.1.21 3 | requests>=2.19.1 4 | -------------------------------------------------------------------------------- /datasetscraper.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | datasetscraper 2 | -------------------------------------------------------------------------------- /datasetscraper/Scraper.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from pyppeteer import launch 3 | import time 4 | from pathlib import Path 5 | import urllib 6 | import urllib.request as request 7 | import os 8 | import logging 9 | import concurrent.futures 10 | import sys 11 | from .downloader import Downloader 12 | from .pageagent import PageAgent 13 | 14 | class Scraper(): 15 | def __init__(self, logLevel = logging.WARNING, headless=True): 16 | self.configure_logging(logLevel) 17 | self.patch_pyppeteer() 18 | self.headless = headless 19 | 20 | def patch_pyppeteer(self): 21 | import pyppeteer.connection 22 | original_method = pyppeteer.connection.websockets.client.connect 23 | def new_method(*args, **kwargs): 24 | kwargs['ping_interval'] = None 25 | kwargs['ping_timeout'] = None 26 | return original_method(*args, **kwargs) 27 | 28 | pyppeteer.connection.websockets.client.connect = new_method 29 | 30 | def configure_logging(self, logLevel): 31 | logger = logging.getLogger() 32 | pyppeteer_logger = logging.getLogger('pyppeteer') 33 | logger.setLevel(logLevel) 34 | handler = logging.StreamHandler() 35 | handler.setFormatter( 36 | logging.Formatter('[%(asctime)s %(levelname)s \ 37 | %(module)s]: %(message)s')) 38 | logger.addHandler(handler) 39 | return logger 40 | 41 | def fetch_urls(self, query, engine='google', maxlist=[200], format='jpg'): 42 | logger = logging.getLogger() 43 | print("Fetching URLs..") 44 | if type(engine) is not list: engine = [ engine ] 45 | if type(format) is not list: format = [ format ] 46 | if type(maxlist) is not list: maxlist = [ maxlist for e in 
engine ] 47 | assert len(maxlist) == len(engine), f"Length of max ({len(maxlist)}) not same as engine ({len(engine)})" 48 | self.engine, self.format = engine, format 49 | urlDict, totalLen = {}, 0 50 | for e, max in zip(engine, maxlist): 51 | logger.info(f"Fetching URLs from {e}") 52 | urlDict[e] = asyncio.get_event_loop().run_until_complete( 53 | self.launch_engine(e, query, max)) 54 | totalLen += len(urlDict[e]) 55 | logger.info(f"Fetched {len(urlDict[e])} URLs from {e}") 56 | print(f"Fetched {len(urlDict[e])} URLs from {e}") 57 | logger = logging.getLogger() 58 | logger.info(f"Total Number of URLs fetched: {totalLen}") 59 | urlSaved = self.mixUrls(urlDict, maxlist) 60 | logger.info(f"Number of URLs saving: {len(urlSaved)}") 61 | print(f"{len(urlSaved)} URLs fetched") 62 | return urlSaved 63 | 64 | def mixUrls(self, urlDict, maxlist): 65 | urls = [] 66 | for key, max in zip(urlDict, maxlist): 67 | urlDict[key] = urlDict[key][:max] 68 | for url in urlDict[key]: 69 | if url not in urls: 70 | urls.append(url) 71 | return urls 72 | 73 | def download(self, urls, directory='images/', 74 | formats=['jpg', 'png', 'jpeg'], default='jpg', 75 | nworkers=10, timeout=30): 76 | downloader = Downloader(directory, formats, default, 77 | nworkers, timeout) 78 | downloader.download(urls) 79 | 80 | async def launch_engine(self, engine, query, max=200): 81 | agent = PageAgent(engine, query, max, self.headless) 82 | return await agent.get_list() 83 | -------------------------------------------------------------------------------- /datasetscraper/__init__.py: -------------------------------------------------------------------------------- 1 | name = "datasetscraper" 2 | -------------------------------------------------------------------------------- /datasetscraper/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibkumr/DatasetScraper/099420240309fb29b40623a2400b3dd769bf49e0/datasetscraper/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /datasetscraper/__pycache__/downloader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibkumr/DatasetScraper/099420240309fb29b40623a2400b3dd769bf49e0/datasetscraper/__pycache__/downloader.cpython-36.pyc -------------------------------------------------------------------------------- /datasetscraper/__pycache__/pageagent.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibkumr/DatasetScraper/099420240309fb29b40623a2400b3dd769bf49e0/datasetscraper/__pycache__/pageagent.cpython-36.pyc -------------------------------------------------------------------------------- /datasetscraper/__pycache__/scraper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibkumr/DatasetScraper/099420240309fb29b40623a2400b3dd769bf49e0/datasetscraper/__pycache__/scraper.cpython-36.pyc -------------------------------------------------------------------------------- /datasetscraper/downloader.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from pyppeteer import launch 3 | import time 4 | from pathlib import Path 5 | import urllib 6 | import urllib.request as request 7 | import os 8 | import logging 9 | import concurrent.futures 10 
| import sys 11 | import requests as req 12 | import traceback 13 | from multiprocessing.pool import ThreadPool 14 | from fastprogress import progress_bar 15 | import imghdr 16 | 17 | class Downloader(): 18 | def __init__(self, directory='images/', formats=['jpg', 'png', 'jpeg'], 19 | default='jpg', nworkers=10, timeout=60): 20 | self.formats, self.default = formats, default 21 | self.directory = Path(directory) 22 | self.MBFACTOR = float(1 << 20) 23 | self.nworkers, self.timeout = nworkers, timeout 24 | if not os.path.exists(directory): 25 | os.makedirs(directory) 26 | 27 | def download(self, urls, timeout=200): 28 | self.timeout = timeout 29 | logger = logging.getLogger() 30 | urls = self.clean_urls(urls) 31 | with concurrent.futures.ThreadPoolExecutor( 32 | max_workers = self.nworkers) as ex: 33 | futures = [ex.submit(self.save_image, url['url'], 34 | self.directory/f"{str(i) + url['format']}") for i,\ 35 | url in enumerate(urls)] 36 | for f in progress_bar(concurrent.futures.as_completed(futures), 37 | total=len(urls)): pass 38 | self.verify() 39 | 40 | def save_file(self, file_name, response): 41 | with open(file_name, 'wb') as fh: 42 | for chunk in response.iter_content(1024 * 1024): 43 | fh.write(chunk) 44 | 45 | def save_image(self, url, file_name): 46 | logger = logging.getLogger() 47 | opener = request.build_opener() 48 | user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; \ 49 | rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7' 50 | try: 51 | opener.addheaders = [('User-agent', user_agent)] 52 | request.install_opener(opener) 53 | logger.info(f"[Downloading] {url} [AS FILE] {file_name}") 54 | response = req.get(url, timeout=self.timeout) 55 | self.save_file(file_name, response) 56 | try: 57 | size = int(response.headers['Content-Length'])/self.MBFACTOR 58 | except: 59 | size = 0 60 | except Exception as e: 61 | # logger.info(f"[FAILED] {file_name} - {url} because: \n{e}") 62 | # logger.info(f"[EXCEPTION]:{traceback.print_tb(e.__traceback__)}") 63 | if os.path.isfile(file_name): #Delete if urlretrieve fails 64 | os.remove(file_name) 65 | return #exit function 66 | # logger.info(f"[Downloading] {file_name}") 67 | logger.info(f"[Done] {file_name} with size: {round(size, 3)} MB") 68 | 69 | def clean_urls(self, urls): 70 | formats = [f'.{format}' for format in self.formats] 71 | default = f".{self.default}" 72 | furls = [] 73 | for url in urls: 74 | found = False 75 | for format in formats: 76 | if format in url: 77 | found = True 78 | break 79 | if not found: 80 | continue 81 | furls.append({ 82 | 'url':url, 83 | 'format':format, 84 | }) 85 | return furls 86 | 87 | def verify(self): 88 | print("Verifying download...") 89 | logger = logging.getLogger("Verifying download...") 90 | deleted = 0 91 | for file in os.listdir(self.directory): 92 | if not imghdr.what(self.directory/file): 93 | os.remove(self.directory/file) 94 | deleted += 1 95 | print(f"Deleted {deleted} corrupt/non-image files") 96 | logger = logging.getLogger(f"Deleted {deleted} corrupt/non-image files") 97 | -------------------------------------------------------------------------------- /datasetscraper/pageagent.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from pyppeteer import launch 3 | import time 4 | from pathlib import Path 5 | import urllib 6 | import urllib.request as request 7 | import os 8 | import logging 9 | import concurrent.futures 10 | import sys 11 | 12 | SANDBOX_ERROR = """No usable sandbox for chrome.\nFor a quick fix try \ 13 | running: 
`sudo sysctl -w kernel.unprivileged_userns_clone=1` on your linux \ 14 | shell. You need to set up chrome sandbox, for more info check out: \nhttps\ 15 | ://github.com/GoogleChrome/\ 16 | puppeteer/blob/master/docs/troubleshooting.md""" 17 | 18 | class PageAgent(): 19 | def __init__(self, engine, query, max=200, headless=True): 20 | self.engine = engine 21 | self.query = query 22 | self.max = max 23 | self.headless = headless 24 | # More search engine support coming soon, please update the INFO 25 | # dict below if you can. 26 | self.INFO = { 27 | "google":{ 28 | "nclick": 400, 29 | "nscroll": 5, 30 | "urlStart": "https://www.google.com/search?q=", 31 | "urlEnd": "&source=lnms&tbm=isch", 32 | "jClick": "document.getElementById(\"smb\").click();", 33 | "jsFunc": "Array.from(document.querySelectorAll('.rg_di .rg_meta')).\ 34 | map(el=>JSON.parse(el.textContent).ou);" 35 | }, 36 | "yahoo":{ 37 | "nclick": 500, 38 | "nscroll": 10, 39 | "urlStart": "https://images.search.yahoo.com/search/images;?&p=", 40 | "urlEnd": "&ei=UTF-8&iscqry=&fr=sfp", 41 | "jClick": "document.getElementsByClassName(\"ygbt more-res\")[0].click();", 42 | #jsFunc is pretty hard to write here 43 | "jsFunc": "Array.from(document.querySelectorAll('a.iusc')).map(x=>x.attributes.m).map(x=>JSON.parse(x.nodeValue)[\"murl\"]);", 44 | }, 45 | "bing":{ 46 | "nclick": 400, 47 | "nscroll": 10, 48 | "urlStart": "https://www.bing.com/images/search?q=", 49 | "urlEnd": "&source=lnms&tbm=isch", 50 | "jClick": "document.getElementsByClassName(\"btn_seemore cbtn mBtn\")[0].click();", 51 | "jsFunc": "Array.from(document.querySelectorAll('a.iusc')).map(x=>x.attributes.m).map(x=>JSON.parse(x.nodeValue)[\"murl\"]);", 52 | }, 53 | "duckduckgo":{ 54 | "nclick": 500, 55 | "nscroll": 10, 56 | "urlStart": "https://duckduckgo.com/?q=", 57 | "urlEnd": "&t=h_&ia=images&iax=images", 58 | "jClick": "document.getElementsByClassName(\"btn_seemore cbtn mBtn\")[0].click();", 59 | "jsFunc": "Array.from(document.querySelectorAll('img.tile--img__img')).map(x=>x.src).map(x=>JSON.parse(x.nodeValue)[\"murl\"]);", 60 | }, 61 | "baidu":{ 62 | "nclick": 500, 63 | "nscroll": 25, 64 | "urlStart": "https://image.baidu.com/search/index?tn=baiduimage&word=", 65 | "urlEnd": "", 66 | "jClick": "document.getElementsByClassName(\"btn_seemore cbtn mBtn\")[0].click();", 67 | "jsFunc": "Array.from(document.querySelectorAll('li.imgitem')).map(x=>x.dataset.objurl);", 68 | }, 69 | } 70 | 71 | async def scroll(self, page, nscroll): 72 | logger = logging.getLogger() 73 | jscroll = "window.scrollBy(0, document.body.scrollHeight);" 74 | logger.info(f"[SCROLLING PAGE..]") 75 | for x in range(nscroll): 76 | try: 77 | await page.evaluate(jscroll) 78 | await page.waitFor(2000) 79 | except: 80 | logger.info(f"Can't scroll") 81 | return 82 | 83 | async def click_more(self, page, nscroll, jClick, nclicks = 1): 84 | logger = logging.getLogger() 85 | logger.info(f"[NClicks: ] {nclicks}") 86 | if nclicks == 0: 87 | await self.scroll(page, nscroll) 88 | return 89 | 90 | for _ in range(nclicks): 91 | try: 92 | await self.scroll(page, nscroll) 93 | await page.evaluate(jClick, force_expr=True), 94 | await self.scroll(page, nscroll) 95 | except: 96 | return 97 | 98 | async def get_list(self): 99 | engine, query, max = self.engine, self.query, self.max 100 | info = self.INFO[engine] 101 | nclick = max // info['nclick'] + 1 102 | if nclick < 2: 103 | nclick = 0 104 | try: 105 | browser = await launch(headless=self.headless) 106 | except Exception as exception: 107 | print(SANDBOX_ERROR) 108 | 
sys.exit(0) 109 | if self.engine == 'baidu': 110 | nclick = 0 111 | page = await browser.newPage() 112 | await page.setUserAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64)\ 113 | AppleWebKit/537.36 (KHTML, like Gecko) \ 114 | Chrome/66.0.3359.181 Safari/537.36") 115 | url = info['urlStart'] + str(query) + info['urlEnd'] 116 | await page.goto(url, timeout=0) 117 | await self.click_more(page, info['nscroll'], info['jClick'], nclick) 118 | urlList = await page.evaluate(info["jsFunc"], force_expr=True) 119 | await browser.close() 120 | return urlList 121 | -------------------------------------------------------------------------------- /dist/datasetscraper-0.0.4-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibkumr/DatasetScraper/099420240309fb29b40623a2400b3dd769bf49e0/dist/datasetscraper-0.0.4-py3-none-any.whl -------------------------------------------------------------------------------- /dist/datasetscraper-0.0.4.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaibkumr/DatasetScraper/099420240309fb29b40623a2400b3dd769bf49e0/dist/datasetscraper-0.0.4.tar.gz -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | # reqs = (line.strip() for line in open("requirements.txt")) 4 | LONG_DESC = open('README.md').read() 5 | setuptools.setup( 6 | name="datasetscraper", 7 | version="0.0.4", 8 | author="Time Traveller", 9 | author_email="time.traveller.san@gmail.com", 10 | keywords = ['dataset_scraper', 'machine learning', 'dataset', 'images', 'scrape', 11 | 'yandex', 'google', 'bing', 'baidu'], 12 | description="Tool to create image datasets for machine learning problems\ 13 | by scraping search engines like Google, Bing and Baidu. ", 14 | long_description=LONG_DESC, 15 | long_description_content_type="text/markdown", 16 | url="https://github.com/TimeTraveller-San/datasetscraper", 17 | license="GPLv2", 18 | packages=setuptools.find_packages(), 19 | install_requires=[ 20 | 'pyppeteer>=0.0.25', 21 | 'fastprogress>=0.1.21', 22 | 'requests>=2.19.1', 23 | ], 24 | python_requires='>=3', 25 | classifiers=[ 26 | "Programming Language :: Python :: 3", 27 | "Operating System :: POSIX :: Linux", 28 | ], 29 | ) 30 | --------------------------------------------------------------------------------