├── .gitignore
├── README.md
├── helpers.py
├── index.py
├── requirements
└── test_websites

/.gitignore:
--------------------------------------------------------------------------------
results
#### joe made this: http://goel.io/joe
#### python ####
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
web-scraper
===========

A simple script that scrapes a website, extracting text into a CSV file with the format below and saving images.

| Page      | Tag                              | Text         | Link              | Image                  |
|-----------|----------------------------------|--------------|-------------------|------------------------|
| page path | element tag (h{1,6}, a, p, etc.) | text content | link URL (if any) | image address (if any) |

## Usage
First, install the dependencies (Python 3):

```
pip install -r requirements
```

Then create a file containing the URLs of the websites you want to scrape, one per line, for example (I'll call this file `test_websites`):

```
https://theread.me
https://theguardian.com
```

Now you are ready to run the script:

```
python index.py test_websites
# ^ path to your file
```

After the script is done with its job, you can find the results in the `results/` folder.

To see the available options, try `python index.py -h`.
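
The flags below are the ones defined in `index.py` (`--depth`, `--delay`, `--no-image`); the values are only illustrative. A typical run that limits crawl depth, throttles requests, and skips image downloads might look like this:

```
python index.py test_websites --depth 2 --delay 1.5 --no-image
```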

--------------------------------------------------------------------------------
/helpers.py:
--------------------------------------------------------------------------------
from urllib.parse import urlparse, urljoin
import os
from csv import DictWriter

def to_absolute(url, host):
    """Resolve a (possibly relative or protocol-relative) URL against the host.

    Returns a ParseResult, or None for non-http(s) schemes (mailto:, data:, ...).
    """
    if url == '/': return urlparse(host)

    # protocol-relative URL: borrow the host's scheme
    if url[0:2] == '//':
        url = urlparse(host).scheme + ':' + url

    p = urlparse(url)
    if p.scheme not in ['http', 'https', '']: return None

    # relative URL: resolve against the host
    if not p.netloc:
        p = urlparse(urljoin(host, url))

    return p

def write_results(main, data, first=False):
    # create (or append to) results/<domain>/texts.csv
    with open(os.path.join('results', main.netloc, 'texts.csv'), 'w' if first else 'a', newline='') as f:
        w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])

        if first:
            w.writeheader()

        w.writerows(data)
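
For clarity, a small usage sketch of `to_absolute` (the host and URLs below are made up for illustration, they are not from the repository):

```python
from helpers import to_absolute

host = 'https://example.com'  # illustrative base URL

to_absolute('/about', host).geturl()                   # 'https://example.com/about'
to_absolute('//cdn.example.com/a.png', host).geturl()  # 'https://cdn.example.com/a.png'
to_absolute('mailto:someone@example.com', host)        # None -- non-http(s) scheme is rejected
```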

--------------------------------------------------------------------------------
/index.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import os
import shutil
import argparse
from urllib.parse import urlparse
from helpers import to_absolute, write_results
import time

# elements whose text, links and images we want to record
tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img']

parser = argparse.ArgumentParser(description='Web scraper')
parser.add_argument('file', help='path to a file containing target websites, one per line')
parser.add_argument('--depth', type=int, help='how deep the scraper should follow links')
parser.add_argument('--no-image', help='do not download images', action='store_true')
parser.add_argument('--delay', type=float, help='delay between requests in seconds, use to avoid being treated as an attacker')

args = parser.parse_args()

path = os.path.join(os.path.dirname(__file__), args.file)
with open(path) as f:
    sites = [line.strip() for line in f if line.strip()]

for host in sites:
    visited = []  # URLs already scraped for this site
    queue = []    # same-domain links discovered but not yet scraped

    main = urlparse(host)
    base_dir = os.path.join('results', main.netloc)
    images_dir = os.path.join(base_dir, 'images')

    os.makedirs(base_dir, exist_ok=True)
    os.makedirs(images_dir, exist_ok=True)

    def scrape(url, depth=0):
        data = []
        if args.depth is not None and depth > args.depth: return

        t = url.geturl()

        if t in visited: return

        print(t)

        html = requests.get(t).text
        visited.append(t)

        soup = BeautifulSoup(html, 'html.parser')
        elements = soup.find_all(tags)

        for el in elements:
            href = el.get('href')

            # skip elements that carry no text, link or image
            if not href and not el.string and el.name != 'img': continue

            record = {
                'page': url.path,
                'tag': el.name,
                'text': el.string,
                'link': href,
                'image': el.get('src') if el.name == 'img' else None
            }

            if not args.no_image and el.name == 'img' and el.get('src'):
                p = to_absolute(el.get('src'), host)

                if p is not None:
                    filepath = os.path.join(images_dir, os.path.basename(p.path))

                    if not os.path.exists(filepath):
                        response = requests.get(p.geturl(), stream=True)
                        with open(filepath, 'wb') as out_file:
                            shutil.copyfileobj(response.raw, out_file)
                        del response

            data.append(record)

            # queue same-domain links for the next round
            if href and href != '/':
                p = to_absolute(href, host)
                if p and p.netloc == main.netloc and p.geturl() not in visited:
                    queue.insert(0, p)

        write_results(main, data, first=depth == 0)

        # follow queued links, optionally pausing between requests
        while queue:
            link = queue.pop(0)
            if args.delay is not None:
                time.sleep(args.delay)
            scrape(link, depth=depth + 1)

    scrape(main)

--------------------------------------------------------------------------------
/requirements:
--------------------------------------------------------------------------------
appdirs==1.4.3
beautifulsoup4==4.5.3
packaging==16.8
pyparsing==2.2.0
requests==2.13.0
six==1.10.0

--------------------------------------------------------------------------------
/test_websites:
--------------------------------------------------------------------------------
https://www.theguardian.com/international
https://theread.me

--------------------------------------------------------------------------------