├── .gitignore
├── README.md
├── helpers.py
├── index.py
├── requirements
└── test_websites

/.gitignore:
--------------------------------------------------------------------------------
results
#### joe made this: http://goel.io/joe
#### python ####
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
web-scraper
===========

A simple script that scrapes a website, extracting text into a CSV file with the format below and saving images.

| Page      | Tag                              | Text         | Link              | Image                  |
|-----------|----------------------------------|--------------|-------------------|------------------------|
| page path | element tag (h{1,6}, a, p, etc.) | text content | link URL (if any) | image address (if any) |

## Usage
First, install the dependencies (Python 3):

```
pip install -r requirements
```

Then create a file containing the URLs of the websites you want to scrape, one per line, for example (I'll call this file `test_websites`):

```
https://theread.me
https://theguardian.com
```

Now you are ready to run the script:

```
python index.py test_websites
# ^ path to your file
```

After the script is done with its job, you can find the results in the `results/` folder.

To see the available options, try `python index.py -h`.
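
The flags below are the ones defined in `index.py` (`--depth`, `--delay`, `--no-image`); the values are only illustrative. A typical run that limits crawl depth, throttles requests, and skips image downloads might look like this:

```
python index.py test_websites --depth 2 --delay 1.5 --no-image
```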

--------------------------------------------------------------------------------
/helpers.py:
--------------------------------------------------------------------------------
from urllib.parse import urlparse, urljoin
import os
from csv import DictWriter

def to_absolute(url, host):
    """Resolve a (possibly relative or protocol-relative) URL against the host.

    Returns a ParseResult, or None for non-http(s) schemes (mailto:, data:, ...).
    """
    if url == '/': return urlparse(host)

    # protocol-relative URL: borrow the host's scheme
    if url[0:2] == '//':
        url = urlparse(host).scheme + ':' + url

    p = urlparse(url)
    if p.scheme not in ['http', 'https', '']: return None

    # relative URL: resolve against the host
    if not p.netloc:
        p = urlparse(urljoin(host, url))

    return p

def write_results(main, data, first=False):
    # create (or append to) results/<domain>/texts.csv
    with open(os.path.join('results', main.netloc, 'texts.csv'), 'w' if first else 'a', newline='') as f:
        w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])

        if first:
            w.writeheader()

        w.writerows(data)
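
For clarity, a small usage sketch of `to_absolute` (the host and URLs below are made up for illustration, they are not from the repository):

```python
from helpers import to_absolute

host = 'https://example.com'  # illustrative base URL

to_absolute('/about', host).geturl()                   # 'https://example.com/about'
to_absolute('//cdn.example.com/a.png', host).geturl()  # 'https://cdn.example.com/a.png'
to_absolute('mailto:someone@example.com', host)        # None -- non-http(s) scheme is rejected
```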

--------------------------------------------------------------------------------
/index.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import os
import shutil
import argparse
from urllib.parse import urlparse
from helpers import to_absolute, write_results
import time

# elements whose text, links and images we want to record
tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img']

parser = argparse.ArgumentParser(description='Web scraper')
parser.add_argument('file', help='path to a file containing target websites, one per line')
parser.add_argument('--depth', type=int, help='how deep the scraper should follow links')
parser.add_argument('--no-image', help='do not download images', action='store_true')
parser.add_argument('--delay', type=float, help='delay between requests in seconds, use to avoid being treated as an attacker')

args = parser.parse_args()

path = os.path.join(os.path.dirname(__file__), args.file)
with open(path) as f:
    sites = [line.strip() for line in f if line.strip()]

for host in sites:
    visited = []  # URLs already scraped for this site
    queue = []    # same-domain links discovered but not yet scraped

    main = urlparse(host)
    base_dir = os.path.join('results', main.netloc)
    images_dir = os.path.join(base_dir, 'images')

    os.makedirs(base_dir, exist_ok=True)
    os.makedirs(images_dir, exist_ok=True)

    def scrape(url, depth=0):
        data = []
        if args.depth is not None and depth > args.depth: return

        t = url.geturl()

        if t in visited: return

        print(t)

        html = requests.get(t).text
        visited.append(t)

        soup = BeautifulSoup(html, 'html.parser')
        elements = soup.find_all(tags)

        for el in elements:
            href = el.get('href')

            # skip elements that carry no text, link or image
            if not href and not el.string and el.name != 'img': continue

            record = {
                'page': url.path,
                'tag': el.name,
                'text': el.string,
                'link': href,
                'image': el.get('src') if el.name == 'img' else None
            }

            if not args.no_image and el.name == 'img' and el.get('src'):
                p = to_absolute(el.get('src'), host)

                if p is not None:
                    filepath = os.path.join(images_dir, os.path.basename(p.path))

                    if not os.path.exists(filepath):
                        response = requests.get(p.geturl(), stream=True)
                        with open(filepath, 'wb') as out_file:
                            shutil.copyfileobj(response.raw, out_file)
                        del response

            data.append(record)

            # queue same-domain links for the next round
            if href and href != '/':
                p = to_absolute(href, host)
                if p and p.netloc == main.netloc and p.geturl() not in visited:
                    queue.insert(0, p)

        write_results(main, data, first=depth == 0)

        # follow queued links, optionally pausing between requests
        while queue:
            link = queue.pop(0)
            if args.delay is not None:
                time.sleep(args.delay)
            scrape(link, depth=depth + 1)

    scrape(main)

--------------------------------------------------------------------------------
/requirements:
--------------------------------------------------------------------------------
appdirs==1.4.3
beautifulsoup4==4.5.3
packaging==16.8
pyparsing==2.2.0
requests==2.13.0
six==1.10.0

--------------------------------------------------------------------------------
/test_websites:
--------------------------------------------------------------------------------
https://www.theguardian.com/international
https://theread.me

--------------------------------------------------------------------------------