├── README.md
└── html2csv.py
/README.md:
--------------------------------------------------------------------------------
1 | html2csv.py
2 | ===================================
3 |
4 | This handy script reads an HTML file (or standard input) and prints each of
5 | the HTML tables therein to standard output in CSV format.
6 |
7 | Examples
8 | --------
9 |
10 | ./html2csv.py webpage.html
11 |
12 | cat webpage.html | ./html2csv.py
13 |
14 | Credits
15 | -------
16 |
17 | Fernando Ferreira provided changes that updated the script to Python 3.
--------------------------------------------------------------------------------
/html2csv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #Richard's html2csv converter
4 | #rijard.barnes@gmail.com
5 | #
6 |
7 | from bs4 import BeautifulSoup
8 | import argparse
9 | import csv
10 | import fileinput
11 | import sys
12 |
def parse_args(argv=None):
    """Parse the command-line options of the converter.

    argv -- list of argument strings to parse; None (the default) lets
            argparse fall back to sys.argv[1:].

    Returns the argparse namespace with `delimiter`, `quotechar` and
    `filename` (None when no file was named on the command line).
    """
    parser = argparse.ArgumentParser(description='Reads in an HTML and attempts to convert all tables into CSV files.')
    parser.add_argument('--delimiter', '-d', action='store', default=',', help="Character with which to separate CSV columns")
    parser.add_argument('--quotechar', '-q', action='store', default='"', help="Character within which to nest CSV text")
    parser.add_argument('filename', nargs="?", help="HTML file from which to extract tables")
    return parser.parse_args(argv)


def read_html(filename=None):
    """Return the complete HTML document as a single string.

    filename -- path of the file to read; None means standard input
                (passed to fileinput as '-').
    """
    source = '-' if filename is None else filename
    with fileinput.input(files=(source, )) as lines:
        # fileinput yields every line with its line ending still attached,
        # so the pieces are concatenated as-is. Joining them with "\n"
        # would double each newline and alter multi-line cell text.
        return "".join(lines)


def write_tables(soup, out, delimiter=',', quotechar='"'):
    """Write every <table> found in `soup` to `out` as CSV.

    soup      -- parsed BeautifulSoup document; its <script> tags are
                 removed in place before the tables are read.
    out       -- writable text stream receiving the CSV rows.
    delimiter -- column separator handed to csv.writer.
    quotechar -- quoting character handed to csv.writer.

    Rows without any <td>/<th> cell are skipped. Each table is followed by
    the marker produced by print("\n\n###").
    """
    #Preemptively removing unnecessary tags
    for script in soup('script'):
        script.extract()

    # One writer serves all tables; its settings never change between them.
    writer = csv.writer(out, delimiter=delimiter, quotechar=quotechar, quoting=csv.QUOTE_MINIMAL)

    for table in soup.find_all("table"):
        # NOTE(review): find_all searches recursively, so the rows of a
        # nested table are emitted with the outer table and once more when
        # the nested table itself is visited -- confirm this is intended.
        for row in table.find_all('tr'):
            cells = row.find_all(['td', 'th'])
            if cells:
                writer.writerow([cell.get_text().strip() for cell in cells])

        print("\n\n###", file=out)


def main(argv=None):
    """Convert the HTML named on the command line (or stdin) to CSV on stdout."""
    args = parse_args(argv)
    soup = BeautifulSoup(read_html(args.filename), "html.parser")
    write_tables(soup, sys.stdout, delimiter=args.delimiter, quotechar=args.quotechar)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------