#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#Richard's html2csv converter
#rijard.barnes@gmail.com
#
"""Convert every HTML table in a document to CSV on standard output.

The script reads an HTML file (or standard input when no filename is
given) and writes the rows of each table it finds to standard output as
CSV.  Every table is followed by a ``###`` separator line so the
individual tables can be split apart afterwards.

Examples
--------

    ./html2csv.py webpage.html

    cat webpage.html | ./html2csv.py

Credits
-------

Fernando Ferreira provided changes that updated the script to Python3.
"""

import argparse
import csv
import fileinput
import sys

from bs4 import BeautifulSoup


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with ``delimiter``, ``quotechar`` and
        ``filename`` (``None`` when the input comes from standard input).
    """
    parser = argparse.ArgumentParser(description='Reads in an HTML and attempts to convert all tables into CSV files.')
    parser.add_argument('--delimiter', '-d', action='store', default=',', help="Character with which to separate CSV columns")
    parser.add_argument('--quotechar', '-q', action='store', default='"', help="Character within which to nest CSV text")
    parser.add_argument('filename', nargs="?", help="HTML file from which to extract tables")
    return parser.parse_args(argv)


def read_html(filename):
    """Return the complete HTML text of *filename*.

    Args:
        filename: Path of the HTML file, or ``None`` to read standard input.

    Returns:
        The document as a single string, byte-for-byte as it was read.
    """
    files = (filename,) if filename is not None else ('-',)
    with fileinput.input(files=files) as lines:
        # fileinput yields lines with their trailing newline intact, so they
        # must be joined with the empty string; joining with "\n" would
        # double every line break in the document.
        return "".join(lines)


def write_tables(soup, out, delimiter, quotechar):
    """Write every table in *soup* to *out* as CSV.

    Rows without any ``td``/``th`` cell are skipped.  Cell text is stripped
    of leading and trailing whitespace.  A ``###`` separator is printed
    after each table.

    Args:
        soup: Parsed document to search for tables.
        out: Text stream receiving the CSV rows and separators.
        delimiter: Column separator passed to ``csv.writer``.
        quotechar: Quote character passed to ``csv.writer``.
    """
    # One writer serves all tables; its settings never change between them.
    writer = csv.writer(out, delimiter=delimiter, quotechar=quotechar, quoting=csv.QUOTE_MINIMAL)
    for table in soup.find_all("table"):
        # NOTE(review): find_all searches recursively, so the rows of a table
        # nested inside this one are emitted here and again when the nested
        # table itself is visited.  Kept as-is to preserve existing output.
        for row in table.find_all("tr"):
            cells = row.find_all(["td", "th"])
            if cells:
                writer.writerow([cell.get_text().strip() for cell in cells])

        print("\n\n###", file=out)


def main(argv=None):
    """Run the converter: parse arguments, read the HTML, print the tables."""
    args = parse_args(argv)
    soup = BeautifulSoup(read_html(args.filename), "html.parser")

    #Preemptively removing unnecessary tags
    for script in soup("script"):
        script.extract()

    write_tables(soup, sys.stdout, args.delimiter, args.quotechar)


if __name__ == "__main__":
    main()