├── .gitignore ├── Makefile ├── README.md ├── crank ├── s ├── index.md ├── java.md ├── perl.md ├── php.md ├── python.md └── regexes.md ├── static ├── css │ └── htmlparsing.css ├── img │ ├── background.png │ ├── container_bg.png │ ├── footer_bg.png │ ├── nav_bg.png │ ├── nav_bgbottom.png │ ├── navmenu_bg.png │ ├── top_bg.png │ └── toptop_bg.png └── robots.txt ├── t └── html.t └── tt ├── header.tt ├── page.ttml └── sidebar.tt /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: \ 2 | crank \ 3 | clean 4 | 5 | BUILD=build 6 | SOURCE=s 7 | 8 | default: crank 9 | 10 | clean: 11 | rm -fr $(BUILD) 12 | 13 | crank: clean 14 | mkdir -p $(BUILD)/ || true > /dev/null 2>&1 15 | ./crank --sourcepath=$(SOURCE) --buildpath=$(BUILD) 16 | cp -R static/* $(BUILD)/ 17 | 18 | test: crank 19 | prove t/html.t 20 | 21 | # This is only useful for Andy 22 | rsync: crank 23 | rsync -azu -e ssh --delete --verbose \ 24 | $(BUILD)/ andy@alex.petdance.com:/srv/htmlparsing 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This project is the source code for http://htmlparsing.com, plus the 2 | Perl code that converts it from Markdown format into HTML and uploads 3 | it to the server. 4 | 5 | Repository layout 6 | ----------------- 7 | 8 | s page bodies in Markdown format 9 | tt templates in Template::Toolkit format 10 | static images and styles 11 | t tests 12 | build output 13 | 14 | Requirements 15 | ------------ 16 | 17 | Perl and additional CPAN modules. 
18 | 19 | For building: 20 | 21 | File::Slurp 22 | Template 23 | Text::Markdown 24 | 25 | For testing: 26 | 27 | Test::HTML::Tidy5 28 | 29 | Contributing page content 30 | ------------------------- 31 | 32 | 1. Modify templates or page bodies. New pages have to be registered in the file `crank`. 33 | 2. Run `make` to build the site and inspect the result in the `build` directory. 34 | 3. Run `make test` to check for HTML errors. 35 | 4. Commit/publish changes, see `s/index.md`. 36 | -------------------------------------------------------------------------------- /crank: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | eval 'use Carp::Always'; # Not everyone has it 7 | 8 | use Getopt::Long; 9 | use File::Slurp; 10 | use Encode qw(decode_utf8); 11 | use Text::Markdown (); 12 | use Template (); 13 | use Template::Constants qw( :debug :chomp ); 14 | 15 | my $sourcepath = 's'; 16 | my $buildpath = 'build'; 17 | 18 | GetOptions( 19 | 'sourcepath:s' => \$sourcepath, 20 | 'buildpath:s' => \$buildpath, 21 | ) or exit; 22 | 23 | -d $buildpath && -w $buildpath or die; 24 | 25 | my $pages = [ 26 | index => 'Home', 27 | java => 'Java', 28 | perl => 'Perl', 29 | php => 'PHP', 30 | python => 'Python', 31 | regexes => 'Regexes', 32 | # asp => 'ASP', 33 | # coldfusion => 'ColdFusion', 34 | # csharp => 'C#', 35 | # delphi => 'Delphi', 36 | # dotnet => '.NET', 37 | # perl => 'Perl', 38 | # php => 'PHP', 39 | # postgresql => 'PostgreSQL', 40 | # ruby => 'Ruby', 41 | # scheme => 'Scheme', 42 | ]; 43 | 44 | MAIN: { 45 | my $m = Text::Markdown->new; 46 | 47 | my @sidelinks; 48 | 49 | my %tt_defaults = ( 50 | INCLUDE_PATH => [ qw( tt ) ], 51 | OUTPUT_PATH => $buildpath, 52 | DEBUG => DEBUG_UNDEF, 53 | TRIM => CHOMP_ALL, 54 | PRE_CHOMP => 1, 55 | POST_CHOMP => 1, 56 | ENCODING => 'utf8', 57 | ); 58 | 59 | my $tt = Template->new( \%tt_defaults ); 60 | 61 | my @pages = @{$pages}; 62 | while 
( @pages ) { 63 | my ($section,$desc) = splice( @pages, 0, 2 ); 64 | my $path = ($section eq 'index') ? './' : "./$section.html"; 65 | push( @sidelinks, { 66 | path => $path, 67 | text => $desc, 68 | } ); 69 | } 70 | 71 | my $vars = { 72 | sidelinks => \@sidelinks, 73 | }; 74 | 75 | @pages = @{$pages}; 76 | while ( @pages ) { 77 | my ($section,$desc) = splice( @pages, 0, 2 ); 78 | 79 | my $source = read_file( "$sourcepath/$section.md" ); 80 | my $html = $m->markdown( $source ); 81 | $html =~ s{\n}{}smxg; 82 | 83 | $vars->{body} = $html; 84 | $vars->{currlang} = ( $desc eq 'Home' ) ? '' : $desc; 85 | $tt->process( 'page.ttml', $vars, "$section.html", { binmode => ':encoding(UTF-8)' } ) 86 | || die sprintf("file: %s\nerror: %s\n", "$section.html", $tt->error); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /s/index.md: -------------------------------------------------------------------------------- 1 | # "I need a regular expression to parse my HTML" 2 | 3 | New programmers who want to extract information from an HTML document 4 | often turn to regular expressions. 5 | 6 | This is rarely a good idea. HTML is an irregular language and regexes 7 | are inadequate for the job. You should use an HTML parser. 8 | 9 | This site shows you how. 10 | 11 | 12 | # To do 13 | 14 | * Add more languages. 15 | * Explain why regexes are bad. 16 | * Explain how fragile regexes are. 17 | 18 | # Thanks 19 | 20 | Thanks to the following folks for their contributions: 21 | 22 | * M. Buettner 23 | * Kirk Kimmel 24 | * Anubhava Srivastava 25 | * Nathan Mahdavi 26 | * Jeffrey Kegler 27 | * Bill Ricker 28 | * Stuart Caie 29 | * and Jeana Clark 30 | -------------------------------------------------------------------------------- /s/java.md: -------------------------------------------------------------------------------- 1 | The important point about Java HTML parsing is to use a parser designed 2 | for it. 
While you can parse HTML using the default XML parser, it's a 3 | brittle thing because it will only accept well formed, strict XHTML. 4 | 5 | 6 | # TagSoup library 7 | 8 | Hence, I highly recommend using the TagSoup library which slots right 9 | into the parsing framework but handles crappy HTML. 10 | 11 |
 12 | import java.net.URL;
 13 | import org.xml.sax.Attributes;
 14 | import org.xml.sax.helpers.DefaultHandler;
 15 | import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;
 16 | 
 17 | public class HTMLParseExample {
 18 |     public static void main(String args[]) throws Exception {
 19 | 
 20 |         // print the 'src' attributes of <img> tags
 21 |         // from http://www.yahoo.com/
 22 |         // using the TagSoup parser
 23 | 
 24 |         SAXParserImpl.newInstance(null).parse(
 25 |             new URL("http://www.yahoo.com/").openConnection().getInputStream(),
 26 |             new DefaultHandler() {
 27 |                 public void startElement(String uri, String localName,
 28 |                                          String name, Attributes a)
 29 |                 {
 30 |                     if (name.equalsIgnoreCase("img"))
 31 |                         System.out.println(a.getValue("src"));
 32 |                 }
 33 |             }
 34 |         );
 35 |     }
 36 | }
 37 | 
38 | 39 | # Xerces 40 | 41 | And here's a slightly more complex example (collect and print the text 42 | inside nested `

` tags), this time using the standard Java XML parser 43 | Xerxes instead of TagSoup. 44 | 45 |

 46 | import java.net.URL;
 47 | import java.util.ArrayList;
 48 | import org.xml.sax.Attributes;
 49 | import org.xml.sax.InputSource;
 50 | import org.xml.sax.helpers.DefaultHandler;
 51 | 
 52 | public class XHTMLParseExample {
 53 |     public static void main(String args[]) throws Exception {
 54 | 
 55 |         // print the text in <p> ... </p> tags on http://www.w3.org/
 56 |         // using the standard Java XML parser, Xerces
 57 | 
 58 |         javax.xml.parsers.SAXParserFactory.newInstance().newSAXParser().parse(
 59 |             new URL("http://www.w3.org/").openConnection().getInputStream(),
 60 |             new DefaultHandler() {
 61 |                 ArrayList<StringBuilder> p = new ArrayList<StringBuilder>();
 62 | 
 63 |                 public void startElement(String uri, String localName,
 64 |                                          String name, Attributes a)
 65 |                 {
 66 |                     // push a string buffer for every <p> tag
 67 |                     if (name.equalsIgnoreCase("p")) {
 68 |                         p.add(new StringBuilder());
 69 |                     }
 70 |                 }
 71 | 
 72 |                 public void endElement(String uri, String localName, String name)
 73 |                 {
 74 |                     // pop and print a string buffer for every </p> tag
 75 |                     if (name.equalsIgnoreCase("p")) {
 76 |                         int lastIdx = p.size() - 1;
 77 |                         System.out.print("PARA: " + p.remove(lastIdx));
 78 |                     }
 79 |                 }
 80 | 
 81 |                 public void characters(char[] ch, int start, int length) {
 82 |                     // append any characters to the current string buffer
 83 |                     int lastIdx = p.size() - 1;
 84 |                     if (lastIdx > -1) {
 85 |                         p.get(lastIdx).append(new String(ch, start, length))
 86 |                                       .append(' ');
 87 |                     }
 88 |                 }
 89 | 
 90 |                 // if we don't include a fake resolveEntity() method, Xerces
 91 |                 // will try to download the entity URI listed in its cached DTD:
 92 |                 // http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
 93 |                 public InputSource resolveEntity(String publicId, String systemId)
 94 |                     throws org.xml.sax.SAXException, java.io.IOException
 95 |                 {
 96 |                     final String fake = "<!ENTITY nbsp \" \">";
 97 |                     return new InputSource(new java.io.StringReader(fake));
 98 |                 }
 99 |             }
100 |         );
101 |     }
102 | }
103 | 
104 | -------------------------------------------------------------------------------- /s/perl.md: -------------------------------------------------------------------------------- 1 | # Perl uses HTML::Parser 2 | 3 | The CPAN module [HTML::Parser][6] is the basis for all HTML parsing 4 | in Perl. There are other CPAN modules that do parsing, but the 5 | vast majority of them are just wrappers around HTML::Parser. 6 | 7 | # Marpa::HTML 8 | 9 | [Marpa::HTML][1] does "high-level" parsing of HTML. It allows 10 | handlers to be specified for elements, terminals and other components 11 | in the hierarchical structure of an HTML document. It's a is a 12 | completely liberal HTML parser: it never rejects a document, no 13 | matter how poorly that document fits the HTML standards. 14 | 15 | The parsing method [Marpa::HTML][1] uses is totally new, as described 16 | in "How to Parse HTML", Parts [one][2], [two][3] and [three][4]. 17 | Its [Marpa::XS][5] parse engine is in optimized C. 18 | 19 | # WWW::Mechanize 20 | 21 | [WWW::Mechanize][7] is a handy module because it handles two common 22 | tasks associated with parsing HTML: fetching a remote document and 23 | extracting basic information from a document. 24 | 25 | # Fetch the document located at $url 26 | $mech->get( $url ); 27 | 28 | Calling the `get()` subroutine handles all the lower level work of 29 | using [LWP](https://metacpan.org/module/LWP) to fetch a page and 30 | then [HTML::Parser][6] to build up a useful object. This `$mech` 31 | object has numerous subroutines for accessing all of the data or 32 | in a piecemeal fashion. 
33 | 34 | # Get the text from the current object 35 | my $text = $mech->text(); 36 | 37 | # Return all links 38 | my $links = $mech->links(); 39 | 40 | # Return all images 41 | my $images = $mech->images(); 42 | 43 | # Fetch the page title 44 | my $title = $mech->title(); 45 | 46 | [WWW::Mechanize][7] also provides `find_all_links()` and 47 | `find_all_images()` for searching through all the links and images 48 | that match a certain criteria, such as: 49 | 50 | # Find all links with link text of "Download" 51 | my @links = $mech->find_all_links( text => 'Download' ); 52 | 53 | # Find all links that look like they might be download 54 | my @links = $mech->find_all_links( url_regex => qr/download/i ); 55 | 56 | # WWW::Mechanize::TreeBuilder 57 | 58 | [WWW::Mechanize::TreeBuilder][8] is a combination of [WWW::Mechanize][7] 59 | and [WWW::TreeBuilder][9] which brings the functionality of 60 | [HTML::Element][10] with it. Now it is possible to search by tag 61 | name or by attribute. 62 | 63 | use v5.10; 64 | use WWW::Mechanize; 65 | use WWW::Mechanize::TreeBuilder; 66 | 67 | my $mech = WWW::Mechanize->new; 68 | WWW::Mechanize::TreeBuilder->meta->apply($mech); 69 | 70 | $mech->get( 'http://htmlparsing.com/' ); 71 | 72 | # Find all

tags 73 | my @list = $mech->find('h1'); 74 | 75 | # or this way 76 | my @list = $mech->look_down('_tag', 'h1'); 77 | 78 | # Now just iterate and process 79 | foreach (@list) { 80 | say $_->as_text(); 81 | } 82 | 83 | `find()` searches by tag name whereas `look_down()` starts at `$mech` 84 | and looks thru its element descendants (in pre-order), looking for 85 | elements matching the criteria you specify. In the above example 86 | we are using the internal attribute value `_tag` to search for 87 | `

` tags only. `look_down()` can use HTML attribute names, values 88 | or be passed a coderef. 89 | 90 | # xmlgrep 91 | 92 | The [XML::Twig](http://search.cpan.org/dist/XML-Twig) module includes the 93 | `xmlgrep` utility, which can often be good enough. It doesn't parse, 94 | but finds local matches. 95 | 96 | 97 | # To do 98 | 99 | * Code examples 100 | * Other modules than HTML::Parser 101 | 102 | [1]: https://metacpan.org/module/Marpa::HTML "Marpa::HTML" 103 | [2]: http://blogs.perl.org/users/jeffrey_kegler/2011/11/how-to-parse-html.html "How to Parse HTML 1" 104 | [3]: http://blogs.perl.org/users/jeffrey_kegler/2011/12/how-to-parse-html-part-2.html "How to Parse HTML 2" 105 | [4]: http://blogs.perl.org/users/jeffrey_kegler/2011/12/how-to-parse-html-part-3.html "How to Parse HTML 3" 106 | [5]: https://metacpan.org/module/Marpa::XS "Marpa::XS" 107 | [6]: http://search.cpan.org/dist/HTML-Parser/ 108 | [7]: https://metacpan.org/module/WWW::Mechanize 109 | [8]: https://metacpan.org/module/WWW::Mechanize::TreeBuilder 110 | [9]: https://metacpan.org/module/HTML::TreeBuilder 111 | [10]: https://metacpan.org/module/HTML::Element 112 | -------------------------------------------------------------------------------- /s/php.md: -------------------------------------------------------------------------------- 1 | # PHP uses DOM 2 | 3 | HTML parsing in PHP is done with the 4 | [DOM module](http://php.net/manual/en/book.dom.php). 5 | 6 | $dom = new DOMDocument; 7 | $dom->loadHTML($html); 8 | $images = $dom->getElementsByTagName('img'); 9 | foreach ($images as $image) { 10 | $image->setAttribute('src', 'http://example.com/' . 
$image->getAttribute('src')); 11 | } 12 | $html = $dom->saveHTML(); 13 | 14 | Here's an example for pulling out any `` tags with the `nofollow` attribute: 15 | 16 | $doc = new DOMDocument(); 17 | libxml_use_internal_errors(true); 18 | $doc->loadHTML($html); // loads your HTML 19 | $xpath = new DOMXPath($doc); 20 | // returns a list of all links with rel=nofollow 21 | $nlist = $xpath->query("//a[@rel='nofollow']"); 22 | 23 | ## A simple DOM program to extract Google result links 24 | 25 | loadHTML($html); 44 | 45 | # Iterate over all the tags 46 | foreach($dom->getElementsByTagName('a') as $link) { 47 | # Show the 48 | echo $link->getAttribute('href'); 49 | echo "
"; 50 | } 51 | ?> 52 | 53 | # simple\_html\_dom 54 | 55 | The [simple\_html\_dom][simple_html_dom] module is an alternative to 56 | the built-in-DOM module. Since it is a third-party module, you'll have 57 | to install it yourself. 58 | 59 | [simple_html_dom]: http://simplehtmldom.sourceforge.net/ "simple_html_dom homepage" 60 | 61 | ## Modifying links with simple\_html\_dom 62 | 63 | Say you have some links in your HTML file that look like this: 64 | 65 |
66 | 67 | and you want to convert them to: 68 | 69 | 70 | 71 | but only the ones with a class of "someclass". Here's a program to 72 | do that: 73 | 74 | $html = new simple_html_dom(); 75 | $html->load($input); 76 | 77 | foreach($html->find('a[class=someclass]') as $link) 78 | $link->href = 'http://www.example.com' . $link->href; 79 | 80 | $result = $html->save(); 81 | 82 | `find` lets you easily query the DOM. The parameter is 83 | `tagtype[attributeName=attributeValue]` where the square brackets are 84 | an optional filter. Then you just iterate over every link this function 85 | finds, and prepend the href attribute with your domain. The `href` 86 | function is both a getter and setter. 87 | 88 | ## Extracting text with simple\_html\_dom 89 | 90 | A common task is to remove all tag markup from a page of HTML, leaving 91 | only the text. This is simple: 92 | 93 | echo file_get_html('http://www.google.com/')->plaintext; 94 | 95 | # More alternative parsers for PHP 96 | 97 | [This thread on StackOverflow](http://stackoverflow.com/questions/292926/robust-mature-html-parser-for-php) 98 | discusses a number of different parsing tools available for PHP. 99 | -------------------------------------------------------------------------------- /s/python.md: -------------------------------------------------------------------------------- 1 | * [ElementTree](http://docs.python.org/2/library/xml.etree.elementtree.html) is part of the standard library. 2 | * [Beautiful Soup](http://www.crummy.com/software/BeautifulSoup/) is a popular 3rd-party library. 3 | * [lxml](http://lxml.de/) is a fast and feature-rich C-based library. 
4 | * [twisted](http://twistedmatrix.com/documents/current/api/twisted.html) includes a DOM library [twisted.web.domhelpers](http://twistedmatrix.com/documents/10.2.0/api/twisted.web.domhelpers.html) 5 | 6 | ElementTree example: 7 | 8 | from xml.etree import ElementTree 9 | 10 | tree = ElementTree.parse('filename.html') 11 | for elem in tree.findall('table'): 12 | print ElementTree.tostring(elem) 13 | 14 | Examples welcomed! 15 | -------------------------------------------------------------------------------- /s/regexes.md: -------------------------------------------------------------------------------- 1 | # You should probably not be using regular expressions 2 | 3 | * HTML is not regular 4 | * Regexes may match today, but what about tomorrow? 5 | 6 | Say you've got a file of HTML where you're trying to extract URLs from 7 | <img> tags. 8 | 9 | 10 | 11 | So you write a regex like this (in [Perl](/perl.html)): 12 | 13 | if ( $html =~ / 22 | 23 | or 24 | 25 | 26 | 27 | or 28 | 29 | 30 | 31 | or 32 | 33 | 35 | 36 | or you start getting false positives from 37 | 38 | 39 | 40 | # Don't reinvent the wheel 41 | 42 | Parsers are pieces of code that already work, already have been tested. 43 | 44 | Your regex probably doesn't have everything worked out. 45 | Parsers have solutions for edge cases built in. 46 | 47 | # Why not parse with regexes? 48 | 49 | You can't reliably parse HTML with regexes. Regular expressions are a 50 | tool that is insufficiently sophisticated to understand the constructs 51 | employed by HTML. HTML is not a regular language and hence cannot be 52 | parsed by regular expressions. Regex queries are not equipped to break 53 | down HTML into its meaningful parts. HTML is a language of sufficient 54 | complexity that it cannot be parsed by regular expressions. 
55 | -------------------------------------------------------------------------------- /static/css/htmlparsing.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | padding: 0; 4 | background: #dcdcdc url('../img/background.png') repeat-x top; 5 | font-family: georgia, 'times new roman', serif; 6 | } 7 | 8 | /* basic site styling */ 9 | 10 | #content a:link, a:visited { 11 | color:#800; 12 | text-decoration:underline; 13 | } 14 | 15 | #content a:hover, a:active { 16 | color:#515151; 17 | text-decoration:underline; 18 | } 19 | 20 | h1, h2 { 21 | font-size:24px; 22 | margin-top:40px; 23 | margin-bottom:15px; 24 | font-weight:normal; 25 | font-family:'trebuchet ms', verdana, tahoma, 'sans serif'; 26 | } 27 | 28 | h1 { 29 | font-family: 'trebuchet ms', verdana, tahoma, "sans serif"; 30 | color: #000000; 31 | } 32 | 33 | h1 a:link { 34 | color:#FFFFFF; 35 | text-decoration:none; 36 | } 37 | 38 | h1 a:visited { 39 | color:#FFFFFF; 40 | text-decoration:none; 41 | } 42 | 43 | 44 | .topic { 45 | font-size: 20px; 46 | } 47 | 48 | #current { 49 | color:#800; 50 | font-weight:bold; 51 | } 52 | 53 | 54 | 55 | 56 | /* containers */ 57 | #wrap { 58 | -webkit-box-shadow: 0px 3px 15px #2b2b2b; 59 | -moz-box-shadow: 0px 3px 15px #2b2b2b; 60 | margin:0 5%; 61 | min-width:960px; 62 | } 63 | 64 | #topcontainer { 65 | height: 10px; 66 | border-top: #666 1px solid; 67 | background: #000 url('../img/toptop_bg.png') repeat-x bottom; 68 | } 69 | 70 | #container, #container2 { 71 | background: #FFF url('../img/container_bg.png') repeat-x top; 72 | } 73 | 74 | div#container2 > a > img { /* github banderole */ 75 | position: absolute; top: 0; right: 0; border: 0; 76 | } 77 | 78 | #banner { 79 | background:black url('../img/top_bg.png') repeat-x top; 80 | border-bottom: 1px solid #333; 81 | } 82 | 83 | #banner h1 { 84 | margin: 0; 85 | padding: 30px 5px 30px 20px; 86 | color:white; 87 | } 88 | 89 | /* top right nav */ 90 | #subnav { 
91 | margin:0; 92 | background:#4c4c4c; 93 | border-top:1px solid #666; 94 | border-bottom:2px solid #fff; 95 | padding: 4px 5px 4px 5px; 96 | color:#a9a9a9; 97 | height:20px; 98 | } 99 | 100 | /* left column */ 101 | #left { 102 | float: left; 103 | width: 190px; 104 | padding-left:0px; 105 | } 106 | 107 | /* main content area */ 108 | #content { 109 | padding-top: 1em; 110 | margin: 0 2em 0 220px; 111 | } 112 | 113 | #content img { 114 | margin: 0 0 15px 15px; 115 | } 116 | 117 | #content h1 { 118 | font-weight:bold; 119 | } 120 | 121 | code { 122 | /* font & text */ 123 | font-family: monospace; 124 | font-size: 12.7px; 125 | line-height: 14px; 126 | white-space: pre; 127 | 128 | /* color & background */ 129 | background-color: #fffae5; 130 | background-position: 0% 0%; 131 | color: #000000; 132 | 133 | /* box */ 134 | width: 95%; 135 | border: 1px dashed #f0c931; 136 | margin: 3px 0 13px 0; 137 | padding: 13px 13px 13px 13px; 138 | 139 | /* positioning */ 140 | display: block; 141 | } 142 | 143 | p code { 144 | display: inline; 145 | padding: 1px; 146 | font-size: 13px; 147 | } 148 | 149 | div#content > p > a > img { /* comic */ 150 | float: right; 151 | border: 0; 152 | } 153 | 154 | 155 | /* left nav */ 156 | 157 | 158 | #nav ul { 159 | width: 100%; 160 | margin:0; 161 | padding:0; 162 | background: url('../img/navmenu_bg.png') no-repeat top right; 163 | margin-top:30px; 164 | font-family:'trebuchet ms', verdana, tahoma, 'sans serif'; 165 | } 166 | 167 | #nav li { 168 | list-style:none; 169 | height:45px; 170 | background: url('../img/nav_bgbottom.png') no-repeat -40px 44px; 171 | } 172 | 173 | #nav ul li a { 174 | color:black; 175 | text-decoration:none; 176 | padding-left:20px; 177 | line-height:45px; 178 | display:block; 179 | height:45px; 180 | } 181 | 182 | #nav ul li a:hover { 183 | color:#800; 184 | background-color: #f2dd8c; 185 | text-decoration:underline; 186 | } 187 | 188 | #nav a:active { 189 | color:#fff; 190 | background-color:#800; 191 | 
text-decoration:underline; 192 | } 193 | 194 | 195 | /* top right nav */ 196 | #subnav ul { 197 | margin:0; 198 | padding: 0; 199 | text-decoration:none; 200 | } 201 | 202 | #subnav li { 203 | list-style:none; 204 | text-decoration:none; 205 | } 206 | 207 | #subnav ul li a { 208 | display:block; 209 | color:#a9a9a9; 210 | text-decoration:none; 211 | float:right; 212 | padding:0px 12px 0px 12px; 213 | 214 | } 215 | 216 | #subnav ul li a:hover { 217 | color:white; 218 | text-decoration:none; 219 | } 220 | 221 | 222 | /* footer */ 223 | 224 | #footer { 225 | clear: both; 226 | background: #000 url('../img/footer_bg.png') repeat-x top; 227 | padding: 3em 0 0; 228 | height:55px; 229 | margin:0px 0px 0px 0px; 230 | color:#f9f9f9; 231 | text-align: center; 232 | } 233 | 234 | #footer a:link, #footer a:visited { 235 | color: white; 236 | text-decoration:none; 237 | } 238 | 239 | #footer a:hover, #footer a:active { 240 | color: white; 241 | text-decoration:underline; 242 | } 243 | -------------------------------------------------------------------------------- /static/img/background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/background.png -------------------------------------------------------------------------------- /static/img/container_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/container_bg.png -------------------------------------------------------------------------------- /static/img/footer_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/footer_bg.png 
-------------------------------------------------------------------------------- /static/img/nav_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/nav_bg.png -------------------------------------------------------------------------------- /static/img/nav_bgbottom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/nav_bgbottom.png -------------------------------------------------------------------------------- /static/img/navmenu_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/navmenu_bg.png -------------------------------------------------------------------------------- /static/img/top_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/top_bg.png -------------------------------------------------------------------------------- /static/img/toptop_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/toptop_bg.png -------------------------------------------------------------------------------- /static/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * # directed to all spiders, not just Scooter 2 | -------------------------------------------------------------------------------- /t/html.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -T 2 | 3 | 
use warnings; 4 | use strict; 5 | 6 | use Test::More; 7 | use Test::HTML::Tidy5; 8 | use File::Slurp; 9 | use Encode qw(decode_utf8); 10 | 11 | my @files = glob( 'build/*.html' ); 12 | plan( tests => scalar @files ); 13 | 14 | for my $filename ( @files ) { 15 | my $text = decode_utf8(read_file( $filename )); 16 | 17 | html_tidy_ok( $text, $filename ); 18 | } 19 | -------------------------------------------------------------------------------- /tt/header.tt: -------------------------------------------------------------------------------- 1 |
2 |   3 |
4 |
24 | -------------------------------------------------------------------------------- /tt/page.ttml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | htmlparsing.com: Your guide to parsing HTML 5 | 6 | 7 | 8 | 15 | 16 | 17 | 18 |
19 | [% PROCESS header.tt %] 20 |
21 | [% PROCESS sidebar.tt %] 22 |
23 | [% body %] 24 |
25 |
26 | 30 | Fork me on GitHub 34 |
35 |
36 | 37 | 38 | -------------------------------------------------------------------------------- /tt/sidebar.tt: -------------------------------------------------------------------------------- 1 |
2 | 13 |
14 | --------------------------------------------------------------------------------