├── .gitignore ├── Makefile ├── README.md ├── crank ├── s ├── index.md ├── java.md ├── perl.md ├── php.md ├── python.md └── regexes.md ├── static ├── css │ └── htmlparsing.css ├── img │ ├── background.png │ ├── container_bg.png │ ├── footer_bg.png │ ├── nav_bg.png │ ├── nav_bgbottom.png │ ├── navmenu_bg.png │ ├── top_bg.png │ └── toptop_bg.png └── robots.txt ├── t └── html.t └── tt ├── header.tt ├── page.ttml └── sidebar.tt /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: \ 2 | crank \ 3 | clean 4 | 5 | BUILD=build 6 | SOURCE=s 7 | 8 | default: crank 9 | 10 | clean: 11 | rm -fr $(BUILD) 12 | 13 | crank: clean 14 | mkdir -p $(BUILD)/ || true > /dev/null 2>&1 15 | ./crank --sourcepath=$(SOURCE) --buildpath=$(BUILD) 16 | cp -R static/* $(BUILD)/ 17 | 18 | test: crank 19 | prove t/html.t 20 | 21 | # This is only useful for Andy 22 | rsync: crank 23 | rsync -azu -e ssh --delete --verbose \ 24 | $(BUILD)/ andy@alex.petdance.com:/srv/htmlparsing 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This project is the source code for http://htmlparsing.com, plus the 2 | Perl code that converts it from Markdown format into HTML and uploads 3 | it to the server. 4 | 5 | Repository layout 6 | ----------------- 7 | 8 | s page bodies in Markdown format 9 | tt templates in Template::Toolkit format 10 | static images and styles 11 | t tests 12 | build output 13 | 14 | Requirements 15 | ------------ 16 | 17 | Perl and additional CPAN modules. 
18 | 19 | For building: 20 | 21 | File::Slurp 22 | Template 23 | Text::Markdown 24 | 25 | For testing: 26 | 27 | Test::HTML::Tidy5 28 | 29 | Contributing page content 30 | ------------------------- 31 | 32 | 1. Modify templates or page bodies. New pages have to be registered in the file `crank`. 33 | 2. Run `make` to build the site and inspect the result in the `build` directory. 34 | 3. Run `make test` to check for HTML errors. 35 | 4. Commit/publish changes, see `s/index.md`. 36 | -------------------------------------------------------------------------------- /crank: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | eval 'use Carp::Always'; # Not everyone has it 7 | 8 | use Getopt::Long; 9 | use File::Slurp; 10 | use Encode qw(decode_utf8); 11 | use Text::Markdown (); 12 | use Template (); 13 | use Template::Constants qw( :debug :chomp ); 14 | 15 | my $sourcepath = 's'; 16 | my $buildpath = 'build'; 17 | 18 | GetOptions( 19 | 'sourcepath:s' => \$sourcepath, 20 | 'buildpath:s' => \$buildpath, 21 | ) or exit; 22 | 23 | -d $buildpath && -w $buildpath or die; 24 | 25 | my $pages = [ 26 | index => 'Home', 27 | java => 'Java', 28 | perl => 'Perl', 29 | php => 'PHP', 30 | python => 'Python', 31 | regexes => 'Regexes', 32 | # asp => 'ASP', 33 | # coldfusion => 'ColdFusion', 34 | # csharp => 'C#', 35 | # delphi => 'Delphi', 36 | # dotnet => '.NET', 37 | # perl => 'Perl', 38 | # php => 'PHP', 39 | # postgresql => 'PostgreSQL', 40 | # ruby => 'Ruby', 41 | # scheme => 'Scheme', 42 | ]; 43 | 44 | MAIN: { 45 | my $m = Text::Markdown->new; 46 | 47 | my @sidelinks; 48 | 49 | my %tt_defaults = ( 50 | INCLUDE_PATH => [ qw( tt ) ], 51 | OUTPUT_PATH => $buildpath, 52 | DEBUG => DEBUG_UNDEF, 53 | TRIM => CHOMP_ALL, 54 | PRE_CHOMP => 1, 55 | POST_CHOMP => 1, 56 | ENCODING => 'utf8', 57 | ); 58 | 59 | my $tt = Template->new( \%tt_defaults ); 60 | 61 | my @pages = @{$pages}; 62 | while 
( @pages ) { 63 | my ($section,$desc) = splice( @pages, 0, 2 ); 64 | my $path = ($section eq 'index') ? './' : "./$section.html"; 65 | push( @sidelinks, { 66 | path => $path, 67 | text => $desc, 68 | } ); 69 | } 70 | 71 | my $vars = { 72 | sidelinks => \@sidelinks, 73 | }; 74 | 75 | @pages = @{$pages}; 76 | while ( @pages ) { 77 | my ($section,$desc) = splice( @pages, 0, 2 ); 78 | 79 | my $source = read_file( "$sourcepath/$section.md" ); 80 | my $html = $m->markdown( $source ); 81 | $html =~ s{\n}{}smxg; 82 | 83 | $vars->{body} = $html; 84 | $vars->{currlang} = ( $desc eq 'Home' ) ? '' : $desc; 85 | $tt->process( 'page.ttml', $vars, "$section.html", { binmode => ':encoding(UTF-8)' } ) 86 | || die sprintf("file: %s\nerror: %s\n", "$section.html", $tt->error); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /s/index.md: -------------------------------------------------------------------------------- 1 | # "I need a regular expression to parse my HTML" 2 | 3 | New programmers who want to extract information from an HTML document 4 | often turn to regular expressions. 5 | 6 | This is rarely a good idea. HTML is an irregular language and regexes 7 | are inadequate for the job. You should use an HTML parser. 8 | 9 | This site shows you how. 10 | 11 | 12 | # To do 13 | 14 | * Add more languages. 15 | * Explain why regexes are bad. 16 | * Explain how fragile regexes are. 17 | 18 | # Thanks 19 | 20 | Thanks to the following folks for their contributions: 21 | 22 | * M. Buettner 23 | * Kirk Kimmel 24 | * Anubhava Srivastava 25 | * Nathan Mahdavi 26 | * Jeffrey Kegler 27 | * Bill Ricker 28 | * Stuart Caie 29 | * and Jeana Clark 30 | -------------------------------------------------------------------------------- /s/java.md: -------------------------------------------------------------------------------- 1 | The important point about Java HTML parsing is to use a parser designed 2 | for it. 
While you can parse HTML using the default XML parser, it's a 3 | brittle thing because it will only accept well formed, strict XHTML. 4 | 5 | 6 | # TagSoup library 7 | 8 | Hence, I highly recommend using the TagSoup library which slots right 9 | into the parsing framework but handles crappy HTML. 10 | 11 |
 12 | import java.net.URL;
 13 | import org.xml.sax.Attributes;
 14 | import org.xml.sax.helpers.DefaultHandler;
 15 | import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;
 16 | 
 17 | public class HTMLParseExample {
 18 |     public static void main(String args[]) throws Exception {
 19 | 
 20 |         // print the 'src' attributes of <img> tags
 21 |         // from http://www.yahoo.com/
 22 |         // using the TagSoup parser
 23 | 
 24 |         SAXParserImpl.newInstance(null).parse(
 25 |             new URL("http://www.yahoo.com/").openConnection().getInputStream(),
 26 |             new DefaultHandler() {
 27 |                 public void startElement(String uri, String localName,
 28 |                                          String name, Attributes a)
 29 |                 {
 30 |                     if (name.equalsIgnoreCase("img"))
 31 |                         System.out.println(a.getValue("src"));
 32 |                 }
 33 |             }
 34 |         );
 35 |     }
 36 | }
 37 | 
38 | 39 | # Xerces 40 | 41 | And here's a slightly more complex example (collect and print the text 42 | inside nested `

` tags), this time using the standard Java XML parser 43 | Xerxes instead of TagSoup. 44 | 45 |

 46 | import java.net.URL;
 47 | import java.util.ArrayList;
 48 | import org.xml.sax.Attributes;
 49 | import org.xml.sax.InputSource;
 50 | import org.xml.sax.helpers.DefaultHandler;
 51 | 
 52 | public class XHTMLParseExample {
 53 |     public static void main(String args[]) throws Exception {
 54 | 
 55 |         // print the text in <p> ... </p> tags on http://www.w3.org/
 56 |         // using the standard Java XML parser, Xerces
 57 | 
 58 |         javax.xml.parsers.SAXParserFactory.newInstance().newSAXParser().parse(
 59 |             new URL("http://www.w3.org/").openConnection().getInputStream(),
 60 |             new DefaultHandler() {
 61 |                 ArrayList<StringBuilder> p = new ArrayList<StringBuilder>();
 62 | 
 63 |                 public void startElement(String uri, String localName,
 64 |                                          String name, Attributes a)
 65 |                 {
 66 |                     // push a string buffer for every <p> tag
 67 |                     if (name.equalsIgnoreCase("p")) {
 68 |                         p.add(new StringBuilder());
 69 |                     }
 70 |                 }
 71 | 
 72 |                 public void endElement(String uri, String localName, String name)
 73 |                 {
 74 |                     // pop and print a string buffer for every </p> tag
 75 |                     if (name.equalsIgnoreCase("p")) {
 76 |                         int lastIdx = p.size() - 1;
 77 |                         System.out.print("PARA: " + p.remove(lastIdx));
 78 |                     }
 79 |                 }
 80 | 
 81 |                 public void characters(char[] ch, int start, int length) {
 82 |                     // append any characters to the current string buffer
 83 |                     int lastIdx = p.size() - 1;
 84 |                     if (lastIdx > -1) {
 85 |                         p.get(lastIdx).append(new String(ch, start, length))
 86 |                                       .append(' ');
 87 |                     }
 88 |                 }
 89 | 
 90 |                 // if we don't include a fake resolveEntity() method, Xerces
 91 |                 // will try to download the entity URI listed in its cached DTD:
 92 |                 // http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
 93 |                 public InputSource resolveEntity(String publicId, String systemId)
 94 |                     throws org.xml.sax.SAXException, java.io.IOException
 95 |                 {
 96 |                     final String fake = "<!ENTITY nbsp \" \">";
 97 |                     return new InputSource(new java.io.StringReader(fake));
 98 |                 }
 99 |             }
100 |         );
101 |     }
102 | }
103 | 
104 | -------------------------------------------------------------------------------- /s/perl.md: -------------------------------------------------------------------------------- 1 | # Perl uses HTML::Parser 2 | 3 | The CPAN module [HTML::Parser][6] is the basis for all HTML parsing 4 | in Perl. There are other CPAN modules that do parsing, but the 5 | vast majority of them are just wrappers around HTML::Parser. 6 | 7 | # Marpa::HTML 8 | 9 | [Marpa::HTML][1] does "high-level" parsing of HTML. It allows 10 | handlers to be specified for elements, terminals and other components 11 | in the hierarchical structure of an HTML document. It's a is a 12 | completely liberal HTML parser: it never rejects a document, no 13 | matter how poorly that document fits the HTML standards. 14 | 15 | The parsing method [Marpa::HTML][1] uses is totally new, as described 16 | in "How to Parse HTML", Parts [one][2], [two][3] and [three][4]. 17 | Its [Marpa::XS][5] parse engine is in optimized C. 18 | 19 | # WWW::Mechanize 20 | 21 | [WWW::Mechanize][7] is a handy module because it handles two common 22 | tasks associated with parsing HTML: fetching a remote document and 23 | extracting basic information from a document. 24 | 25 | # Fetch the document located at $url 26 | $mech->get( $url ); 27 | 28 | Calling the `get()` subroutine handles all the lower level work of 29 | using [LWP](https://metacpan.org/module/LWP) to fetch a page and 30 | then [HTML::Parser][6] to build up a useful object. This `$mech` 31 | object has numerous subroutines for accessing all of the data or 32 | in a piecemeal fashion. 
33 | 34 | # Get the text from the current object 35 | my $text = $mech->text(); 36 | 37 | # Return all links 38 | my $links = $mech->links(); 39 | 40 | # Return all images 41 | my $images = $mech->images(); 42 | 43 | # Fetch the page title 44 | my $title = $mech->title(); 45 | 46 | [WWW::Mechanize][7] also provides `find_all_links()` and 47 | `find_all_images()` for searching through all the links and images 48 | that match a certain criteria, such as: 49 | 50 | # Find all links with link text of "Download" 51 | my @links = $mech->find_all_links( text => 'Download' ); 52 | 53 | # Find all links that look like they might be download 54 | my @links = $mech->find_all_links( url_regex => qr/download/i ); 55 | 56 | # WWW::Mechanize::TreeBuilder 57 | 58 | [WWW::Mechanize::TreeBuilder][8] is a combination of [WWW::Mechanize][7] 59 | and [WWW::TreeBuilder][9] which brings the functionality of 60 | [HTML::Element][10] with it. Now it is possible to search by tag 61 | name or by attribute. 62 | 63 | use v5.10; 64 | use WWW::Mechanize; 65 | use WWW::Mechanize::TreeBuilder; 66 | 67 | my $mech = WWW::Mechanize->new; 68 | WWW::Mechanize::TreeBuilder->meta->apply($mech); 69 | 70 | $mech->get( 'http://htmlparsing.com/' ); 71 | 72 | # Find all

tags 73 | my @list = $mech->find('h1'); 74 | 75 | # or this way 76 | my @list = $mech->look_down('_tag', 'h1'); 77 | 78 | # Now just iterate and process 79 | foreach (@list) { 80 | say $_->as_text(); 81 | } 82 | 83 | `find()` searches by tag name whereas `look_down()` starts at `$mech` 84 | and looks thru its element descendants (in pre-order), looking for 85 | elements matching the criteria you specify. In the above example 86 | we are using the internal attribute value `_tag` to search for 87 | `

` tags only. `look_down()` can use HTML attribute names, values 88 | or be passed a coderef. 89 | 90 | # xmlgrep 91 | 92 | The [XML::Twig](http://search.cpan.org/dist/XML-Twig) module includes the 93 | `xmlgrep` utility, which can often be good enough. It doesn't parse, 94 | but finds local matches. 95 | 96 | 97 | # To do 98 | 99 | * Code examples 100 | * Other modules than HTML::Parser 101 | 102 | [1]: https://metacpan.org/module/Marpa::HTML "Marpa::HTML" 103 | [2]: http://blogs.perl.org/users/jeffrey_kegler/2011/11/how-to-parse-html.html "How to Parse HTML 1" 104 | [3]: http://blogs.perl.org/users/jeffrey_kegler/2011/12/how-to-parse-html-part-2.html "How to Parse HTML 2" 105 | [4]: http://blogs.perl.org/users/jeffrey_kegler/2011/12/how-to-parse-html-part-3.html "How to Parse HTML 3" 106 | [5]: https://metacpan.org/module/Marpa::XS "Marpa::XS" 107 | [6]: http://search.cpan.org/dist/HTML-Parser/ 108 | [7]: https://metacpan.org/module/WWW::Mechanize 109 | [8]: https://metacpan.org/module/WWW::Mechanize::TreeBuilder 110 | [9]: https://metacpan.org/module/HTML::TreeBuilder 111 | [10]: https://metacpan.org/module/HTML::Element 112 | -------------------------------------------------------------------------------- /s/php.md: -------------------------------------------------------------------------------- 1 | # PHP uses DOM 2 | 3 | HTML parsing in PHP is done with the 4 | [DOM module](http://php.net/manual/en/book.dom.php). 5 | 6 | $dom = new DOMDocument; 7 | $dom->loadHTML($html); 8 | $images = $dom->getElementsByTagName('img'); 9 | foreach ($images as $image) { 10 | $image->setAttribute('src', 'http://example.com/' . 
$image->getAttribute('src')); 11 | } 12 | $html = $dom->saveHTML(); 13 | 14 | Here's an example for pulling out any `` tags with the `nofollow` attribute: 15 | 16 | $doc = new DOMDocument(); 17 | libxml_use_internal_errors(true); 18 | $doc->loadHTML($html); // loads your HTML 19 | $xpath = new DOMXPath($doc); 20 | // returns a list of all links with rel=nofollow 21 | $nlist = $xpath->query("//a[@rel='nofollow']"); 22 | 23 | ## A simple DOM program to extract Google result links 24 | 25 | loadHTML($html); 44 | 45 | # Iterate over all the tags 46 | foreach($dom->getElementsByTagName('a') as $link) { 47 | # Show the 48 | echo $link->getAttribute('href'); 49 | echo "
"; 50 | } 51 | ?> 52 | 53 | # simple\_html\_dom 54 | 55 | The [simple\_html\_dom][simple_html_dom] module is an alternative to 56 | the built-in-DOM module. Since it is a third-party module, you'll have 57 | to install it yourself. 58 | 59 | [simple_html_dom]: http://simplehtmldom.sourceforge.net/ "simple_html_dom homepage" 60 | 61 | ## Modifying links with simple\_html\_dom 62 | 63 | Say you have some links in your HTML file that look like this: 64 | 65 |
66 | 67 | and you want to convert them to: 68 | 69 | 70 | 71 | but only the ones with a class of "someclass". Here's a program to 72 | do that: 73 | 74 | $html = new simple_html_dom(); 75 | $html->load($input); 76 | 77 | foreach($html->find('a[class=someclass]') as $link) 78 | $link->href = 'http://www.example.com' . $link->href; 79 | 80 | $result = $html->save(); 81 | 82 | `find` lets you easily query the DOM. The parameter is 83 | `tagtype[attributeName=attributeValue]` where the square brackets are 84 | an optional filter. Then you just iterate over every link this function 85 | finds, and prepend the href attribute with your domain. The `href` 86 | function is both a getter and setter. 87 | 88 | ## Extracting text with simple\_html\_dom 89 | 90 | A common task is to remove all tag markup from a page of HTML, leaving 91 | only the text. This is simple: 92 | 93 | echo file_get_html('http://www.google.com/')->plaintext; 94 | 95 | # More alternative parsers for PHP 96 | 97 | [This thread on StackOverflow](http://stackoverflow.com/questions/292926/robust-mature-html-parser-for-php) 98 | discusses a number of different parsing tools available for PHP. 99 | -------------------------------------------------------------------------------- /s/python.md: -------------------------------------------------------------------------------- 1 | * [ElementTree](http://docs.python.org/2/library/xml.etree.elementtree.html) is part of the standard library. 2 | * [Beautiful Soup](http://www.crummy.com/software/BeautifulSoup/) is a popular 3rd-party library. 3 | * [lxml](http://lxml.de/) is a fast and feature-rich C-based library. 
4 | * [twisted](http://twistedmatrix.com/documents/current/api/twisted.html) includes a DOM library [twisted.web.domhelpers](http://twistedmatrix.com/documents/10.2.0/api/twisted.web.domhelpers.html) 5 | 6 | ElementTree example: 7 | 8 | from xml.etree import ElementTree 9 | 10 | tree = ElementTree.parse('filename.html') 11 | for elem in tree.findall('table'): 12 | print ElementTree.tostring(elem) 13 | 14 | Examples welcomed! 15 | -------------------------------------------------------------------------------- /s/regexes.md: -------------------------------------------------------------------------------- 1 | # You should probably not be using regular expressions 2 | 3 | * HTML is not regular 4 | * Regexes may match today, but what about tomorrow? 5 | 6 | Say you've got a file of HTML where you're trying to extract URLs from 7 | <img> tags. 8 | 9 | 10 | 11 | So you write a regex like this (in [Perl](/perl.html)): 12 | 13 | if ( $html =~ / 22 | 23 | or 24 | 25 | 26 | 27 | or 28 | 29 | 30 | 31 | or 32 | 33 | 35 | 36 | or you start getting false positives from 37 | 38 | 39 | 40 | # Don't reinvent the wheel 41 | 42 | Parsers are pieces of code that already work, already have been tested. 43 | 44 | Your regex probably doesn't have everything worked out. 45 | Parsers have solutions for edge cases built in. 46 | 47 | # Why not parse with regexes? 48 | 49 | You can't reliably parse HTML with regexes. Regular expressions are a 50 | tool that is insufficiently sophisticated to understand the constructs 51 | employed by HTML. HTML is not a regular language and hence cannot be 52 | parsed by regular expressions. Regex queries are not equipped to break 53 | down HTML into its meaningful parts. HTML is a language of sufficient 54 | complexity that it cannot be parsed by regular expressions. 
55 | -------------------------------------------------------------------------------- /static/css/htmlparsing.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | padding: 0; 4 | background: #dcdcdc url('../img/background.png') repeat-x top; 5 | font-family: georgia, 'times new roman', serif; 6 | } 7 | 8 | /* basic site styling */ 9 | 10 | #content a:link, a:visited { 11 | color:#800; 12 | text-decoration:underline; 13 | } 14 | 15 | #content a:hover, a:active { 16 | color:#515151; 17 | text-decoration:underline; 18 | } 19 | 20 | h1, h2 { 21 | font-size:24px; 22 | margin-top:40px; 23 | margin-bottom:15px; 24 | font-weight:normal; 25 | font-family:'trebuchet ms', verdana, tahoma, 'sans serif'; 26 | } 27 | 28 | h1 { 29 | font-family: 'trebuchet ms', verdana, tahoma, "sans serif"; 30 | color: #000000; 31 | } 32 | 33 | h1 a:link { 34 | color:#FFFFFF; 35 | text-decoration:none; 36 | } 37 | 38 | h1 a:visited { 39 | color:#FFFFFF; 40 | text-decoration:none; 41 | } 42 | 43 | 44 | .topic { 45 | font-size: 20px; 46 | } 47 | 48 | #current { 49 | color:#800; 50 | font-weight:bold; 51 | } 52 | 53 | 54 | 55 | 56 | /* containers */ 57 | #wrap { 58 | -webkit-box-shadow: 0px 3px 15px #2b2b2b; 59 | -moz-box-shadow: 0px 3px 15px #2b2b2b; 60 | margin:0 5%; 61 | min-width:960px; 62 | } 63 | 64 | #topcontainer { 65 | height: 10px; 66 | border-top: #666 1px solid; 67 | background: #000 url('../img/toptop_bg.png') repeat-x bottom; 68 | } 69 | 70 | #container, #container2 { 71 | background: #FFF url('../img/container_bg.png') repeat-x top; 72 | } 73 | 74 | div#container2 > a > img { /* github banderole */ 75 | position: absolute; top: 0; right: 0; border: 0; 76 | } 77 | 78 | #banner { 79 | background:black url('../img/top_bg.png') repeat-x top; 80 | border-bottom: 1px solid #333; 81 | } 82 | 83 | #banner h1 { 84 | margin: 0; 85 | padding: 30px 5px 30px 20px; 86 | color:white; 87 | } 88 | 89 | /* top right nav */ 90 | #subnav { 
91 | margin:0; 92 | background:#4c4c4c; 93 | border-top:1px solid #666; 94 | border-bottom:2px solid #fff; 95 | padding: 4px 5px 4px 5px; 96 | color:#a9a9a9; 97 | height:20px; 98 | } 99 | 100 | /* left column */ 101 | #left { 102 | float: left; 103 | width: 190px; 104 | padding-left:0px; 105 | } 106 | 107 | /* main content area */ 108 | #content { 109 | padding-top: 1em; 110 | margin: 0 2em 0 220px; 111 | } 112 | 113 | #content img { 114 | margin: 0 0 15px 15px; 115 | } 116 | 117 | #content h1 { 118 | font-weight:bold; 119 | } 120 | 121 | code { 122 | /* font & text */ 123 | font-family: monospace; 124 | font-size: 12.7px; 125 | line-height: 14px; 126 | white-space: pre; 127 | 128 | /* color & background */ 129 | background-color: #fffae5; 130 | background-position: 0% 0%; 131 | color: #000000; 132 | 133 | /* box */ 134 | width: 95%; 135 | border: 1px dashed #f0c931; 136 | margin: 3px 0 13px 0; 137 | padding: 13px 13px 13px 13px; 138 | 139 | /* positioning */ 140 | display: block; 141 | } 142 | 143 | p code { 144 | display: inline; 145 | padding: 1px; 146 | font-size: 13px; 147 | } 148 | 149 | div#content > p > a > img { /* comic */ 150 | float: right; 151 | border: 0; 152 | } 153 | 154 | 155 | /* left nav */ 156 | 157 | 158 | #nav ul { 159 | width: 100%; 160 | margin:0; 161 | padding:0; 162 | background: url('../img/navmenu_bg.png') no-repeat top right; 163 | margin-top:30px; 164 | font-family:'trebuchet ms', verdana, tahoma, 'sans serif'; 165 | } 166 | 167 | #nav li { 168 | list-style:none; 169 | height:45px; 170 | background: url('../img/nav_bgbottom.png') no-repeat -40px 44px; 171 | } 172 | 173 | #nav ul li a { 174 | color:black; 175 | text-decoration:none; 176 | padding-left:20px; 177 | line-height:45px; 178 | display:block; 179 | height:45px; 180 | } 181 | 182 | #nav ul li a:hover { 183 | color:#800; 184 | background-color: #f2dd8c; 185 | text-decoration:underline; 186 | } 187 | 188 | #nav a:active { 189 | color:#fff; 190 | background-color:#800; 191 | 
text-decoration:underline; 192 | } 193 | 194 | 195 | /* top right nav */ 196 | #subnav ul { 197 | margin:0; 198 | padding: 0; 199 | text-decoration:none; 200 | } 201 | 202 | #subnav li { 203 | list-style:none; 204 | text-decoration:none; 205 | } 206 | 207 | #subnav ul li a { 208 | display:block; 209 | color:#a9a9a9; 210 | text-decoration:none; 211 | float:right; 212 | padding:0px 12px 0px 12px; 213 | 214 | } 215 | 216 | #subnav ul li a:hover { 217 | color:white; 218 | text-decoration:none; 219 | } 220 | 221 | 222 | /* footer */ 223 | 224 | #footer { 225 | clear: both; 226 | background: #000 url('../img/footer_bg.png') repeat-x top; 227 | padding: 3em 0 0; 228 | height:55px; 229 | margin:0px 0px 0px 0px; 230 | color:#f9f9f9; 231 | text-align: center; 232 | } 233 | 234 | #footer a:link, #footer a:visited { 235 | color: white; 236 | text-decoration:none; 237 | } 238 | 239 | #footer a:hover, #footer a:active { 240 | color: white; 241 | text-decoration:underline; 242 | } 243 | -------------------------------------------------------------------------------- /static/img/background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/background.png -------------------------------------------------------------------------------- /static/img/container_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/container_bg.png -------------------------------------------------------------------------------- /static/img/footer_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/footer_bg.png 
-------------------------------------------------------------------------------- /static/img/nav_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/nav_bg.png -------------------------------------------------------------------------------- /static/img/nav_bgbottom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/nav_bgbottom.png -------------------------------------------------------------------------------- /static/img/navmenu_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/navmenu_bg.png -------------------------------------------------------------------------------- /static/img/top_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/top_bg.png -------------------------------------------------------------------------------- /static/img/toptop_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petdance/htmlparsing/d7fe4b28d4769cfde044d0499e3b7a505635a055/static/img/toptop_bg.png -------------------------------------------------------------------------------- /static/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * # directed to all spiders, not just Scooter 2 | -------------------------------------------------------------------------------- /t/html.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -T 2 | 3 | 
use warnings; 4 | use strict; 5 | 6 | use Test::More; 7 | use Test::HTML::Tidy5; 8 | use File::Slurp; 9 | use Encode qw(decode_utf8); 10 | 11 | my @files = glob( 'build/*.html' ); 12 | plan( tests => scalar @files ); 13 | 14 | for my $filename ( @files ) { 15 | my $text = decode_utf8(read_file( $filename )); 16 | 17 | html_tidy_ok( $text, $filename ); 18 | } 19 | -------------------------------------------------------------------------------- /tt/header.tt: -------------------------------------------------------------------------------- 1 |
2 |   3 |
4 |
24 | -------------------------------------------------------------------------------- /tt/page.ttml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | htmlparsing.com: Your guide to parsing HTML 5 | 6 | 7 | 8 | 15 | 16 | 17 | 18 |
19 | [% PROCESS header.tt %] 20 |
21 | [% PROCESS sidebar.tt %] 22 |
23 | [% body %] 24 |
25 |
26 | 30 | Fork me on GitHub 34 |
35 |
36 | 37 | 38 | -------------------------------------------------------------------------------- /tt/sidebar.tt: -------------------------------------------------------------------------------- 1 |
2 | 13 |
14 | --------------------------------------------------------------------------------