├── .gitignore
├── CHANGES
├── LICENSE
├── README.md
├── blogs
    ├── htmlcleaner
    │   └── htmlcleaner.md
    ├── images
    │   ├── compiler.pages
    │   ├── compiler.png
    │   ├── hacker.png
    │   ├── html compiler.png
    │   ├── select uml.png
    │   ├── streetfighter.jpg
    │   ├── uml.zargo
    │   ├── uml.zargo~
    │   └── 类图.png
    ├── jsoup1.md
    ├── jsoup2.md
    ├── jsoup3.md
    ├── jsoup4.md
    ├── jsoup5.md
    ├── jsoup6.md
    ├── jsoup7.md
    └── jsoup8.md
├── pom.xml
└── src
    ├── main
        ├── java
        │   ├── org
        │   │   └── jsoup
        │   │   │   ├── Connection.java
        │   │   │   ├── HttpStatusException.java
        │   │   │   ├── Jsoup.java
        │   │   │   ├── UnsupportedMimeTypeException.java
        │   │   │   ├── examples
        │   │   │       ├── HtmlToPlainText.java
        │   │   │       ├── ListLinks.java
        │   │   │       └── package-info.java
        │   │   │   ├── helper
        │   │   │       ├── DataUtil.java
        │   │   │       ├── DescendableLinkedList.java
        │   │   │       ├── HttpConnection.java
        │   │   │       ├── StringUtil.java
        │   │   │       └── Validate.java
        │   │   │   ├── nodes
        │   │   │       ├── Attribute.java
        │   │   │       ├── Attributes.java
        │   │   │       ├── Comment.java
        │   │   │       ├── DataNode.java
        │   │   │       ├── Document.java
        │   │   │       ├── DocumentType.java
        │   │   │       ├── Element.java
        │   │   │       ├── Entities.java
        │   │   │       ├── FormElement.java
        │   │   │       ├── Node.java
        │   │   │       ├── TextNode.java
        │   │   │       ├── XmlDeclaration.java
        │   │   │       ├── entities-base.properties
        │   │   │       ├── entities-full.properties
        │   │   │       └── package-info.java
        │   │   │   ├── package-info.java
        │   │   │   ├── parser
        │   │   │       ├── CharacterReader.java
        │   │   │       ├── HtmlTreeBuilder.java
        │   │   │       ├── HtmlTreeBuilderState.java
        │   │   │       ├── ITokeniserState.java
        │   │   │       ├── MiniSoupTokeniserState.java
        │   │   │       ├── ParseError.java
        │   │   │       ├── ParseErrorList.java
        │   │   │       ├── Parser.java
        │   │   │       ├── Tag.java
        │   │   │       ├── Token.java
        │   │   │       ├── TokenQueue.java
        │   │   │       ├── Tokeniser.java
        │   │   │       ├── TokeniserState.java
        │   │   │       ├── TreeBuilder.java
        │   │   │       ├── XmlTreeBuilder.java
        │   │   │       └── package-info.java
        │   │   │   ├── safety
        │   │   │       ├── Cleaner.java
        │   │   │       ├── Whitelist.java
        │   │   │       └── package-info.java
        │   │   │   └── select
        │   │   │       ├── Collector.java
        │   │   │       ├── CombiningEvaluator.java
        │   │   │       ├── Elements.java
        │   │   │       ├── Evaluator.java
        │   │   │       ├── NodeTraversor.java
        │   │   │       ├── NodeVisitor.java
        │   │   │       ├── QueryParser.java
        │   │   │       ├── Selector.java
        │   │   │       ├── StructuralEvaluator.java
        │   │   │       └── package-info.java
        │   └── us
        │   │   └── codecraft
        │   │       └── learning
        │   │           ├── automata
        │   │               ├── ABStateMachine.java
        │   │               ├── StateModelABStateMachine.java
        │   │               ├── StringReader.java
        │   │               └── SwitchABStateMachine.java
        │   │           ├── parser
        │   │               ├── PageErrorChecker.java
        │   │               └── ParserCorrectorTest.java
        │   │           └── select
        │   │               └── SelectorTest.java
        └── javadoc
        │   └── overview.html
    └── test
        ├── java
            └── org
            │   └── jsoup
            │       ├── TextUtil.java
            │       ├── helper
            │           ├── DataUtilTest.java
            │           ├── HttpConnectionTest.java
            │           └── StringUtilTest.java
            │       ├── integration
            │           ├── Benchmark.java
            │           ├── ParseTest.java
            │           └── UrlConnectTest.java
            │       ├── nodes
            │           ├── AttributeTest.java
            │           ├── AttributesTest.java
            │           ├── DocumentTest.java
            │           ├── DocumentTypeTest.java
            │           ├── ElementTest.java
            │           ├── EntitiesTest.java
            │           ├── FormElementTest.java
            │           ├── NodeTest.java
            │           └── TextNodeTest.java
            │       ├── parser
            │           ├── AttributeParseTest.java
            │           ├── CharacterReaderTest.java
            │           ├── HtmlParserTest.java
            │           ├── TagTest.java
            │           ├── TokenQueueTest.java
            │           └── XmlTreeBuilderTest.java
            │       ├── safety
            │           └── CleanerTest.java
            │       └── select
            │           ├── CssTest.java
            │           ├── ElementsTest.java
            │           ├── QueryParserTest.java
            │           └── SelectorTest.java
        └── resources
            └── htmltests
                ├── README
                ├── baidu-cn-home.html
                ├── baidu-variant.html
                ├── google-ipod.html
                ├── meta-charset-1.html
                ├── meta-charset-2.html
                ├── meta-charset-3.html
                ├── news-com-au-home.html
                ├── nyt-article-1.html
                ├── smh-biz-article-1.html
                ├── thumb.jpg
                ├── xml-test.xml
                ├── yahoo-article-1.html
                └── yahoo-jp.html


/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea/
 2 | jsoup.iml
 3 | jsoup.ipr
 4 | jsoup.iws
 5 | target/
 6 | .classpath
 7 | .project
 8 | .settings/
 9 | *Thrash*
10 | 
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License
 2 | 
 3 | Copyright (c) 2009, 2010, 2011, 2012, 2013 Jonathan Hedley <jonathan@hedley.net>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Jsoup学习笔记 
 2 | ------
 3 | **Jsoup**是Java世界的一款HTML解析工具，它支持用CSS Selector方式选择DOM元素，也可过滤HTML文本，防止XSS攻击。
 4 | 
 5 | 学习Jsoup是为了更好地开发我的另一个爬虫框架[webmagic](https://github.com/code4craft/webmagic)，为了学得比较详细，就强制自己用很规范的方式写出这部分文章。
 6 | 
 7 | 代码部分来自[https://github.com/jhy/jsoup](https://github.com/jhy/jsoup)，添加了一些中文注释以及示例代码。
 8 | 
 9 | ---------------
10 | 
11 | ## 提纲
12 | 
13 | 1. [概述](https://github.com/code4craft/jsoup-learning/blob/master/blogs/jsoup1.md)
14 | 
15 | 2. [DOM相关对象](https://github.com/code4craft/jsoup-learning/blob/master/blogs/jsoup2.md)
16 | 
17 | 3. [Document的输出](https://github.com/code4craft/jsoup-learning/blob/master/blogs/jsoup3.md)
18 | 
19 | 4. HTML语法分析parser
20 | 
21 | 	1. [语法分析与状态机基础](https://github.com/code4craft/jsoup-learning/blob/master/blogs/jsoup4.md)
22 | 	2. [词法分析Tokenizer](https://github.com/code4craft/jsoup-learning/blob/master/blogs/jsoup5.md)
23 | 	3. [语法检查及DOM树构建](https://github.com/code4craft/jsoup-learning/blob/master/blogs/jsoup6.md)
24 | 
25 | 5. [CSS Selector](https://github.com/code4craft/jsoup-learning/blob/master/blogs/jsoup7.md)
26 | 
27 | 6. [防御XSS攻击](https://github.com/code4craft/jsoup-learning/blob/master/blogs/jsoup8.md)
28 | 
29 | 7. [为Jsoup增加XPath选择功能](https://github.com/code4craft/xsoup)
30 | 	
31 | 	Jsoup默认没有XPath功能，我写了一个项目[Xsoup](https://github.com/code4craft/xsoup)，可以使用XPath来选择HTML文本。Java里较常用的XPath抽取器是HtmlCleaner，Xsoup的性能比它快了一倍。
32 | 
33 | -------
34 | 
35 | ## 协议：
36 | 
37 | 相关代码遵循MIT协议。
38 | 
39 | 文档遵循CC BY-NC协议。
40 | 
41 | [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/code4craft/jsoup-learning/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
42 | 
43 | 


--------------------------------------------------------------------------------
/blogs/htmlcleaner/htmlcleaner.md:
--------------------------------------------------------------------------------
 1 | htmlcleaner代码学习
 2 | ---
 3 | 相比Jsoup，htmlcleaner支持XPath进行抽取，也是挺有用的。
 4 | 
 5 | htmlcleaner托管在sourceforge下[http://htmlcleaner.sourceforge.net/](http://htmlcleaner.sourceforge.net/)
 6 | ，由于某种原因，访问sourceforge不是那么顺畅，最后选了这个比较新的github上的fork:[https://github.com/amplafi/htmlcleaner](https://github.com/amplafi/htmlcleaner)。
 7 | 
 8 | htmlcleaner的包结构与Jsoup还是有些差距，一开始就被一字排开的类给吓到了。
 9 | 
10 | htmlcleaner仍然有一套自己的树结构，继承自:`HtmlNode`。但是它提供了到`org.w3c.dom.Document`和`org.jdom2.Document`的转换。
11 | 
12 | `HtmlTokenizer`是词法分析部分，有状态但是没用状态机，而是用了一些基本类型来保存状态，例如：
13 | 
14 |     public class HtmlTokenizer {
15 | 
16 |         private BufferedReader _reader;
17 |         private char[] _working = new char[WORKING_BUFFER_SIZE];
18 | 
19 |         private transient int _pos;
20 |         private transient int _len = -1;
21 |         private transient int _row = 1;
22 |         private transient int _col = 1;
23 |         
24 | 
25 |         private transient StringBuffer _saved = new StringBuffer(512);
26 | 
27 |         private transient boolean _isLateForDoctype;
28 |         private transient DoctypeToken _docType;
29 |         private transient TagToken _currentTagToken;
30 |         private transient List<BaseToken> _tokenList = new ArrayList<BaseToken>();
31 |         private transient Set<String> _namespacePrefixes = new HashSet<String>();
32 | 
33 |         private boolean _asExpected = true;
34 | 
35 |         private boolean _isScriptContext;
36 |     }
37 | 
38 | 浓烈的面向过程编程的味道。
39 | 
40 | `Tokenize`之后就是简单的用栈将树组合起来。
41 | 
42 | 测试了一下，一个44k的文档，用Jsoup做parse是3.5ms，而htmlcleaner是7.9ms，差距在一倍左右。
43 | 
44 | XPath部分也是云里雾里。


--------------------------------------------------------------------------------
/blogs/images/compiler.pages:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/blogs/images/compiler.pages


--------------------------------------------------------------------------------
/blogs/images/compiler.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/blogs/images/compiler.png


--------------------------------------------------------------------------------
/blogs/images/hacker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/blogs/images/hacker.png


--------------------------------------------------------------------------------
/blogs/images/html compiler.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/blogs/images/html compiler.png


--------------------------------------------------------------------------------
/blogs/images/select uml.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/blogs/images/select uml.png


--------------------------------------------------------------------------------
/blogs/images/streetfighter.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/blogs/images/streetfighter.jpg


--------------------------------------------------------------------------------
/blogs/images/uml.zargo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/blogs/images/uml.zargo


--------------------------------------------------------------------------------
/blogs/images/uml.zargo~:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/blogs/images/uml.zargo~


--------------------------------------------------------------------------------
/blogs/images/类图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/blogs/images/类图.png


--------------------------------------------------------------------------------
/blogs/jsoup1.md:
--------------------------------------------------------------------------------
 1 | Jsoup代码解读之一-概述
 2 | ------
 3 | >今天看到一个用python写的抽取正文的东东，美滋滋的用Java实现了一番，放到了webmagic里，然后发现Jsoup里已经有了…觉得自己各种不靠谱啊！算了，静下心来学学好东西吧！
 4 | 
 5 | Jsoup是Java世界用作html解析和过滤的不二之选。支持将html解析为DOM树、支持CSS Selector形式选择、支持html过滤，本身还附带了一个Http下载器。
 6 | 
 7 | ## 概述
 8 | 
 9 | Jsoup的代码相当简洁，Jsoup总共53个类，且没有任何第三方包的依赖，对比最终发行包9.8M的SAXON，实在算得上是短小精悍了。
10 | 
11 | ```shell
12 |     jsoup
13 |     ├── examples #样例，包括一个将html转为纯文本和一个抽取所有链接地址的例子。    
14 |     ├── helper #一些工具类，包括读取数据、处理连接以及字符串转换的工具
15 |     ├── nodes #DOM节点定义
16 |     ├── parser #解析html并转换为DOM树
17 |     ├── safety #安全相关，包括白名单及html过滤
18 |     └── select #选择器，支持CSS Selector以及NodeVisitor格式的遍历
19 | ```
20 |     
21 | ## 使用
22 | 
23 | Jsoup的入口是`Jsoup`类。examples包里提供了两个例子，解析html后，分别用CSS Selector以及NodeVisitor来操作Dom元素。
24 | 
25 | 这里用`ListLinks`里的例子来说明如何调用Jsoup：
26 | 
27 | ```java
28 |     public static void main(String[] args) throws IOException {
29 |         Validate.isTrue(args.length == 1, "usage: supply url to fetch");
30 |         String url = args[0];
31 |         print("Fetching %s...", url);
32 | 
33 |         // 下载url并解析成html DOM结构
34 |         Document doc = Jsoup.connect(url).get();
35 |         // 使用select方法选择元素，参数是CSS Selector表达式
36 |         Elements links = doc.select("a[href]");
37 | 
38 |         print("\nLinks: (%d)", links.size());
39 |         for (Element link : links) {
40 |             //使用abs:前缀取绝对url地址
41 |             print(" * a: <%s>  (%s)", link.attr("abs:href"), trim(link.text(), 35));
42 |         }
43 |     }
44 | ```
45 |     
46 | Jsoup使用了自己的一套DOM代码体系，这里的Elements、Element等虽然名字和概念都与Java XML API`org.w3c.dom`类似，但并没有代码层面的关系。就是说你想用XML的一套API来操作Jsoup的结果是办不到的，但是正因为如此，才使得Jsoup可以抛弃xml里一些繁琐的API，使得代码更加简单。
47 |      
48 | 还有一种方式是通过`NodeVisitor`来遍历DOM树，这个在对整个html做分析和替换时比较有用：
49 | 
50 | ```java
51 |     public interface NodeVisitor {
52 | 
53 |         //遍历到节点开始时，调用此方法
54 |         public void head(Node node, int depth);
55 | 
56 |         //遍历到节点结束时(所有子节点都已遍历完)，调用此方法
57 |         public void tail(Node node, int depth);
58 |     }
59 | ```
60 |     
61 | `HtmlToPlainText`的例子说明了如何使用NodeVisitor来遍历DOM树，将html转化为纯文本，并将需要换行的标签替换为换行\\n：
62 | 
63 | ```java
64 |     public static void main(String... args) throws IOException {
65 |         Validate.isTrue(args.length == 1, "usage: supply url to fetch");
66 |         String url = args[0];
67 | 
68 |         // fetch the specified URL and parse to a HTML DOM
69 |         Document doc = Jsoup.connect(url).get();
70 | 
71 |         HtmlToPlainText formatter = new HtmlToPlainText();
72 |         String plainText = formatter.getPlainText(doc);
73 |         System.out.println(plainText);
74 |     }
75 | 
76 |     public String getPlainText(Element element) {
77 |         //自定义一个NodeVisitor - FormattingVisitor
78 |         FormattingVisitor formatter = new FormattingVisitor();
79 |         //使用NodeTraversor来装载FormattingVisitor
80 |         NodeTraversor traversor = new NodeTraversor(formatter);
81 |         //进行遍历
82 |         traversor.traverse(element);
83 |         return formatter.toString();
84 |     }
85 | ```
86 | 
87 | 下一节将从DOM结构开始对Jsoup代码进行分析。


--------------------------------------------------------------------------------
/blogs/jsoup2.md:
--------------------------------------------------------------------------------
 1 | Jsoup代码解读之二-DOM相关对象
 2 | -------
 3 | 之前在文章中说到，Jsoup使用了一套自己的DOM对象体系，和Java XML API互不兼容。这样做的好处是从XML的API里解脱出来，使得代码精炼了很多。这篇文章会说明Jsoup的DOM结构，DOM的遍历方式。在下一篇文章，我会结合这两个基础，分析一下Jsoup的HTML输出功能。
 4 | ## DOM结构相关类
 5 | 
 6 | 我们先来看看nodes包的类图：
 7 | 
 8 | ![node类图][1]
 9 | 
10 | 这里可以看到，核心无疑是`Node`类。
11 | 
12 | Node类是一个抽象类，它代表DOM树中的一个节点，它包含：
13 | 
14 | * 父节点`parentNode`以及子节点`childNodes`的引用
15 | * 属性值集合`attributes`
16 | * 页面的uri`baseUri`，用于修正相对地址为绝对地址
17 | * 在兄弟节点中的位置`siblingIndex`，用于进行DOM操作
18 | 
19 | Node里面包含一些获取属性、父子节点、修改元素的方法，其中比较有意思的是`absUrl()`。我们知道，在很多html页面里，链接会使用相对地址，我们有时会需要将其转变为绝对地址。Jsoup的解决方案是在attr()的参数开始加"abs:"，例如attr("abs:href")，而`absUrl()`就是其实现方式。我写的爬虫框架[webmagic](http://www.oschina.net/p/webmagic)里也用到了类似功能，当时是自己手写的，看到Jsoup的实现，才发现自己是白费劲了，代码如下：
20 | 
21 | ```java
22 |     URL base;
23 |     try {
24 |         try {
25 |             base = new URL(baseUri);
26 |         } catch (MalformedURLException e) {
27 |             // the base is unsuitable, but the attribute may be abs on its own, so try that
28 |             URL abs = new URL(relUrl);
29 |             return abs.toExternalForm();
30 |         }
31 |         // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
32 |         if (relUrl.startsWith("?"))
33 |             relUrl = base.getPath() + relUrl;
34 |         // java URL自带的相对路径解析    
35 |         URL abs = new URL(base, relUrl);
36 |         return abs.toExternalForm();
37 |     } catch (MalformedURLException e) {
38 |         return "";
39 |     }
40 | ```
41 | 
42 | Node还有一个比较值得一提的方法是`abstract String nodeName()`，这个相当于定义了节点的类型名(例如`Document`是'#document'，`Element`则是对应的TagName)。
43 | 
44 | Element也是一个重要的类，它代表的是一个HTML元素。它包含一个字段`tag`和`classNames`。classNames是"class"属性解析出来的集合，因为CSS规范里，"class"属性允许设置多个，并用空格隔开，而在用Selector选择的时候，即使只指定其中一个，也能够选中其中的元素。所以这里就把"class"属性展开了。Element还有选取元素的入口，例如`select`、`getElementByXXX`，这些都用到了select包中的内容，这个留到下篇文章select再说。
45 | 
46 | Document是代表整个文档，它也是一个特殊的Element，即根节点。Document除了Element的内容，还包括一些输出的方法。
47 | 
48 | Document还有一个属性`quirksMode`，大致意思是定义处理非标准HTML的几个级别，这个留到以后分析parser的时候再说。
49 | 
50 | ## DOM树的遍历
51 | 
52 | Node还有一些方法，例如`outerHtml()`，用作节点及文档HTML的输出，用到了树的遍历。在DOM树的遍历上，用到了`NodeVisitor`和`NodeTraversor`来对树进行遍历。`NodeVisitor`在上一篇文章提到过了，head()和tail()分别是遍历开始和结束时的方法，而`NodeTraversor`的核心代码如下：
53 | 
54 | ```java
55 |     public void traverse(Node root) {
56 |         Node node = root;
57 |         int depth = 0;
58 | 
59 |         //这里对树进行深度优先遍历：进入节点时调用head()(前序)，离开节点时调用tail()(后序)
60 |         while (node != null) {
61 |             //开始遍历node
62 |             visitor.head(node, depth);
63 |             if (node.childNodeSize() > 0) {
64 |                 node = node.childNode(0);
65 |                 depth++;
66 |             } else {
67 |                 //没有下一个兄弟节点，退栈
68 |                 while (node.nextSibling() == null && depth > 0) {
69 |                     visitor.tail(node, depth);
70 |                     node = node.parent();
71 |                     depth--;
72 |                 }
73 |                 //结束遍历
74 |                 visitor.tail(node, depth);
75 |                 if (node == root)
76 |                     break;
77 |                 node = node.nextSibling();
78 |             }
79 |         }
80 |     }
81 | ```
82 | 
83 | 这里使用循环+回溯来替换掉了我们常用的递归方式，从而避免了栈溢出的风险。
84 | 
85 | 实际上，Jsoup的Selector机制也是基于`NodeVisitor`来实现的，可以说`NodeVisitor`是更加底层和灵活的API。
86 | 
87 | 在下一篇博客我会讲讲Document的输出。
88 | 
89 | 
90 | 
91 |   [1]: http://static.oschina.net/uploads/space/2013/0825/221021_wQvT_190591.png


--------------------------------------------------------------------------------
/blogs/jsoup3.md:
--------------------------------------------------------------------------------
  1 | Jsoup代码解读之三-Document的输出
  2 | -------
  3 | 
  4 | Jsoup官方说明里，一个重要的功能就是***output tidy HTML***。这里我们看看Jsoup是如何输出HTML的。
  5 | 
  6 | ## HTML相关知识
  7 | 
  8 | 分析代码前，我们不妨先想想，"tidy HTML"到底包括哪些东西：
  9 | 
 10 | * 换行，块级标签习惯上都会独占一行
 11 | * 缩进，根据HTML标签嵌套层数，行首缩进会不同
 12 | * 严格的标签闭合，如果是可以自闭合的标签并且没有内容，则进行自闭合
 13 | * HTML实体的转义
 14 | 
 15 | 这里要补充一下HTML标签的知识。HTML Tag可以分为block和inline两类。关于Tag的inline和block的定义可以参考[http://www.w3schools.com/html/html_blocks.asp](http://www.w3schools.com/html/html_blocks.asp)，而Jsoup的`Tag`类则是对Java开发者非常好的学习资料。
 16 | 
 17 | ```java
 18 |     // internal static initialisers:
 19 |     // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other sources
 20 |     //block tags，需要换行
 21 |     private static final String[] blockTags = {
 22 |             "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame",
 23 |             "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6",
 24 |             "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
 25 |             "del", "s", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
 26 |             "td", "video", "audio", "canvas", "details", "menu", "plaintext"
 27 |     };
 28 |     //inline tags，无需换行
 29 |     private static final String[] inlineTags = {
 30 |             "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd",
 31 |             "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "a", "img", "br", "wbr", "map", "q",
 32 |             "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup",
 33 |             "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
 34 |             "summary", "command", "device"
 35 |     };
 36 |     //emptyTags是不能有内容的标签，这类标签都是可以自闭合的
 37 |     private static final String[] emptyTags = {
 38 |             "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",
 39 |             "device"
 40 |     };
 41 |     private static final String[] formatAsInlineTags = {
 42 |             "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style",
 43 |             "ins", "del", "s"
 44 |     };
 45 |     //在这些标签里，需要保留空格
 46 |     private static final String[] preserveWhitespaceTags = {
 47 |             "pre", "plaintext", "title", "textarea"
 48 |     };
 49 | ```
 50 | 
 51 | 另外，Jsoup的`Entities`类里包含了一些HTML实体转义的东西。这些转义的对应数据保存在`entities-full.properties`和`entities-base.properties`里。
 52 | 
 53 | ## Jsoup的格式化实现
 54 | 
 55 | 在Jsoup里，直接调用`Document.toString()`(继承自Element)，即可对文档进行输出。另外`OutputSettings`可以控制输出格式，主要是`prettyPrint`(是否重新格式化)、`outline`(是否强制所有标签换行)、`indentAmount`(缩进长度)等。
 56 | 
 57 | 里面的继承和互相调用关系略微复杂，大概是这样子：
 58 | 
 59 | `Document.toString()`=>`Document.outerHtml()`=>`Element.html()`，最终`Element.html()`又会循环调用所有子元素的`outerHtml()`，拼接起来作为输出。
 60 | 
 61 | ```java
 62 |     private void html(StringBuilder accum) {
 63 |         for (Node node : childNodes)
 64 |             node.outerHtml(accum);
 65 |     }
 66 | ```
 67 | 
 68 | 而`outerHtml()`会使用一个`OuterHtmlVisitor`对所有子节点做遍历，并拼装起来作为结果。
 69 | 
 70 | ```java
 71 | 	protected void outerHtml(StringBuilder accum) {
 72 |         new NodeTraversor(new OuterHtmlVisitor(accum, getOutputSettings())).traverse(this);
 73 |     }
 74 | ```
 75 | 
 76 | OuterHtmlVisitor会对所有子节点做遍历，并调用`node.outerHtmlHead()`和`node.outerHtmlTail`两个方法。
 77 |     
 78 | ```java
 79 |     private static class OuterHtmlVisitor implements NodeVisitor {
 80 |         private StringBuilder accum;
 81 |         private Document.OutputSettings out;
 82 | 
 83 |         public void head(Node node, int depth) {
 84 |             node.outerHtmlHead(accum, depth, out);
 85 |         }
 86 | 
 87 |         public void tail(Node node, int depth) {
 88 |             if (!node.nodeName().equals("#text")) // saves a void hit.
 89 |                 node.outerHtmlTail(accum, depth, out);
 90 |         }
 91 |     }
 92 | ```
 93 | 
 94 | 我们终于找到了真正工作的代码，`node.outerHtmlHead()`和`node.outerHtmlTail`。Jsoup里每种Node的输出方式都不太一样，这里只讲讲两种主要节点：`Element`和`TextNode`。`Element`是格式化的主要对象，它的两个方法代码如下：
 95 | 
 96 | ```java
 97 |     void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
 98 |         if (accum.length() > 0 && out.prettyPrint()
 99 |                 && (tag.formatAsBlock() || (parent() != null && parent().tag().formatAsBlock()) || out.outline()) )
100 |             //换行并调整缩进
101 |             indent(accum, depth, out);
102 |         accum
103 |                 .append("<")
104 |                 .append(tagName());
105 |         attributes.html(accum, out);
106 | 
107 |         if (childNodes.isEmpty() && tag.isSelfClosing())
108 |             accum.append(" />");
109 |         else
110 |             accum.append(">");
111 |     }
112 | 
113 |     void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {
114 |         if (!(childNodes.isEmpty() && tag.isSelfClosing())) {
115 |             if (out.prettyPrint() && (!childNodes.isEmpty() && (
116 |                     tag.formatAsBlock() || (out.outline() && (childNodes.size()>1 || (childNodes.size()==1 && !(childNodes.get(0) instanceof TextNode))))
117 |             )))
118 |                 //换行并调整缩进
119 |                 indent(accum, depth, out);
120 |             accum.append("</").append(tagName()).append(">");
121 |         }
122 |     }
123 | ```
124 | 
125 | 而indent方法的代码只有一行：
126 | 
127 | ```java
128 |     protected void indent(StringBuilder accum, int depth, Document.OutputSettings out) {
129 |         //out.indentAmount()是缩进长度，默认是1
130 |         accum.append("\n").append(StringUtil.padding(depth * out.indentAmount()));
131 |     }
132 | ```
133 |     
134 | 代码简单明了，就没什么好说的了。值得一提的是，`StringUtil.padding()`方法为了减少字符串生成，把常用的缩进保存到了一个数组中。
135 | 
136 | 好了，水了一篇文章，下一篇将讲比较有技术含量的parser部分。
137 | 
138 | 另外，通过本节的学习，我们学到了要把StringBuilder命名为**accum**，而不是**sb**。


--------------------------------------------------------------------------------
/blogs/jsoup4.md:
--------------------------------------------------------------------------------
  1 | Jsoup代码解读之四-parser(上)
  2 | -------
  3 | 作为Java世界最好的HTML 解析库，Jsoup的parser实现非常具有代表性。这部分也是Jsoup最复杂的部分，需要一些数据结构、状态机乃至编译器的知识。好在HTML语法不复杂，解析只是到DOM树为止，所以作为编译器入门倒是挺合适的。这一块不要指望囫囵吞枣，我们还是泡一杯咖啡，细细品味其中的奥妙吧。
  4 | 
  5 | ## 基础知识
  6 | 
  7 | ### 编译器
  8 | 
  9 | 将计算机语言转化为另一种计算机语言(通常是更底层的语言，例如机器码、汇编、或者JVM字节码)的过程就叫做编译(compile)。编译器(Compiler)是计算机科学的一个重要领域，已经有很多年历史了，而最近各种通用语言层出不穷，加上跨语言编译的兴起、DSL概念的流行，都让编译器变成了一个很时髦的东西。
 10 | 
 11 | 编译器领域相关有三本公认的经典书籍，龙书[《Compilers: Principles, Techniques, and Tools 》](http://book.douban.com/subject/1866231/)，虎书[《Modern Compiler Implementation in X (X表示各种语言)》](http://book.douban.com/subject/1923484/)，鲸书[《Advanced Compiler Design and Implementation》](http://book.douban.com/subject/1821532/)。其中龙书是编译理论方面公认的不二之选，而后面两本则对实践更有指导意义。另外[@装配脑袋](http://www.cnblogs.com/Ninputer)有个很好的编译器入门系列博客：[http://www.cnblogs.com/Ninputer/archive/2011/06/07/2074632.html](http://www.cnblogs.com/Ninputer/archive/2011/06/07/2074632.html)
 12 | 
 13 | 编译器的基本流程如下：
 14 | 
 15 | ![compiler][1]
 16 | 
 17 | 其中词法分析、语法分析、语义分析这部分又叫编译器的前端(front-end)，而此后的中间代码生成直到目标生成、优化等属于编译器的后端(back-end)。编译器的前端技术已经很成熟了，也有lex/yacc这样的工具来自动进行词法、语法分析(Java里也有一个类似的工具ANTLR)，而后端技术更加复杂，也是目前编译器研究的重点。
 18 | 
 19 | 说了这么多，回到咱们的HTML上来。HTML是一种声明式的语言，可以理解它的最终的输出是浏览器里图形化的页面，而并非可执行的目标语言，因此我将这里的Translate改为了Render。
 20 | 
 21 | ![html compiler][2]
 22 | 
 23 | 在Jsoup(包括类似的HTML parser)里，只做了Lex(词法分析)、Parse(语法分析)两步，而HTML parse最终产出结果，就是DOM树。至于HTML的语义解析以及渲染，不妨看看携程UED团队的这篇文章：[《浏览器是怎样工作的：渲染引擎，HTML解析》](http://ued.ctrip.com/blog/?p=3295)。
 24 | 
 25 | ### 状态机
 26 | 
 27 | Jsoup的词法分析和语法分析都用到了状态机。状态机可以理解为一个特殊的程序模型，例如经常跟我们打交道的正则表达式就是用状态机实现的。
 28 | 
 29 | 它由状态(state)和转移(transition)两部分构成。根据状态转移的可能性，状态机又分为DFA(确定有限状态自动机)和NFA(非确定有限状态自动机)。这里拿一个最简单的正则表达式"a[b]*"作为例子，我们先把它映射到一个状态机DFA，大概是这样子：
 30 | 
 31 | ![state machine][3]
 32 | 
 33 | 状态机本身是一个编程模型，这里我们尝试用程序去实现它，那么最直接的方式大概是这样：
 34 | 
 35 | ```java
 36 |     public void process(StringReader reader) throws StringReader.EOFException {
 37 |         char ch;
 38 |         switch (state) {
 39 |             case Init:
 40 |                 ch = reader.read();
 41 |                 if (ch == 'a') {
 42 |                     state = State.AfterA;
 43 |                     accum.append(ch);
 44 |                 }
 45 |                 break;
 46 |             case AfterA:
 47 |                 ...
 48 |                 break;
 49 |             case AfterB:
 50 |                 ...
 51 |                 break;
 52 |             case Accept:
 53 |                 ...
 54 |                 break;
 55 |         }
 56 |     }
 57 | ```
 58 | 
 59 | 这样写简单的状态机倒没有问题，但是复杂情况下就有点难受了。还有一种标准的状态机解法，先建立状态转移表，然后使用这个表建立状态机。这个方法的问题就是，只能做纯状态转移，无法在代码级别操作输入输出。
 60 | 
 61 | Jsoup里则使用了状态模式来实现状态机，初次看到时，确实让人眼前一亮。状态模式是设计模式的一种，它将状态和对应的行为绑定在一起。而在状态机的实现过程中，使用它来实现状态转移时的处理再合适不过了。
 62 | 
 63 | "a[b]*"的例子的状态模式实现如下，这里采用了与Jsoup相同的方式，用到了枚举来实现状态模式：
 64 | 
 65 | ```java
 66 |     public class StateModelABStateMachine implements ABStateMachine {
 67 | 
 68 |         State state;
 69 | 
 70 |         StringBuilder accum;
 71 | 
 72 |         enum State {
 73 |             Init {
 74 |                 @Override
 75 |                 public void process(StateModelABStateMachine stateModelABStateMachine, StringReader reader) throws StringReader.EOFException {
 76 |                     char ch = reader.read();
 77 |                     if (ch == 'a') {
 78 |                         stateModelABStateMachine.state = AfterA;
 79 |                         stateModelABStateMachine.accum.append(ch);
 80 |                     }
 81 |                 }
 82 |             },
 83 |             Accept {
 84 |                 ...
 85 |             },
 86 |             AfterA {
 87 |                 ...
 88 |             },
 89 |             AfterB {
 90 |                 ...
 91 |             };
 92 | 
 93 |             public void process(StateModelABStateMachine stateModelABStateMachine, StringReader reader) throws StringReader.EOFException {
 94 |             }
 95 |         }
 96 | 
 97 |         public void process(StringReader reader) throws StringReader.EOFException {
 98 |             state.process(this, reader);
 99 |         }
100 |     }
101 | ```
102 | 
103 | 本文中提到的几种状态机的完整实现在这个仓库的[https://github.com/code4craft/jsoup-learning/tree/master/src/main/java/us/codecraft/learning/automata](https://github.com/code4craft/jsoup-learning/tree/master/src/main/java/us/codecraft/learning/automata)路径下。
104 | 
105 | 下一篇文章将从Jsoup的词法分析器开始来讲状态机的使用。
106 | 
107 | 
108 | 
109 |   [1]: http://static.oschina.net/uploads/space/2013/0828/081055_j2Xy_190591.png
110 |   [2]: http://static.oschina.net/uploads/space/2013/0828/103726_uejc_190591.png
111 |   [3]: http://static.oschina.net/uploads/space/2013/0828/131113_nyHh_190591.png


--------------------------------------------------------------------------------
/blogs/jsoup5.md:
--------------------------------------------------------------------------------
  1 | Jsoup代码解读之五-parser(中)
  2 | -------
  3 | 上一篇文章讲到了状态机和词法分析的基本知识，这一节我们来分析Jsoup是如何进行词法分析的。
  4 | 
  5 | ## 代码结构
  6 | 
  7 | 先介绍一下parser包里的主要类：
  8 | 
  9 | * `Parser`
 10 | 
 11 | 	Jsoup parser的入口facade，封装了常用的parse静态方法。可以设置`maxErrors`，用于收集错误记录，默认是0，即不收集。与之相关的类有`ParseError`,`ParseErrorList`。基于这个功能，我写了一个[`PageErrorChecker`](https://github.com/code4craft/jsoup-learning/tree/master/src/main/java/us/codecraft/learning/parser)来对页面做语法检查，并输出语法错误。
 12 | 
 13 | * `Token` 
 14 | 	
 15 | 	保存单个的词法分析结果。Token是一个抽象类，它的实现有`Doctype`,`StartTag`,`EndTag`,`Comment`,`Character`,`EOF`6种，对应6种词法类型。
 16 | 	
 17 | * `Tokeniser` 
 18 | 
 19 | 	保存词法分析过程的状态及结果。比较重要的两个字段是`state`和`emitPending`，前者保存状态，后者保存输出。其次还有`tagPending`/`doctypePending`/`commentPending`，保存还没有填充完整的Token。
 20 | 	
 21 | * `CharacterReader`
 22 | 
 23 | 	对读取字符的逻辑的封装，用于Tokenize时候的字符输入。CharacterReader包含了类似NIO里ByteBuffer的`consume()`、`unconsume()`、`mark()`、`rewindToMark()`，还有高级的`consumeTo()`这样的用法。
 24 | 	
 25 | * `TokeniserState`
 26 | 
 27 |  	用枚举实现的词法分析状态机。
 28 |  	
 29 | * `HtmlTreeBuilder`
 30 | 
 31 | 	语法分析，通过token构建DOM树的类。
 32 | 	
 33 | * `HtmlTreeBuilderState`
 34 | 
 35 | 	语法分析状态机。
 36 |  	
 37 | * `TokenQueue`
 38 | 
 39 | 	虽然披了个Token的马甲，其实是在query的时候用到，留到select部分再讲。
 40 | 
 41 | ## 词法分析状态机
 42 | 
 43 | 现在我们来讲讲HTML的词法分析过程。这里借用一下[http://ued.ctrip.com/blog/?p=3295](http://ued.ctrip.com/blog/?p=3295)里的图，图中描述了一个Tag标签的状态转移过程，
 44 | 
 45 | ![lexer][1]
 46 | 
 47 | 这里忽略了HTML注释、实体以及属性，只保留基本的开始/结束标签，例如下面的HTML:
 48 | 
 49 | 	<div>test</div>
 50 | 
 51 | Jsoup里词法分析比较复杂，我从里面抽取出了对应的部分，就成了我们的miniSoupLexer(这里省略了部分代码，完整代码可以看这里[`MiniSoupTokeniserState`](https://github.com/code4craft/jsoup-learning/blob/master/src/main/java/org/jsoup/parser/MiniSoupTokeniserState.java))：
 52 | 
 53 | ```java
 54 | 	enum MiniSoupTokeniserState implements ITokeniserState {
 55 | 	    /**
 56 | 	     * 什么层级都没有的状态
 57 | 	     * ⬇
 58 | 	     * <div>test</div>
 59 | 	     *      ⬇
 60 | 	     * <div>test</div>
 61 | 	     */
 62 | 	    Data {
 63 | 	        // in data state, gather characters until a character reference or tag is found
 64 | 	        public void read(Tokeniser t, CharacterReader r) {
 65 | 	            switch (r.current()) {
 66 | 	                case '<':
 67 | 	                    t.advanceTransition(TagOpen);
 68 | 	                    break;
 69 | 	                case eof:
 70 | 	                    t.emit(new Token.EOF());
 71 | 	                    break;
 72 | 	                default:
 73 | 	                    String data = r.consumeToAny('&', '<', nullChar);
 74 | 	                    t.emit(data);
 75 | 	                    break;
 76 | 	            }
 77 | 	        }
 78 | 	    },
 79 | 	    /**
 80 | 	     * ⬇
 81 | 	     * <div>test</div>
 82 | 	     */
 83 | 	    TagOpen {
 84 | 	        ...
 85 | 	    },
 86 | 	    /**
 87 | 	     *           ⬇
 88 | 	     * <div>test</div>
 89 | 	     */
 90 | 	    EndTagOpen {
 91 | 	        ...
 92 | 	    },
 93 | 	    /**
 94 | 	     *  ⬇
 95 | 	     * <div>test</div>
 96 | 	     */
 97 | 	    TagName {
 98 | 	        ...
 99 | 	    };
100 | 
101 | 	}
102 | ```
103 | 	
104 | 参考这个程序，可以看到Jsoup的词法分析的大致思路。分析器本身的编写是比较繁琐的过程，涉及属性值(区分单双引号)、DocType、注释、HTML实体，以及一些错误情况。不过了解了其思路，代码实现也是按部就班的过程。
105 | 
106 | 下一节开始介绍语法分析部分。
107 | 
108 |   [1]: http://taligarsiel.com/Projects/image019.png


--------------------------------------------------------------------------------
/blogs/jsoup6.md:
--------------------------------------------------------------------------------
  1 | Jsoup代码解读之六-parser(下)
  2 | --------
  3 | 最近生活上有点忙，女儿老是半夜不睡，精神状态也不是很好。工作上的事情也谈不上顺心，有很多想法但是没有几个被认可，有些事情也不是说代码写得好就行的。算了，还是端正态度，毕竟资历尚浅，我还是继续我的。
  4 | 
  5 | 读Jsoup源码并非无聊，目的其实是为了将webmagic做的更好一点，毕竟parser也是爬虫的重要组成部分之一。读了代码后，收获也不少，对HTML的知识也更进一步了。
  6 | 
  7 | ## DOM树产生过程
  8 | 
  9 | 这里单独将`TreeBuilder`部分抽出来叫做语法分析过程可能稍微不妥，其实就是根据Token生成DOM树的过程，不过我还是沿用这个编译器里的称呼了。
 10 | 
 11 | `TreeBuilder`同样是一个facade对象，真正进行语法解析的是以下一段代码：
 12 | 	
 13 | ```java
 14 |     protected void runParser() {
 15 |         while (true) {
 16 |             Token token = tokeniser.read();
 17 |             
 18 |             process(token);
 19 | 
 20 |             if (token.type == Token.TokenType.EOF)
 21 |                 break;
 22 |         }
 23 |     }
 24 | ```
 25 | 
 26 | `TreeBuilder`有两个子类，`HtmlTreeBuilder`和`XmlTreeBuilder`。`XmlTreeBuilder`自然是构建XML树的类，实现颇为简单，基本上是维护一个栈，并根据不同Token插入节点即可：
 27 | 
 28 | ```java
 29 | 	@Override
 30 |     protected boolean process(Token token) {
 31 |         // start tag, end tag, doctype, comment, character, eof
 32 |         switch (token.type) {
 33 |             case StartTag:
 34 |                 insert(token.asStartTag());
 35 |                 break;
 36 |             case EndTag:
 37 |                 popStackToClose(token.asEndTag());
 38 |                 break;
 39 |             case Comment:
 40 |                 insert(token.asComment());
 41 |                 break;
 42 |             case Character:
 43 |                 insert(token.asCharacter());
 44 |                 break;
 45 |             case Doctype:
 46 |                 insert(token.asDoctype());
 47 |                 break;
 48 |             case EOF: // could put some normalisation here if desired
 49 |                 break;
 50 |             default:
 51 |                 Validate.fail("Unexpected token type: " + token.type);
 52 |         }
 53 |         return true;
 54 |     }
 55 | ```
 56 |     
 57 | `insert`的代码大致是这个样子(为了便于展示，对方法进行了一些整合)：
 58 | 
 59 | ```java
 60 |     Element insert(Token.StartTag startTag) {
 61 |         Tag tag = Tag.valueOf(startTag.name());
 62 |         Element el = new Element(tag, baseUri, startTag.attributes);
 63 |         stack.getLast().appendChild(el);
 64 |         if (startTag.isSelfClosing()) {
 65 |             tokeniser.acknowledgeSelfClosingFlag();
 66 |             if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output. see above.
 67 |                 tag.setSelfClosing();
 68 |         } else {
 69 |             stack.add(el);
 70 |         }
 71 |         return el;
 72 |     }
 73 | ```
 74 | 
 75 | ## HTML解析状态机
 76 | 
 77 | 相比`XmlTreeBuilder`，`HtmlTreeBuilder`则实现较为复杂，除了类似的栈结构以外，还用到了`HtmlTreeBuilderState`来构建了一个状态机来分析HTML。这是为什么呢？不妨看看`HtmlTreeBuilderState`到底用到了哪些状态吧（在代码中用`<!-- State: -->`标明状态）：
 78 | 
 79 | ```html
 80 |     <!-- State: Initial -->
 81 |     <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 82 |     <!-- State: BeforeHtml -->
 83 |     <html lang='zh-CN' xml:lang='zh-CN' xmlns='http://www.w3.org/1999/xhtml'>
 84 |     <!-- State: BeforeHead -->
 85 |     <head>
 86 |       <!-- State: InHead -->
 87 |       <script type="text/javascript">
 88 |       //<!-- State: Text -->
 89 |         function xx(){
 90 |         }
 91 |       </script>
 92 |       <noscript>
 93 |         <!-- State: InHeadNoscript -->
 94 |         Your browser does not support JavaScript!
 95 |       </noscript>
 96 |     </head>
 97 |     <!-- State: AfterHead -->
 98 |     <body>
 99 |     <!-- State: InBody -->
100 |     <textarea>
101 |         <!-- State: Text -->
102 |         xxx
103 |     </textarea>
104 |     <table>
105 |         <!-- State: InTable -->
106 |         <!-- State: InTableText -->
107 |         xxx
108 |         <tbody>
109 |         <!-- State: InTableBody -->
110 |         </tbody>
111 |         <tr>
112 |             <!-- State: InRow -->
113 |             <td>
114 |                 <!-- State: InCell -->
115 |             </td>
116 |         </tr>    
117 |     </table>
118 |     </html>
119 | ```
120 | 
121 | 这里可以看到，HTML标签是有嵌套要求的，例如`<tr>`,`<td>`需要组合`<table>`来使用。根据Jsoup的代码，可以发现，`HtmlTreeBuilderState`做了以下一些事情：
122 | 
123 | * ### 语法检查
124 | 	
125 | 	例如`tr`没有嵌套在`table`标签内，则是一个语法错误。当`InBody`状态直接出现以下tag时，则出错。Jsoup里遇到这种错误，会放弃这个Token的解析并记录错误，然后继续解析下面内容，并不会直接退出。
126 | 	
127 | ```java
128 | 	    InBody {
129 | 	        boolean process(Token t, HtmlTreeBuilder tb) {
130 | 				if (StringUtil.in(name,
131 | 				"caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr")) {
132 | 				tb.error(this);
133 | 				return false;
134 | 				}
135 | 	        }
136 | ```
137 | 	
138 | * ### 标签补全
139 | 
140 | 	例如`head`标签没有闭合，就写入了一些只有body内才允许出现的标签，则自动闭合`</head>`。`HtmlTreeBuilderState`有的方法`anythingElse()`就提供了自动补全标签，例如`InHead`状态的自动闭合代码如下：
141 | 	
142 | ```java
143 | 	        private boolean anythingElse(Token t, TreeBuilder tb) {
144 | 	            tb.process(new Token.EndTag("head"));
145 | 	            return tb.process(t);
146 | 	        }
147 | ```	
148 | 	
149 | 还有一种标签闭合方式，例如下面的代码：
150 | 	
151 | ```java
152 | 		private void closeCell(HtmlTreeBuilder tb) {
153 |             if (tb.inTableScope("td"))
154 |                 tb.process(new Token.EndTag("td"));
155 |             else
156 |                 tb.process(new Token.EndTag("th")); // only here if th or td in scope
157 |         }
158 | ```
159 | 
160 | ## 实例研究
161 | 
162 | ### 缺少标签时，会发生什么事？
163 | 
164 | 好了，看了这么多parser的源码，不妨回到我们的日常应用上来。我们知道，在页面里多写一个两个未闭合的标签是很正常的事，那么它们会被怎么解析呢？
165 | 
166 | 就拿`<div>`标签为例：
167 | 
168 | 1. 漏写了开始标签，只写了结束标签
169 | 
170 | 	```java
171 | 		case EndTag:
172 | 			if (StringUtil.in(name,"div","dl", "fieldset", "figcaption", "figure", "footer", "header", "pre", "section", "summary", "ul")) {                
173 | 				if (!tb.inScope(name)) {
174 | 				tb.error(this);
175 | 				return false;
176 | 				} 
177 | 			}	
178 | 	```
179 | 			
180 | 	恭喜你，这个`</div>`会被当做错误处理掉，于是你的页面就毫无疑问的乱掉了！当然，如果单纯多写了一个`</div>`，好像也不会有什么影响哦？(记得有人跟我讲过为了防止标签未闭合，而在页面底部多写了几个`</div>`的故事)
181 | 	
182 | 2. 写了开始标签，漏写了结束标签
183 | 
184 | 	这个情况分析起来更复杂一点。如果是无法在内部嵌套内容的标签，那么在遇到不可接受的标签时，会进行闭合。而`<div>`标签可以包括大多数标签，这种情况下，其作用域会持续到HTML结束。
185 | 	
186 | 好了，parser系列算是分析结束了，其间学到不少HTML及状态机内容，但是离实际使用比较远。下面开始select部分，这部分可能对日常使用更有意义一点。


--------------------------------------------------------------------------------
/blogs/jsoup7.md:
--------------------------------------------------------------------------------
 1 | Jsoup代码解读之七-实现一个CSS Selector
 2 | -----
 3 | 
 4 | ![street fighter][1]
 5 | 
 6 | 当当当！终于来到了Jsoup的特色：CSS Selector部分。selector也是我写的爬虫框架[webmagic](https://github.com/code4craft/webmagic)开发的一个重点。附上一张street fighter的图，希望以后webmagic也能挑战Jsoup!
 7 | 
 8 | ## select机制
 9 | 
10 | Jsoup的select包里，类结构如下：
11 | 
12 | ![uml][2]
13 | 
14 | 在最开始介绍Jsoup的时候，就已经说过`NodeVisitor`和`Selector`了。`Selector`是select部分的对外facade，而`NodeVisitor`则是遍历树的底层API，CSS Selector也是根据`NodeVisitor`实现的遍历。
15 | 
16 | Jsoup的select核心是`Evaluator`。Selector所传递的表达式，会经过`QueryParser`，最终编译成一个`Evaluator`。`Evaluator`是一个抽象类，它只有一个方法：
17 | 
18 | ```java
19 | 	public abstract boolean matches(Element root, Element element);
20 | ```
21 | 
22 | 注意这里传入了root，是为了某些情况下对树进行遍历时用的。
23 | 
24 | Evaluator的设计简洁明了，所有的Selector表达式单词都会编译到对应的Evaluator。例如`#xx`对应`Id`，`.xx`对应`Class`，`[]`对应`Attribute`。这里补充一下w3c的CSS Selector规范：[http://www.w3.org/TR/CSS2/selector.html](http://www.w3.org/TR/CSS2/selector.html)
25 | 
26 | 当然，只靠这几个还不够，Jsoup还定义了`CombiningEvaluator`(对Evaluator进行And/Or组合)，`StructuralEvaluator`(结合DOM树结构进行筛选)。
27 | 
28 | 这里我们可能最关心的是，“div ul li”这样的父子结构是如何实现的。这个的实现方式在`StructuralEvaluator.Parent`中，贴一下代码了：
29 | 
30 | ```java
31 |     static class Parent extends StructuralEvaluator {
32 |         public Parent(Evaluator evaluator) {
33 |             this.evaluator = evaluator;
34 |         }
35 | 
36 |         public boolean matches(Element root, Element element) {
37 |             if (root == element)
38 |                 return false;
39 | 
40 |             Element parent = element.parent();
41 |             while (parent != root) {
42 |                 if (evaluator.matches(root, parent))
43 |                     return true;
44 |                 parent = parent.parent();
45 |             }
46 |             return false;
47 |         }
48 |     }
49 | ```    
50 | 
51 | 这里Parent包含了一个`evaluator`属性，会根据这个evaluator去验证所有父节点。注意Parent是可以嵌套的，所以这个表达式"div ul li"最终会编译成`And(Parent(And(Parent(Tag("div")),Tag("ul"))),Tag("li"))`这样的Evaluator组合。
52 | 
53 | select部分比想象的要简单，代码可读性也很高。经过了parser部分的研究，这部分应该算是驾轻就熟了。
54 | 
55 | ## 关于webmagic的后续打算
56 | 
57 | webmagic是一个爬虫框架，它的Selector是用于抓取HTML中指定的文本，其机制和Jsoup的Evaluator非常像，只不过webmagic暂时是将Selector封装成较简单的API，而Evaluator直接上了表达式。之前也考虑过自己定制DSL来写一个HTML，现在看了Jsoup的源码，实现能力算是有了，但是引入DSL，实现只是一小部分，如何让DSL易写易懂才是难点。
58 | 
59 | 其实看了Jsoup的源码，精细程度上比webmagic要好得多了，基本每个类都对应一个真实的概念抽象，可能以后会在这方面下点工夫。
60 | 
61 | 下篇文章将讲最后一部分：白名单及HTML过滤机制。
62 | 
63 | [1]: http://static.oschina.net/uploads/space/2013/0830/180244_r1Vb_190591.jpg
64 | 
65 | [2]: http://static.oschina.net/uploads/space/2013/0830/184337_j85b_190591.png


--------------------------------------------------------------------------------
/blogs/jsoup8.md:
--------------------------------------------------------------------------------
  1 | Jsoup代码解读之八-防御XSS攻击
  2 | --------
  3 | ![hacker][1]
  4 | 
  5 | ## 防御XSS攻击的一般原理
  6 | 
  7 | cleaner是Jsoup的重要功能之一，我们常用它来进行富文本输入中的XSS防御。
  8 | 
  9 | 我们知道，XSS攻击的一般方式是，通过在页面输入中嵌入一段恶意脚本，对输出时的DOM结构进行修改，从而达到执行这段脚本的目的。对于纯文本输入，过滤/转义HTML特殊字符`<`,`>`,`"`,`'`是行之有效的办法，但是如果本身用户输入的就是一段HTML文本(例如博客文章)，这种方式就不太有效了。这个时候，就是Jsoup大显身手的时候了。
 10 | 
 11 | 在前面，我们已经知道了，Jsoup里怎么将HTML变成一棵DOM树，怎么对DOM树进行遍历，怎么对DOM文档进行输出，那么其实cleaner的实现方式，也能猜出大概了。使用Jsoup进行XSS防御，大致分为三个步骤:
 12 | 
 13 | 1. 将HTML解析为DOM树
 14 | 
 15 | 	这一步可以过滤掉一些企图搞破坏的非闭合标签、非正常语法等。例如一些输入，会尝试用`</textarea>`闭合当前Tag，然后写入攻击脚本。而根据前面对Jsoup的parser的分析，这种时候，这些非闭合标签会被当做错误并丢弃。
 16 | 
 17 | 2. 过滤高风险标签/属性/属性值
 18 | 
 19 | 	高风险标签是指`<script>`以及类似标签，对属性/属性值进行过滤是因为某些属性值里也可以写入javascript脚本，例如`onclick='alert("xss!")'`。
 20 | 
 21 | 
 22 | 3. 重新将DOM树输出为HTML文本
 23 | 
 24 | 	DOM树的输出，在前面(Jsoup代码解读之三)已经提到过了。
 25 | 
 26 | ## Cleaner与Whitelist
 27 | 
 28 | 对于上述的三个步骤，1、3都已经分别在parser和输出中完成，现在只剩下步骤 2：过滤高风险标签等。
 29 | 
 30 | Jsoup给出的答案是白名单。下面是`Whitelist`的部分代码。
 31 | 
 32 | ```java
 33 | public class Whitelist {
 34 |     private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]
 35 |     private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.
 36 |     private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values
 37 |     private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes
 38 |     private boolean preserveRelativeLinks; // option to preserve relative links
 39 | }
 40 | ```    
 41 | 
 42 | 这里定义了标签名/属性名/属性值的白名单。
 43 | 
 44 | 而`Cleaner`是过滤的执行者。不出所料，Cleaner内部定义了`CleaningVisitor`来进行标签的过滤。CleaningVisitor的过滤过程并不改变原始DOM树的值，而是将符合条件的节点，加入到`Element destination`里去。
 45 | 
 46 | ```java
 47 |     private final class CleaningVisitor implements NodeVisitor {
 48 |         private int numDiscarded = 0;
 49 |         private final Element root;
 50 |         private Element destination; // current element to append nodes to
 51 | 
 52 |         private CleaningVisitor(Element root, Element destination) {
 53 |             this.root = root;
 54 |             this.destination = destination;
 55 |         }
 56 | 
 57 |         public void head(Node source, int depth) {
 58 |             if (source instanceof Element) {
 59 |                 Element sourceEl = (Element) source;
 60 | 
 61 |                 if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
 62 |                     ElementMeta meta = createSafeElement(sourceEl);
 63 |                     Element destChild = meta.el;
 64 |                     destination.appendChild(destChild);
 65 | 
 66 |                     numDiscarded += meta.numAttribsDiscarded;
 67 |                     destination = destChild;
 68 |                 } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
 69 |                     numDiscarded++;
 70 |                 }
 71 |             } else if (source instanceof TextNode) {
 72 |                 TextNode sourceText = (TextNode) source;
 73 |                 TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());
 74 |                 destination.appendChild(destText);
 75 |             } else { // else, we don't care about comments, xml proc instructions, etc
 76 |                 numDiscarded++;
 77 |             }
 78 |         }
 79 | 
 80 |         public void tail(Node source, int depth) {
 81 |             if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {
 82 |                 destination = destination.parent(); // would have descended, so pop destination stack
 83 |             }
 84 |         }
 85 |     }
 86 | ```
 87 |     
 88 | 
 89 | ## 结束语
 90 | 
 91 | 至此，Jsoup的全部模块都已经写完了。Jsoup源码并不多，只有14000多行，但是实现非常精巧，在读代码的过程中，除了相关知识，还验证了几个很重要的思想：
 92 | 
 93 | * 最好的代码抽象，是对现实概念的映射。
 94 | 
 95 | 	这句话在看《代码大全》的时候印象很深刻。在Jsoup里，只要有相关知识，每个类的作用都能第一时间明白。
 96 | 
 97 | * 不要过度抽象
 98 | 
 99 | 	在Jsoup里，只用到了两个接口，一个是`NodeVisitor`，一个是`Connection`，其他都是用抽象类或者直接用实现类代替。记得有次面试的时候被问到我们开发中每逢一个功能，都要先定义一个接口的做法是否必要？现在的答案是没有必要，过度的抽象反而会降低代码质量。
100 | 
101 | 	另外，Jsoup的代码内聚性都很高，每个类的功能基本都定义在类的内部，这是一个典型的充血模型。同时有大量的facade使用，而避免了Factory、Configure等类的出现，个人感觉这点是非常好的。
102 | 
103 |   [1]: http://static.oschina.net/uploads/space/2013/0831/071752_RBZc_190591.png


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 |   <name>jsoup</name>
  5 | 
  6 |   <groupId>org.jsoup</groupId>
  7 |   <artifactId>jsoup</artifactId>
  8 |   <version>1.7.3-SNAPSHOT</version>
  9 |   <description>jsoup HTML parser</description>
 10 |   <url>http://jsoup.org/</url>
 11 |   <inceptionYear>2009</inceptionYear>
 12 |   <issueManagement>
 13 |   	<system>GitHub</system>
 14 |   	<url>http://github.com/jhy/jsoup/issues</url>
 15 |   </issueManagement>
 16 |   <licenses>
 17 |   	<license>
 18 |   		<name>The MIT License</name>
 19 |   		<url>http://jsoup.com/license</url>
 20 |   		<distribution>repo</distribution>
 21 |   	</license>  
 22 |   </licenses>
 23 |   <scm>
 24 |   	<url>http://github.com/jhy/jsoup</url>
 25 |     <connection>scm:git:http://github.com/jhy/jsoup.git</connection>
 26 |     <!-- <developerConnection>scm:git:git@github.com:jhy/jsoup.git</developerConnection> -->
 27 |   </scm>
 28 |   <organization>
 29 |   	<name>Jonathan Hedley</name>
 30 |   	<url>http://jonathanhedley.com/</url>
 31 |   </organization>
 32 | 
 33 |   <build>
 34 |     <plugins>
 35 |       <plugin>
 36 |         <groupId>org.apache.maven.plugins</groupId>
 37 |         <artifactId>maven-compiler-plugin</artifactId>
 38 |         <version>2.0.2</version>
 39 |         <configuration>
 40 |           <source>1.5</source>
 41 |           <target>1.5</target>
 42 |           <encoding>UTF-8</encoding>
 43 |         </configuration>
 44 |       </plugin>
 45 |       <plugin>
 46 |         <groupId>org.apache.maven.plugins</groupId>
 47 |         <artifactId>maven-javadoc-plugin</artifactId>
 48 |         <version>2.6.1</version>
 49 |         <configuration>
 50 |         </configuration>
 51 |         <executions>
 52 |           <execution>
 53 |             <id>attach-javadoc</id>
 54 |             <phase>verify</phase>
 55 |             <goals>
 56 |               <goal>jar</goal>
 57 |             </goals>
 58 |           </execution>
 59 |         </executions>
 60 |       </plugin>
 61 |       <plugin>
 62 |         <groupId>org.apache.maven.plugins</groupId>
 63 |         <artifactId>maven-source-plugin</artifactId>
 64 |         <version>2.1.1</version>
 65 |         <configuration>
 66 |         </configuration>
 67 |         <executions>
 68 |           <execution>
 69 |             <id>attach-sources</id>
 70 |             <phase>verify</phase>
 71 |             <goals>
 72 |               <goal>jar</goal>
 73 |             </goals>
 74 |           </execution>
 75 |         </executions>
 76 |       </plugin>
 77 |       <plugin>
 78 |         <groupId>org.apache.maven.plugins</groupId>
 79 |         <artifactId>maven-jar-plugin</artifactId>
 80 |         <version>2.2</version>
 81 |         <configuration>
 82 |           <archive>
 83 |             <manifestFile>${project.build.outputDirectory}/META-INF/MANIFEST.MF</manifestFile>
 84 |           </archive>
 85 |         </configuration>
 86 |       </plugin>
 87 |       <plugin>
 88 |         <groupId>org.apache.felix</groupId>
 89 |         <artifactId>maven-bundle-plugin</artifactId>
 90 |         <version>2.1.0</version>
 91 |         <executions>
 92 |           <execution>
 93 |             <id>bundle-manifest</id>
 94 |             <phase>process-classes</phase>
 95 |             <goals>
 96 |               <goal>manifest</goal>
 97 |             </goals>
 98 |           </execution>
 99 |         </executions>
100 |         <configuration>
101 |           <instructions>
102 |             <Bundle-DocURL>http://jsoup.org/</Bundle-DocURL>
103 |           </instructions>
104 |         </configuration>
105 |       </plugin>
106 |       <plugin>
107 |         <groupId>org.apache.maven.plugins</groupId>
108 |         <artifactId>maven-resources-plugin</artifactId>
109 |         <version>2.4</version>
110 |       </plugin>
111 |     </plugins>
112 |     <resources>
113 |       <resource>
114 |         <directory>src/main/java</directory>
115 |         <includes>
116 |           <include>**/*.properties</include>
117 |         </includes>
118 |       </resource>
119 |     </resources>
120 |   </build>
121 | 
122 |   <distributionManagement>
123 |     <snapshotRepository>
124 |       <id>sonatype-nexus-snapshots</id>
125 |       <name>Sonatype Nexus Snapshots</name>
126 |       <url>http://oss.sonatype.org/content/repositories/snapshots</url>
127 |     </snapshotRepository>
128 |     <repository>
129 |       <id>sonatype-nexus-staging</id>
130 |       <name>Nexus Release Repository</name>
131 |       <url>http://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
132 |     </repository>
133 |   </distributionManagement>
134 | 
135 |   <profiles>
136 |     <profile>
137 |       <id>release-sign-artifacts</id>
138 |       <activation>
139 |         <property>
140 |           <name>performRelease</name>
141 |           <value>true</value>
142 |         </property>
143 |       </activation>
144 |       <build>
145 |         <plugins>
146 |           <plugin>
147 |             <groupId>org.apache.maven.plugins</groupId>
148 |             <artifactId>maven-gpg-plugin</artifactId>
149 |             <executions>
150 |               <execution>
151 |                 <id>sign-artifacts</id>
152 |                 <phase>verify</phase>
153 |                 <goals>
154 |                   <goal>sign</goal>
155 |                 </goals>
156 |               </execution>
157 |             </executions>
158 |           </plugin>
159 |         </plugins>
160 |       </build>
161 |     </profile>
162 |   </profiles>
163 |  
164 |   <dependencies>
165 | 
166 |     <dependency>
167 |       <!-- junit -->
168 |       <groupId>junit</groupId>
169 |       <artifactId>junit</artifactId>
170 |       <version>4.5</version>
171 |       <scope>test</scope>
172 |     </dependency>
173 | 
174 |   </dependencies>
175 |     
176 |   <dependencyManagement>
177 |   	<dependencies>
178 |   	</dependencies>
179 |   </dependencyManagement>
180 | 
181 |   <properties>
182 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
183 |   </properties>
184 | 
185 |   <developers>
186 |     <developer>
187 |       <id>jhy</id>
188 |       <name>Jonathan Hedley</name>
189 |       <email>jonathan@hedley.net</email>
190 |       <roles>
191 |         <role>Lead Developer</role>
192 |       </roles>
193 |       <timezone>+11</timezone>
194 |     </developer>
195 |   </developers>
196 | 
197 | </project>
198 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/HttpStatusException.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | /**
 6 |  * Signals that a HTTP request resulted in a not OK HTTP response.
 7 |  * The status code and URL supplied at construction are kept so callers can inspect them.
 8 |  */
 9 | public class HttpStatusException extends IOException {
10 |     // both fields are assigned exactly once, in the constructor, hence final
11 |     private final int statusCode;
12 |     private final String url;
13 | 
14 |     /**
15 |      * @param message detail message, passed through to {@link IOException}
16 |      * @param statusCode status code to report via {@link #getStatusCode()}
17 |      * @param url URL to report via {@link #getUrl()}
18 |      */
19 |     public HttpStatusException(String message, int statusCode, String url) {
20 |         super(message);
21 |         this.statusCode = statusCode;
22 |         this.url = url;
23 |     }
24 | 
25 |     /** @return the status code given at construction */
26 |     public int getStatusCode() {
27 |         return statusCode;
28 |     }
29 | 
30 |     /** @return the URL given at construction */
31 |     public String getUrl() {
32 |         return url;
33 |     }
34 | 
35 |     /** Appends the status code and URL to the standard exception description. */
36 |     @Override
37 |     public String toString() {
38 |         return super.toString() + ". Status=" + statusCode + ", URL=" + url;
39 |     }
40 | }
41 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/UnsupportedMimeTypeException.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | /**
 6 |  * Signals that a HTTP response returned a mime type that is not supported.
 7 |  * The mime type and URL supplied at construction are kept so callers can inspect them.
 8 |  */
 9 | public class UnsupportedMimeTypeException extends IOException {
10 |     // both fields are assigned exactly once, in the constructor, hence final
11 |     private final String mimeType;
12 |     private final String url;
13 | 
14 |     /**
15 |      * @param message detail message, passed through to {@link IOException}
16 |      * @param mimeType mime type to report via {@link #getMimeType()}
17 |      * @param url URL to report via {@link #getUrl()}
18 |      */
19 |     public UnsupportedMimeTypeException(String message, String mimeType, String url) {
20 |         super(message);
21 |         this.mimeType = mimeType;
22 |         this.url = url;
23 |     }
24 | 
25 |     /** @return the mime type given at construction */
26 |     public String getMimeType() {
27 |         return mimeType;
28 |     }
29 | 
30 |     /** @return the URL given at construction */
31 |     public String getUrl() {
32 |         return url;
33 |     }
34 | 
35 |     /** Appends the mime type and URL to the standard exception description. */
36 |     @Override
37 |     public String toString() {
38 |         return super.toString() + ". Mimetype=" + mimeType + ", URL=" + url;
39 |     }
40 | }
41 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/examples/HtmlToPlainText.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.examples;
  2 | 
  3 | import org.jsoup.Jsoup;
  4 | import org.jsoup.helper.StringUtil;
  5 | import org.jsoup.helper.Validate;
  6 | import org.jsoup.nodes.Document;
  7 | import org.jsoup.nodes.Element;
  8 | import org.jsoup.nodes.Node;
  9 | import org.jsoup.nodes.TextNode;
 10 | import org.jsoup.select.NodeTraversor;
 11 | import org.jsoup.select.NodeVisitor;
 12 | 
 13 | import java.io.IOException;
 14 | 
 15 | /**
 16 |  * HTML to plain-text. This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted
 17 |  * plain-text. That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a
 18 |  * scrape.
 19 |  * <p>
 20 |  * Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend.
 21 |  *
 22 |  * @author Jonathan Hedley, jonathan@hedley.net
 23 |  */
 24 | public class HtmlToPlainText {
 25 |     /**
 26 |      * Command-line entry point: fetches the URL given as the single argument and prints its plain-text rendering.
 27 |      * @param args exactly one element, the URL to fetch
 28 |      * @throws IOException propagated from fetching the URL
 29 |      */
 30 |     public static void main(String... args) throws IOException {
 31 |         Validate.isTrue(args.length == 1, "usage: supply url to fetch");
 32 |         String url = args[0];
 33 | 
 34 |         // fetch the specified URL and parse to a HTML DOM
 35 |         Document doc = Jsoup.connect(url).get();
 36 | 
 37 |         HtmlToPlainText formatter = new HtmlToPlainText();
 38 |         String plainText = formatter.getPlainText(doc);
 39 |         System.out.println(plainText);
 40 |     }
 41 | 
 42 |     /**
 43 |      * Format an Element to plain-text
 44 |      * @param element the root element to format
 45 |      * @return formatted text
 46 |      */
 47 |     public String getPlainText(Element element) {
 48 |         FormattingVisitor formatter = new FormattingVisitor();
 49 |         NodeTraversor traversor = new NodeTraversor(formatter);
 50 |         traversor.traverse(element); // walk the DOM, and call .head() and .tail() for each node
 51 | 
 52 |         return formatter.toString();
 53 |     }
 54 | 
 55 |     // the formatting rules, implemented in a depth-first DOM traverse: head() fires when a node is first seen,
 56 |     // tail() once all of its children have been visited. Static, as it needs nothing from the enclosing instance.
 57 |     private static class FormattingVisitor implements NodeVisitor {
 58 |         private static final int maxWidth = 80; // column at which append() starts wrapping
 59 |         private int width = 0; // running length of the current output line
 60 |         private final StringBuilder accum = new StringBuilder(); // holds the accumulated text
 61 | 
 62 |         // hit when the node is first seen
 63 |         public void head(Node node, int depth) {
 64 |             String name = node.nodeName();
 65 |             if (node instanceof TextNode)
 66 |                 append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.
 67 |             else if (name.equals("li"))
 68 |                 append("\n * ");
 69 |         }
 70 | 
 71 |         // hit when all of the node's children (if any) have been visited
 72 |         public void tail(Node node, int depth) {
 73 |             String name = node.nodeName();
 74 |             if (name.equals("br"))
 75 |                 append("\n");
 76 |             else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5"))
 77 |                 append("\n\n");
 78 |             else if (name.equals("a"))
 79 |                 append(String.format(" <%s>", node.absUrl("href")));
 80 |         }
 81 | 
 82 |         // appends text to the string builder with a simple word wrap method
 83 |         private void append(String text) {
 84 |             if (text.startsWith("\n"))
 85 |                 width = 0; // reset counter if starts with a newline. only from formats above, not in natural text
 86 |             if (text.equals(" ") &&
 87 |                     (accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n")))
 88 |                 return; // don't accumulate long runs of empty spaces
 89 | 
 90 |             if (text.length() + width > maxWidth) { // won't fit, needs to wrap
 91 |                 String[] words = text.split("\\s+");
 92 |                 for (int i = 0; i < words.length; i++) {
 93 |                     String word = words[i];
 94 |                     boolean last = i == words.length - 1;
 95 |                     if (!last) // insert a space if not the last word
 96 |                         word = word + " ";
 97 |                     if (word.length() + width > maxWidth) { // wrap and reset counter
 98 |                         accum.append("\n").append(word);
 99 |                         width = word.length();
100 |                     } else {
101 |                         accum.append(word);
102 |                         width += word.length();
103 |                     }
104 |                 }
105 |             } else { // fits as is, without need to wrap text
106 |                 accum.append(text);
107 |                 width += text.length();
108 |             }
109 |         }
110 | 
111 |         // the text accumulated so far
112 |         @Override
113 |         public String toString() {
114 |             return accum.toString();
115 |         }
116 |     }
117 | }
118 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/examples/ListLinks.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.examples;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.jsoup.helper.Validate;
 5 | import org.jsoup.nodes.Document;
 6 | import org.jsoup.nodes.Element;
 7 | import org.jsoup.select.Elements;
 8 | 
 9 | import java.io.IOException;
10 | 
11 | /**
12 |  * Example program to list links from a URL.
13 |  */
14 | public class ListLinks {
15 |     public static void main(String[] args) throws IOException {
16 |         Validate.isTrue(args.length == 1, "usage: supply url to fetch");
17 |         String url = args[0];
18 |         print("Fetching %s...", url);
19 | 
20 |         Document doc = Jsoup.connect(url).get();
21 |         Elements links = doc.select("a[href]");
22 |         Elements media = doc.select("[src]");
23 |         Elements imports = doc.select("link[href]");
24 | 
25 |         print("\nMedia: (%d)", media.size());
26 |         for (Element src : media) {
27 |             if (src.tagName().equals("img"))
28 |                 print(" * %s: <%s> %sx%s (%s)",
29 |                         src.tagName(), src.attr("abs:src"), src.attr("width"), src.attr("height"),
30 |                         trim(src.attr("alt"), 20));
31 |             else
32 |                 print(" * %s: <%s>", src.tagName(), src.attr("abs:src"));
33 |         }
34 | 
35 |         print("\nImports: (%d)", imports.size());
36 |         for (Element link : imports) {
37 |             print(" * %s <%s> (%s)", link.tagName(),link.attr("abs:href"), link.attr("rel"));
38 |         }
39 | 
40 |         print("\nLinks: (%d)", links.size());
41 |         for (Element link : links) {
42 |             print(" * a: <%s>  (%s)", link.attr("abs:href"), trim(link.text(), 35));
43 |         }
44 |     }
45 | 
46 |     private static void print(String msg, Object... args) {
47 |         System.out.println(String.format(msg, args));
48 |     }
49 | 
50 |     private static String trim(String s, int width) {
51 |         if (s.length() > width)
52 |             return s.substring(0, width-1) + ".";
53 |         else
54 |             return s;
55 |     }
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/examples/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  Contains example programs and use of jsoup. See the <a href="http://jsoup.org/cookbook/">jsoup cookbook</a>.
3 |  */
4 | package org.jsoup.examples;


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/helper/DataUtil.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.helper;
  2 | 
  3 | import org.jsoup.nodes.Document;
  4 | import org.jsoup.nodes.Element;
  5 | import org.jsoup.parser.Parser;
  6 | 
  7 | import java.io.*;
  8 | import java.nio.ByteBuffer;
  9 | import java.nio.charset.Charset;
 10 | import java.util.regex.Matcher;
 11 | import java.util.regex.Pattern;
 12 | import java.util.Locale;
 13 | 
 14 | /**
 15 |  * Internal static utilities for handling data.
 16 |  *
 17 |  */
 18 | public class DataUtil {
 19 |     private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
 20 |     static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
 21 |     private static final int bufferSize = 0x20000; // ~130K.
 22 | 
 23 |     private DataUtil() {}
 24 | 
 25 |     /**
 26 |      * Loads a file to a Document.
 27 |      * @param in file to load
 28 |      * @param charsetName character set of input
 29 |      * @param baseUri base URI of document, to resolve relative links against
 30 |      * @return Document
 31 |      * @throws IOException on IO error
 32 |      */
 33 |     public static Document load(File in, String charsetName, String baseUri) throws IOException {
 34 |         FileInputStream inStream = null;
 35 |         try {
 36 |             inStream = new FileInputStream(in);
 37 |             ByteBuffer byteData = readToByteBuffer(inStream);
 38 |             return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
 39 |         } finally {
 40 |             if (inStream != null)
 41 |                 inStream.close();
 42 |         }
 43 |     }
 44 | 
 45 |     /**
 46 |      * Parses a Document from an input steam.
 47 |      * @param in input stream to parse. You will need to close it.
 48 |      * @param charsetName character set of input
 49 |      * @param baseUri base URI of document, to resolve relative links against
 50 |      * @return Document
 51 |      * @throws IOException on IO error
 52 |      */
 53 |     public static Document load(InputStream in, String charsetName, String baseUri) throws IOException {
 54 |         ByteBuffer byteData = readToByteBuffer(in);
 55 |         return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
 56 |     }
 57 | 
 58 |     /**
 59 |      * Parses a Document from an input steam, using the provided Parser.
 60 |      * @param in input stream to parse. You will need to close it.
 61 |      * @param charsetName character set of input
 62 |      * @param baseUri base URI of document, to resolve relative links against
 63 |      * @param parser alternate {@link Parser#xmlParser() parser} to use.
 64 |      * @return Document
 65 |      * @throws IOException on IO error
 66 |      */
 67 |     public static Document load(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
 68 |         ByteBuffer byteData = readToByteBuffer(in);
 69 |         return parseByteData(byteData, charsetName, baseUri, parser);
 70 |     }
 71 | 
 72 |     // reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
 73 |     // switching the chartset midstream when a meta http-equiv tag defines the charset.
 74 |     static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
 75 |         String docData;
 76 |         Document doc = null;
 77 |         if (charsetName == null) { // determine from meta. safe parse as UTF-8
 78 |             // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
 79 |             docData = Charset.forName(defaultCharset).decode(byteData).toString();
 80 |             doc = parser.parseInput(docData, baseUri);
 81 |             Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
 82 |             if (meta != null) { // if not found, will keep utf-8 as best attempt
 83 |                 String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta.attr("content")) : meta.attr("charset");
 84 |                 if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)) { // need to re-decode
 85 |                     charsetName = foundCharset;
 86 |                     byteData.rewind();
 87 |                     docData = Charset.forName(foundCharset).decode(byteData).toString();
 88 |                     doc = null;
 89 |                 }
 90 |             }
 91 |         } else { // specified by content type header (or by user on file load)
 92 |             Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
 93 |             docData = Charset.forName(charsetName).decode(byteData).toString();
 94 |         }
 95 |         if (doc == null) {
 96 |             // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present
 97 |             // in utf-8. If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight
 98 |             // into head mode
 99 |             if (docData.length() > 0 && docData.charAt(0) == 65279)
100 |                 docData = docData.substring(1);
101 | 
102 |             doc = parser.parseInput(docData, baseUri);
103 |             doc.outputSettings().charset(charsetName);
104 |         }
105 |         return doc;
106 |     }
107 | 
108 |     /**
109 |      * Read the input stream into a byte buffer.
110 |      * @param inStream the input stream to read from
111 |      * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
112 |      * @return the filled byte buffer
113 |      * @throws IOException if an exception occurs whilst reading from the input stream.
114 |      */
115 |     static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
116 |         Validate.isTrue(maxSize >= 0, "maxSize must be 0 (unlimited) or larger");
117 |         final boolean capped = maxSize > 0;
118 |         byte[] buffer = new byte[bufferSize];
119 |         ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
120 |         int read;
121 |         int remaining = maxSize;
122 | 
123 |         while (true) {
124 |             read = inStream.read(buffer);
125 |             if (read == -1) break;
126 |             if (capped) {
127 |                 if (read > remaining) {
128 |                     outStream.write(buffer, 0, remaining);
129 |                     break;
130 |                 }
131 |                 remaining -= read;
132 |             }
133 |             outStream.write(buffer, 0, read);
134 |         }
135 |         ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray());
136 |         return byteData;
137 |     }
138 | 
139 |     static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException {
140 |         return readToByteBuffer(inStream, 0);
141 |     }
142 | 
143 |     /**
144 |      * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
145 |      * will kick in.)
146 |      * @param contentType e.g. "text/html; charset=EUC-JP"
147 |      * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
148 |      */
149 |     static String getCharsetFromContentType(String contentType) {
150 |         if (contentType == null) return null;
151 |         Matcher m = charsetPattern.matcher(contentType);
152 |         if (m.find()) {
153 |             String charset = m.group(1).trim();
154 |             if (Charset.isSupported(charset)) return charset;
155 |             charset = charset.toUpperCase(Locale.ENGLISH);
156 |             if (Charset.isSupported(charset)) return charset;
157 |         }
158 |         return null;
159 |     }
160 |     
161 |     
162 | }
163 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/helper/DescendableLinkedList.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.helper;
 2 | 
 3 | import java.util.Iterator;
 4 | import java.util.LinkedList;
 5 | import java.util.ListIterator;
 6 | 
 7 | /**
 8 |  * Provides a descending iterator and other 1.6 methods to allow support on the 1.5 JRE.
 9 |  */
10 | public class DescendableLinkedList<E> extends LinkedList<E> {
11 | 
12 |     /**
13 |      * Create a new DescendableLinkedList.
14 |      */
15 |     public DescendableLinkedList() {
16 |         super();
17 |     }
18 | 
19 |     /**
20 |      * Add a new element to the start of the list.
21 |      * @param e element to add
22 |      */
23 |     public void push(E e) {
24 |         addFirst(e);
25 |     }
26 | 
27 |     /**
28 |      * Look at the last element, if there is one.
29 |      * @return the last element, or null
30 |      */
31 |     public E peekLast() {
32 |         return size() == 0 ? null : getLast();
33 |     }
34 | 
35 |     /**
36 |      * Remove and return the last element, if there is one
37 |      * @return the last element, or null
38 |      */
39 |     public E pollLast() {
40 |         return size() == 0 ? null : removeLast();
41 |     }
42 | 
43 |     /**
44 |      * Get an iterator that starts and the end of the list and works towards the start.
45 |      * @return an iterator that starts and the end of the list and works towards the start.
46 |      */
47 |     public Iterator<E> descendingIterator() {
48 |         return new DescendingIterator<E>(size());
49 |     }
50 | 
51 |     private class DescendingIterator<E> implements Iterator<E> {
52 |         private final ListIterator<E> iter;
53 | 
54 |         @SuppressWarnings("unchecked")
55 |         private DescendingIterator(int index) {
56 |             iter = (ListIterator<E>) listIterator(index);
57 |         }
58 | 
59 |         /**
60 |          * Check if there is another element on the list.
61 |          * @return if another element
62 |          */
63 |         public boolean hasNext() {
64 |             return iter.hasPrevious();
65 |         }
66 | 
67 |         /**
68 |          * Get the next element.
69 |          * @return the next element.
70 |          */
71 |         public E next() {
72 |             return iter.previous();
73 |         }
74 | 
75 |         /**
76 |          * Remove the current element.
77 |          */
78 |         public void remove() {
79 |             iter.remove();
80 |         }
81 |     }
82 | }
83 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/helper/StringUtil.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.helper;
  2 | 
  3 | import java.util.Collection;
  4 | import java.util.Iterator;
  5 | 
  6 | /**
  7 |  * A minimal String utility class. Designed for internal jsoup use only.
  8 |  */
  9 | public final class StringUtil {
 10 |     // memoised padding up to 10
 11 |     private static final String[] padding = {"", " ", "  ", "   ", "    ", "     ", "      ", "       ", "        ", "         ", "          "};
 12 | 
 13 |     /**
 14 |      * Join a collection of strings by a seperator
 15 |      * @param strings collection of string objects
 16 |      * @param sep string to place between strings
 17 |      * @return joined string
 18 |      */
 19 |     public static String join(Collection strings, String sep) {
 20 |         return join(strings.iterator(), sep);
 21 |     }
 22 | 
 23 |     /**
 24 |      * Join a collection of strings by a seperator
 25 |      * @param strings iterator of string objects
 26 |      * @param sep string to place between strings
 27 |      * @return joined string
 28 |      */
 29 |     public static String join(Iterator strings, String sep) {
 30 |         if (!strings.hasNext())
 31 |             return "";
 32 | 
 33 |         String start = strings.next().toString();
 34 |         if (!strings.hasNext()) // only one, avoid builder
 35 |             return start;
 36 | 
 37 |         StringBuilder sb = new StringBuilder(64).append(start);
 38 |         while (strings.hasNext()) {
 39 |             sb.append(sep);
 40 |             sb.append(strings.next());
 41 |         }
 42 |         return sb.toString();
 43 |     }
 44 | 
 45 |     /**
 46 |      * Returns space padding
 47 |      * @param width amount of padding desired
 48 |      * @return string of spaces * width
 49 |      */
 50 |     public static String padding(int width) {
 51 |         if (width < 0)
 52 |             throw new IllegalArgumentException("width must be > 0");
 53 | 
 54 |         if (width < padding.length)
 55 |             return padding[width];
 56 | 
 57 |         char[] out = new char[width];
 58 |         for (int i = 0; i < width; i++)
 59 |             out[i] = ' ';
 60 |         return String.valueOf(out);
 61 |     }
 62 | 
 63 |     /**
 64 |      * Tests if a string is blank: null, emtpy, or only whitespace (" ", \r\n, \t, etc)
 65 |      * @param string string to test
 66 |      * @return if string is blank
 67 |      */
 68 |     public static boolean isBlank(String string) {
 69 |         if (string == null || string.length() == 0)
 70 |             return true;
 71 | 
 72 |         int l = string.length();
 73 |         for (int i = 0; i < l; i++) {
 74 |             if (!StringUtil.isWhitespace(string.codePointAt(i)))
 75 |                 return false;
 76 |         }
 77 |         return true;
 78 |     }
 79 | 
 80 |     /**
 81 |      * Tests if a string is numeric, i.e. contains only digit characters
 82 |      * @param string string to test
 83 |      * @return true if only digit chars, false if empty or null or contains non-digit chrs
 84 |      */
 85 |     public static boolean isNumeric(String string) {
 86 |         if (string == null || string.length() == 0)
 87 |             return false;
 88 | 
 89 |         int l = string.length();
 90 |         for (int i = 0; i < l; i++) {
 91 |             if (!Character.isDigit(string.codePointAt(i)))
 92 |                 return false;
 93 |         }
 94 |         return true;
 95 |     }
 96 | 
 97 |     /**
 98 |      * Tests if a code point is "whitespace" as defined in the HTML spec.
 99 |      * @param c code point to test
100 |      * @return true if code point is whitespace, false otherwise
101 |      */
102 |     public static boolean isWhitespace(int c){
103 |         return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r';
104 |     }
105 | 
106 |     public static String normaliseWhitespace(String string) {
107 |         StringBuilder sb = new StringBuilder(string.length());
108 | 
109 |         boolean lastWasWhite = false;
110 |         boolean modified = false;
111 | 
112 |         int l = string.length();
113 |         int c;
114 |         for (int i = 0; i < l; i+= Character.charCount(c)) {
115 |             c = string.codePointAt(i);
116 |             if (isWhitespace(c)) {
117 |                 if (lastWasWhite) {
118 |                     modified = true;
119 |                     continue;
120 |                 }
121 |                 if (c != ' ')
122 |                     modified = true;
123 |                 sb.append(' ');
124 |                 lastWasWhite = true;
125 |             }
126 |             else {
127 |                 sb.appendCodePoint(c);
128 |                 lastWasWhite = false;
129 |             }
130 |         }
131 |         return modified ? sb.toString() : string;
132 |     }
133 | 
134 |     public static boolean in(String needle, String... haystack) {
135 |         for (String hay : haystack) {
136 |             if (hay.equals(needle))
137 |             return true;
138 |         }
139 |         return false;
140 |     }
141 | }
142 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/helper/Validate.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.helper;
  2 | 
  3 | /**
  4 |  * Simple validation methods. Designed for jsoup internal use
  5 |  */
  6 | public final class Validate {
  7 |     
  8 |     private Validate() {}
  9 | 
 10 |     /**
 11 |      * Validates that the object is not null
 12 |      * @param obj object to test
 13 |      */
 14 |     public static void notNull(Object obj) {
 15 |         if (obj == null)
 16 |             throw new IllegalArgumentException("Object must not be null");
 17 |     }
 18 | 
 19 |     /**
 20 |      * Validates that the object is not null
 21 |      * @param obj object to test
 22 |      * @param msg message to output if validation fails
 23 |      */
 24 |     public static void notNull(Object obj, String msg) {
 25 |         if (obj == null)
 26 |             throw new IllegalArgumentException(msg);
 27 |     }
 28 | 
 29 |     /**
 30 |      * Validates that the value is true
 31 |      * @param val object to test
 32 |      */
 33 |     public static void isTrue(boolean val) {
 34 |         if (!val)
 35 |             throw new IllegalArgumentException("Must be true");
 36 |     }
 37 | 
 38 |     /**
 39 |      * Validates that the value is true
 40 |      * @param val object to test
 41 |      * @param msg message to output if validation fails
 42 |      */
 43 |     public static void isTrue(boolean val, String msg) {
 44 |         if (!val)
 45 |             throw new IllegalArgumentException(msg);
 46 |     }
 47 | 
 48 |     /**
 49 |      * Validates that the value is false
 50 |      * @param val object to test
 51 |      */
 52 |     public static void isFalse(boolean val) {
 53 |         if (val)
 54 |             throw new IllegalArgumentException("Must be false");
 55 |     }
 56 | 
 57 |     /**
 58 |      * Validates that the value is false
 59 |      * @param val object to test
 60 |      * @param msg message to output if validation fails
 61 |      */
 62 |     public static void isFalse(boolean val, String msg) {
 63 |         if (val)
 64 |             throw new IllegalArgumentException(msg);
 65 |     }
 66 | 
 67 |     /**
 68 |      * Validates that the array contains no null elements
 69 |      * @param objects the array to test
 70 |      */
 71 |     public static void noNullElements(Object[] objects) {
 72 |         noNullElements(objects, "Array must not contain any null objects");
 73 |     }
 74 | 
 75 |     /**
 76 |      * Validates that the array contains no null elements
 77 |      * @param objects the array to test
 78 |      * @param msg message to output if validation fails
 79 |      */
 80 |     public static void noNullElements(Object[] objects, String msg) {
 81 |         for (Object obj : objects)
 82 |             if (obj == null)
 83 |                 throw new IllegalArgumentException(msg);
 84 |     }
 85 | 
 86 |     /**
 87 |      * Validates that the string is not empty
 88 |      * @param string the string to test
 89 |      */
 90 |     public static void notEmpty(String string) {
 91 |         if (string == null || string.length() == 0)
 92 |             throw new IllegalArgumentException("String must not be empty");
 93 |     }
 94 | 
 95 |     /**
 96 |      * Validates that the string is not empty
 97 |      * @param string the string to test
 98 |      * @param msg message to output if validation fails
 99 |      */
100 |     public static void notEmpty(String string, String msg) {
101 |         if (string == null || string.length() == 0)
102 |             throw new IllegalArgumentException(msg);
103 |     }
104 | 
105 |     /**
106 |      Cause a failure.
107 |      @param msg message to output.
108 |      */
109 |     public static void fail(String msg) {
110 |         throw new IllegalArgumentException(msg);
111 |     }
112 | }
113 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/Attribute.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.nodes;
  2 | 
  3 | import org.jsoup.helper.Validate;
  4 | 
  5 | import java.util.Map;
  6 | 
  7 | /**
  8 |  A single key + value attribute. Keys are trimmed and normalised to lower-case.
  9 | 
 10 |  @author Jonathan Hedley, jonathan@hedley.net */
 11 | public class Attribute implements Map.Entry<String, String>, Cloneable  {
 12 |     private String key;
 13 |     private String value;
 14 | 
 15 |     /**
 16 |      * Create a new attribute from unencoded (raw) key and value.
 17 |      * @param key attribute key
 18 |      * @param value attribute value
 19 |      * @see #createFromEncoded
 20 |      */
 21 |     public Attribute(String key, String value) {
 22 |         Validate.notEmpty(key);
 23 |         Validate.notNull(value);
 24 |         this.key = key.trim().toLowerCase();
 25 |         this.value = value;
 26 |     }
 27 | 
 28 |     /**
 29 |      Get the attribute key.
 30 |      @return the attribute key
 31 |      */
 32 |     public String getKey() {
 33 |         return key;
 34 |     }
 35 | 
 36 |     /**
 37 |      Set the attribute key. Gets normalised as per the constructor method.
 38 |      @param key the new key; must not be null
 39 |      */
 40 |     public void setKey(String key) {
 41 |         Validate.notEmpty(key);
 42 |         this.key = key.trim().toLowerCase();
 43 |     }
 44 | 
 45 |     /**
 46 |      Get the attribute value.
 47 |      @return the attribute value
 48 |      */
 49 |     public String getValue() {
 50 |         return value;
 51 |     }
 52 | 
 53 |     /**
 54 |      Set the attribute value.
 55 |      @param value the new attribute value; must not be null
 56 |      */
 57 |     public String setValue(String value) {
 58 |         Validate.notNull(value);
 59 |         String old = this.value;
 60 |         this.value = value;
 61 |         return old;
 62 |     }
 63 | 
 64 |     /**
 65 |      Get the HTML representation of this attribute; e.g. {@code href="index.html"}.
 66 |      @return HTML
 67 |      */
 68 |     public String html() {
 69 |         return key + "=\"" + Entities.escape(value, (new Document("")).outputSettings()) + "\"";
 70 |     }
 71 |     
 72 |     protected void html(StringBuilder accum, Document.OutputSettings out) {
 73 |         accum
 74 |             .append(key)
 75 |             .append("=\"")
 76 |             .append(Entities.escape(value, out))
 77 |             .append("\"");
 78 |     }
 79 | 
 80 |     /**
 81 |      Get the string representation of this attribute, implemented as {@link #html()}.
 82 |      @return string
 83 |      */
 84 |     public String toString() {
 85 |         return html();
 86 |     }
 87 | 
 88 |     /**
 89 |      * Create a new Attribute from an unencoded key and a HTML attribute encoded value.
 90 |      * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars.
 91 |      * @param encodedValue HTML attribute encoded value
 92 |      * @return attribute
 93 |      */
 94 |     public static Attribute createFromEncoded(String unencodedKey, String encodedValue) {
 95 |         String value = Entities.unescape(encodedValue, true);
 96 |         return new Attribute(unencodedKey, value);
 97 |     }
 98 | 
 99 |     protected boolean isDataAttribute() {
100 |         return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length();
101 |     }
102 | 
103 |     @Override
104 |     public boolean equals(Object o) {
105 |         if (this == o) return true;
106 |         if (!(o instanceof Attribute)) return false;
107 | 
108 |         Attribute attribute = (Attribute) o;
109 | 
110 |         if (key != null ? !key.equals(attribute.key) : attribute.key != null) return false;
111 |         if (value != null ? !value.equals(attribute.value) : attribute.value != null) return false;
112 | 
113 |         return true;
114 |     }
115 | 
116 |     @Override
117 |     public int hashCode() {
118 |         int result = key != null ? key.hashCode() : 0;
119 |         result = 31 * result + (value != null ? value.hashCode() : 0);
120 |         return result;
121 |     }
122 | 
123 |     @Override
124 |     public Attribute clone() {
125 |         try {
126 |             return (Attribute) super.clone(); // only fields are immutable strings key and value, so no more deep copy required
127 |         } catch (CloneNotSupportedException e) {
128 |             throw new RuntimeException(e);
129 |         }
130 |     }
131 | }
132 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/Comment.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | /**
 4 |  A comment node.
 5 | 
 6 |  @author Jonathan Hedley, jonathan@hedley.net */
 7 | public class Comment extends Node {
 8 |     private static final String COMMENT_KEY = "comment";
 9 | 
10 |     /**
11 |      Create a new comment node.
12 |      @param data The contents of the comment
13 |      @param baseUri base URI
14 |      */
15 |     public Comment(String data, String baseUri) {
16 |         super(baseUri);
17 |         attributes.put(COMMENT_KEY, data);
18 |     }
19 | 
20 |     public String nodeName() {
21 |         return "#comment";
22 |     }
23 | 
24 |     /**
25 |      Get the contents of the comment.
26 |      @return comment content
27 |      */
28 |     public String getData() {
29 |         return attributes.get(COMMENT_KEY);
30 |     }
31 | 
32 |     void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
33 |         if (out.prettyPrint())
34 |             indent(accum, depth, out);
35 |         accum
36 |                 .append("<!--")
37 |                 .append(getData())
38 |                 .append("-->");
39 |     }
40 | 
41 |     void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {}
42 | 
43 |     public String toString() {
44 |         return outerHtml();
45 |     }
46 | }
47 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/DataNode.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | /**
 4 |  A data node, for contents of style, script tags etc, where contents should not show in text().
 5 | 
 6 |  @author Jonathan Hedley, jonathan@hedley.net */
 7 | public class DataNode extends Node{
 8 |     private static final String DATA_KEY = "data";
 9 | 
10 |     /**
11 |      Create a new DataNode.
12 |      @param data data contents
13 |      @param baseUri base URI
14 |      */
15 |     public DataNode(String data, String baseUri) {
16 |         super(baseUri);
17 |         attributes.put(DATA_KEY, data);
18 |     }
19 | 
20 |     public String nodeName() {
21 |         return "#data";
22 |     }
23 | 
24 |     /**
25 |      Get the data contents of this node. Will be unescaped and with original new lines, space etc.
26 |      @return data
27 |      */
28 |     public String getWholeData() {
29 |         return attributes.get(DATA_KEY);
30 |     }
31 | 
32 |     /**
33 |      * Set the data contents of this node.
34 |      * @param data unencoded data
35 |      * @return this node, for chaining
36 |      */
37 |     public DataNode setWholeData(String data) {
38 |         attributes.put(DATA_KEY, data);
39 |         return this;
40 |     }
41 | 
42 |     void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
43 |         accum.append(getWholeData()); // data is not escaped in return from data nodes, so " in script, style is plain
44 |     }
45 | 
46 |     void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {}
47 | 
48 |     public String toString() {
49 |         return outerHtml();
50 |     }
51 | 
52 |     /**
53 |      Create a new DataNode from HTML encoded data.
54 |      @param encodedData encoded data
55 |      @param baseUri bass URI
56 |      @return new DataNode
57 |      */
58 |     public static DataNode createFromEncoded(String encodedData, String baseUri) {
59 |         String data = Entities.unescape(encodedData);
60 |         return new DataNode(data, baseUri);
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/DocumentType.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import org.jsoup.helper.StringUtil;
 4 | import org.jsoup.helper.Validate;
 5 | 
 6 | /**
 7 |  * A {@code <!DOCTYPE>} node.
 8 |  */
 9 | public class DocumentType extends Node {
10 |     // todo: quirk mode from publicId and systemId
11 | 
12 |     /**
13 |      * Create a new doctype element.
14 |      * @param name the doctype's name
15 |      * @param publicId the doctype's public ID
16 |      * @param systemId the doctype's system ID
17 |      * @param baseUri the doctype's base URI
18 |      */
19 |     public DocumentType(String name, String publicId, String systemId, String baseUri) {
20 |         super(baseUri);
21 | 
22 |         Validate.notEmpty(name);
23 |         attr("name", name);
24 |         attr("publicId", publicId);
25 |         attr("systemId", systemId);
26 |     }
27 | 
28 |     @Override
29 |     public String nodeName() {
30 |         return "#doctype";
31 |     }
32 | 
33 |     @Override
34 |     void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
35 |         accum.append("<!DOCTYPE ").append(attr("name"));
36 |         if (!StringUtil.isBlank(attr("publicId")))
37 |             accum.append(" PUBLIC \"").append(attr("publicId")).append("\"");
38 |         if (!StringUtil.isBlank(attr("systemId")))
39 |             accum.append(" \"").append(attr("systemId")).append("\"");
40 |         accum.append('>');
41 |     }
42 | 
43 |     @Override
44 |     void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {
45 |     }
46 | }
47 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/Entities.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.nodes;
  2 | 
  3 | import org.jsoup.parser.Parser;
  4 | 
  5 | import java.io.IOException;
  6 | import java.io.InputStream;
  7 | import java.nio.charset.CharsetEncoder;
  8 | import java.util.*;
  9 | import java.util.regex.Matcher;
 10 | import java.util.regex.Pattern;
 11 | 
 12 | /**
 13 |  * HTML entities, and escape routines.
 14 |  * Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C HTML
 15 |  * named character references</a>.
 16 |  */
 17 | public class Entities {
 18 |     public enum EscapeMode {
 19 |         /** Restricted entities suitable for XHTML output: lt, gt, amp, apos, and quot only. */
 20 |         xhtml(xhtmlByVal),
 21 |         /** Default HTML output entities. */
 22 |         base(baseByVal),
 23 |         /** Complete HTML entities. */
 24 |         extended(fullByVal);
 25 | 
 26 |         private Map<Character, String> map;
 27 | 
 28 |         EscapeMode(Map<Character, String> map) {
 29 |             this.map = map;
 30 |         }
 31 | 
 32 |         public Map<Character, String> getMap() {
 33 |             return map;
 34 |         }
 35 |     }
 36 | 
 37 |     private static final Map<String, Character> full;
 38 |     private static final Map<Character, String> xhtmlByVal;
 39 |     private static final Map<String, Character> base;
 40 |     private static final Map<Character, String> baseByVal;
 41 |     private static final Map<Character, String> fullByVal;
 42 |     private static final Pattern unescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?");
 43 |     private static final Pattern strictUnescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);");
 44 | 
 45 |     private Entities() {}
 46 | 
 47 |     /**
 48 |      * Check if the input is a known named entity
 49 |      * @param name the possible entity name (e.g. "lt" or "amp")
 50 |      * @return true if a known named entity
 51 |      */
 52 |     public static boolean isNamedEntity(String name) {
 53 |         return full.containsKey(name);
 54 |     }
 55 | 
 56 |     /**
 57 |      * Check if the input is a known named entity in the base entity set.
 58 |      * @param name the possible entity name (e.g. "lt" or "amp")
 59 |      * @return true if a known named entity in the base set
 60 |      * @see #isNamedEntity(String)
 61 |      */
 62 |     public static boolean isBaseNamedEntity(String name) {
 63 |         return base.containsKey(name);
 64 |     }
 65 | 
 66 |     /**
 67 |      * Get the Character value of the named entity
 68 |      * @param name named entity (e.g. "lt" or "amp")
 69 |      * @return the Character value of the named entity (e.g. '<' or '&')
 70 |      */
 71 |     public static Character getCharacterByName(String name) {
 72 |         return full.get(name);
 73 |     }
 74 |     
 75 |     static String escape(String string, Document.OutputSettings out) {
 76 |         return escape(string, out.encoder(), out.escapeMode());
 77 |     }
 78 | 
 79 |     static String escape(String string, CharsetEncoder encoder, EscapeMode escapeMode) {
 80 |         StringBuilder accum = new StringBuilder(string.length() * 2);
 81 |         Map<Character, String> map = escapeMode.getMap();
 82 | 
 83 |         final int length = string.length();
 84 |         for (int offset = 0; offset < length; ) {
 85 |             final int codePoint = string.codePointAt(offset);
 86 | 
 87 |             // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
 88 |             if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
 89 |                 final char c = (char) codePoint;
 90 |                 if (map.containsKey(c))
 91 |                     accum.append('&').append(map.get(c)).append(';');
 92 |                 else if (encoder.canEncode(c))
 93 |                     accum.append(c);
 94 |                 else
 95 |                     accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
 96 |             } else {
 97 |                 final String c = new String(Character.toChars(codePoint));
 98 |                 if (encoder.canEncode(c))
 99 |                     accum.append(c);
100 |                 else
101 |                     accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
102 |             }
103 | 
104 |             offset += Character.charCount(codePoint);
105 |         }
106 | 
107 |         return accum.toString();
108 |     }
109 | 
110 |     static String unescape(String string) {
111 |         return unescape(string, false);
112 |     }
113 | 
114 |     /**
115 |      * Unescape the input string.
116 |      * @param string
117 |      * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
118 |      * @return
119 |      */
120 |     static String unescape(String string, boolean strict) {
121 |         return Parser.unescapeEntities(string, strict);
122 |     }
123 | 
124 |     // xhtml has restricted entities
125 |     private static final Object[][] xhtmlArray = {
126 |             {"quot", 0x00022},
127 |             {"amp", 0x00026},
128 |             {"apos", 0x00027},
129 |             {"lt", 0x0003C},
130 |             {"gt", 0x0003E}
131 |     };
132 | 
133 |     static {
134 |         xhtmlByVal = new HashMap<Character, String>();
135 |         base = loadEntities("entities-base.properties");  // most common / default
136 |         baseByVal = toCharacterKey(base);
137 |         full = loadEntities("entities-full.properties"); // extended and overblown.
138 |         fullByVal = toCharacterKey(full);
139 | 
140 |         for (Object[] entity : xhtmlArray) {
141 |             Character c = Character.valueOf((char) ((Integer) entity[1]).intValue());
142 |             xhtmlByVal.put(c, ((String) entity[0]));
143 |         }
144 |     }
145 | 
146 |     private static Map<String, Character> loadEntities(String filename) {
147 |         Properties properties = new Properties();
148 |         Map<String, Character> entities = new HashMap<String, Character>();
149 |         try {
150 |             InputStream in = Entities.class.getResourceAsStream(filename);
151 |             properties.load(in);
152 |             in.close();
153 |         } catch (IOException e) {
154 |             throw new MissingResourceException("Error loading entities resource: " + e.getMessage(), "Entities", filename);
155 |         }
156 | 
157 |         for (Map.Entry entry: properties.entrySet()) {
158 |             Character val = Character.valueOf((char) Integer.parseInt((String) entry.getValue(), 16));
159 |             String name = (String) entry.getKey();
160 |             entities.put(name, val);
161 |         }
162 |         return entities;
163 |     }
164 | 
165 |     private static Map<Character, String> toCharacterKey(Map<String, Character> inMap) {
166 |         Map<Character, String> outMap = new HashMap<Character, String>();
167 |         for (Map.Entry<String, Character> entry: inMap.entrySet()) {
168 |             Character character = entry.getValue();
169 |             String name = entry.getKey();
170 | 
171 |             if (outMap.containsKey(character)) {
172 |                 // dupe, prefer the lower case version
173 |                 if (name.toLowerCase().equals(name))
174 |                     outMap.put(character, name);
175 |             } else {
176 |                 outMap.put(character, name);
177 |             }
178 |         }
179 |         return outMap;
180 |     }
181 | }
182 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/FormElement.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import org.jsoup.Connection;
 4 | import org.jsoup.Jsoup;
 5 | import org.jsoup.helper.HttpConnection;
 6 | import org.jsoup.helper.Validate;
 7 | import org.jsoup.parser.Tag;
 8 | import org.jsoup.select.Elements;
 9 | 
10 | import java.util.ArrayList;
11 | import java.util.List;
12 | 
13 | /**
14 |  * A HTML Form Element provides ready access to the form fields/controls that are associated with it. It also allows a
15 |  * form to easily be submitted.
16 |  */
17 | public class FormElement extends Element {
18 |     private final Elements elements = new Elements();
19 | 
20 |     /**
21 |      * Create a new, standalone form element.
22 |      *
23 |      * @param tag        tag of this element
24 |      * @param baseUri    the base URI
25 |      * @param attributes initial attributes
26 |      */
27 |     public FormElement(Tag tag, String baseUri, Attributes attributes) {
28 |         super(tag, baseUri, attributes);
29 |     }
30 | 
31 |     /**
32 |      * Get the list of form control elements associated with this form.
33 |      * @return form controls associated with this element.
34 |      */
35 |     public Elements elements() {
36 |         return elements;
37 |     }
38 | 
39 |     /**
40 |      * Add a form control element to this form.
41 |      * @param element form control to add
42 |      * @return this form element, for chaining
43 |      */
44 |     public FormElement addElement(Element element) {
45 |         elements.add(element);
46 |         return this;
47 |     }
48 | 
49 |     /**
50 |      * Prepare to submit this form. A Connection object is created with the request set up from the form values. You
51 |      * can then set up other options (like user-agent, timeout, cookies), then execute it.
52 |      * @return a connection prepared from the values of this form.
53 |      * @throws IllegalArgumentException if the form's absolute action URL cannot be determined. Make sure you pass the
54 |      * document's base URI when parsing.
55 |      */
56 |     public Connection submit() {
57 |         String action = hasAttr("action") ? absUrl("action") : baseUri();
58 |         Validate.notEmpty(action, "Could not determine a form action URL for submit. Ensure you set a base URI when parsing.");
59 |         Connection.Method method = attr("method").toUpperCase().equals("POST") ?
60 |                 Connection.Method.POST : Connection.Method.GET;
61 | 
62 |         Connection con = Jsoup.connect(action)
63 |                 .data(formData())
64 |                 .method(method);
65 | 
66 |         return con;
67 |     }
68 | 
69 |     /**
70 |      * Get the data that this form submits. The returned list is a copy of the data, and changes to the contents of the
71 |      * list will not be reflected in the DOM.
72 |      * @return a list of key vals
73 |      */
74 |     public List<Connection.KeyVal> formData() {
75 |         ArrayList<Connection.KeyVal> data = new ArrayList<Connection.KeyVal>();
76 | 
77 |         // iterate the form control elements and accumulate their values
78 |         for (Element el: elements) {
79 |             if (!el.tag().isFormSubmittable()) continue; // contents are form listable, superset of submitable
80 |             String name = el.attr("name");
81 |             if (name.length() == 0) continue;
82 | 
83 |             if ("select".equals(el.tagName())) {
84 |                 Elements options = el.select("option[selected]");
85 |                 for (Element option: options) {
86 |                     data.add(HttpConnection.KeyVal.create(name, option.val()));
87 |                 }
88 |             } else {
89 |                 data.add(HttpConnection.KeyVal.create(name, el.val()));
90 |             }
91 |         }
92 |         return data;
93 |     }
94 | }
95 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/TextNode.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.nodes;
  2 | 
  3 | import org.jsoup.helper.StringUtil;
  4 | import org.jsoup.helper.Validate;
  5 | 
  6 | /**
  7 |  A text node.
  8 | 
  9 |  @author Jonathan Hedley, jonathan@hedley.net */
 10 | public class TextNode extends Node {
 11 |     /*
 12 |     TextNode is a node, and so by default comes with attributes and children. The attributes are seldom used, but use
 13 |     memory, and the child nodes are never used. So we don't have them, and override accessors to attributes to create
 14 |     them as needed on the fly.
 15 |      */
 16 |     private static final String TEXT_KEY = "text";
 17 |     String text;
 18 | 
 19 |     /**
 20 |      Create a new TextNode representing the supplied (unencoded) text).
 21 | 
 22 |      @param text raw text
 23 |      @param baseUri base uri
 24 |      @see #createFromEncoded(String, String)
 25 |      */
 26 |     public TextNode(String text, String baseUri) {
 27 |         this.baseUri = baseUri;
 28 |         this.text = text;
 29 |     }
 30 | 
 31 |     public String nodeName() {
 32 |         return "#text";
 33 |     }
 34 |     
 35 |     /**
 36 |      * Get the text content of this text node.
 37 |      * @return Unencoded, normalised text.
 38 |      * @see TextNode#getWholeText()
 39 |      */
 40 |     public String text() {
 41 |         return normaliseWhitespace(getWholeText());
 42 |     }
 43 |     
 44 |     /**
 45 |      * Set the text content of this text node.
 46 |      * @param text unencoded text
 47 |      * @return this, for chaining
 48 |      */
 49 |     public TextNode text(String text) {
 50 |         this.text = text;
 51 |         if (attributes != null)
 52 |             attributes.put(TEXT_KEY, text);
 53 |         return this;
 54 |     }
 55 | 
 56 |     /**
 57 |      Get the (unencoded) text of this text node, including any newlines and spaces present in the original.
 58 |      @return text
 59 |      */
 60 |     public String getWholeText() {
 61 |         return attributes == null ? text : attributes.get(TEXT_KEY);
 62 |     }
 63 | 
 64 |     /**
 65 |      Test if this text node is blank -- that is, empty or only whitespace (including newlines).
 66 |      @return true if this document is empty or only whitespace, false if it contains any text content.
 67 |      */
 68 |     public boolean isBlank() {
 69 |         return StringUtil.isBlank(getWholeText());
 70 |     }
 71 | 
 72 |     /**
 73 |      * Split this text node into two nodes at the specified string offset. After splitting, this node will contain the
 74 |      * original text up to the offset, and will have a new text node sibling containing the text after the offset.
 75 |      * @param offset string offset point to split node at.
 76 |      * @return the newly created text node containing the text after the offset.
 77 |      */
 78 |     public TextNode splitText(int offset) {
 79 |         Validate.isTrue(offset >= 0, "Split offset must be not be negative");
 80 |         Validate.isTrue(offset < text.length(), "Split offset must not be greater than current text length");
 81 | 
 82 |         String head = getWholeText().substring(0, offset);
 83 |         String tail = getWholeText().substring(offset);
 84 |         text(head);
 85 |         TextNode tailNode = new TextNode(tail, this.baseUri());
 86 |         if (parent() != null)
 87 |             parent().addChildren(siblingIndex()+1, tailNode);
 88 | 
 89 |         return tailNode;
 90 |     }
 91 | 
 92 |     void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
 93 |         String html = Entities.escape(getWholeText(), out);
 94 |         if (out.prettyPrint() && parent() instanceof Element && !Element.preserveWhitespace((Element) parent())) {
 95 |             html = normaliseWhitespace(html);
 96 |         }
 97 | 
 98 |         if (out.prettyPrint() && ((siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) || (out.outline() && siblingNodes().size()>0 && !isBlank()) ))
 99 |             indent(accum, depth, out);
100 |         accum.append(html);
101 |     }
102 | 
103 |     void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {}
104 | 
105 |     public String toString() {
106 |         return outerHtml();
107 |     }
108 | 
109 |     /**
110 |      * Create a new TextNode from HTML encoded (aka escaped) data.
111 |      * @param encodedText Text containing encoded HTML (e.g. &amp;lt;)
112 |      * @return TextNode containing unencoded data (e.g. &lt;)
113 |      */
114 |     public static TextNode createFromEncoded(String encodedText, String baseUri) {
115 |         String text = Entities.unescape(encodedText);
116 |         return new TextNode(text, baseUri);
117 |     }
118 | 
119 |     static String normaliseWhitespace(String text) {
120 |         text = StringUtil.normaliseWhitespace(text);
121 |         return text;
122 |     }
123 | 
124 |     static String stripLeadingWhitespace(String text) {
125 |         return text.replaceFirst("^\\s+", "");
126 |     }
127 | 
128 |     static boolean lastCharIsWhitespace(StringBuilder sb) {
129 |         return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' ';
130 |     }
131 | 
132 |     // attribute fiddling. create on first access.
133 |     private void ensureAttributes() {
134 |         if (attributes == null) {
135 |             attributes = new Attributes();
136 |             attributes.put(TEXT_KEY, text);
137 |         }
138 |     }
139 | 
140 |     @Override
141 |     public String attr(String attributeKey) {
142 |         ensureAttributes();
143 |         return super.attr(attributeKey);
144 |     }
145 | 
146 |     @Override
147 |     public Attributes attributes() {
148 |         ensureAttributes();
149 |         return super.attributes();
150 |     }
151 | 
152 |     @Override
153 |     public Node attr(String attributeKey, String attributeValue) {
154 |         ensureAttributes();
155 |         return super.attr(attributeKey, attributeValue);
156 |     }
157 | 
158 |     @Override
159 |     public boolean hasAttr(String attributeKey) {
160 |         ensureAttributes();
161 |         return super.hasAttr(attributeKey);
162 |     }
163 | 
164 |     @Override
165 |     public Node removeAttr(String attributeKey) {
166 |         ensureAttributes();
167 |         return super.removeAttr(attributeKey);
168 |     }
169 | 
170 |     @Override
171 |     public String absUrl(String attributeKey) {
172 |         ensureAttributes();
173 |         return super.absUrl(attributeKey);
174 |     }
175 | }
176 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/XmlDeclaration.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | /**
 4 |  An XML Declaration.
 5 | 
 6 |  @author Jonathan Hedley, jonathan@hedley.net */
 7 | public class XmlDeclaration extends Node {
 8 |     private static final String DECL_KEY = "declaration";
 9 |     private final boolean isProcessingInstruction; // <! if true, <? if false, declaration (and last data char should be ?)
10 | 
11 |     /**
12 |      Create a new XML declaration
13 |      @param data data
14 |      @param baseUri base uri
15 |      @param isProcessingInstruction is processing instruction
16 |      */
17 |     public XmlDeclaration(String data, String baseUri, boolean isProcessingInstruction) {
18 |         super(baseUri);
19 |         attributes.put(DECL_KEY, data);
20 |         this.isProcessingInstruction = isProcessingInstruction;
21 |     }
22 | 
23 |     public String nodeName() {
24 |         return "#declaration";
25 |     }
26 | 
27 |     /**
28 |      Get the unencoded XML declaration.
29 |      @return XML declaration
30 |      */
31 |     public String getWholeDeclaration() {
32 |         return attributes.get(DECL_KEY);
33 |     }
34 | 
35 |     void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
36 |         accum
37 |                 .append("<")
38 |                 .append(isProcessingInstruction ? "!" : "?")
39 |                 .append(getWholeDeclaration())
40 |                 .append(">");
41 |     }
42 | 
43 |     void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {}
44 | 
45 |     public String toString() {
46 |         return outerHtml();
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/entities-base.properties:
--------------------------------------------------------------------------------
  1 | AElig=000C6
  2 | AMP=00026
  3 | Aacute=000C1
  4 | Acirc=000C2
  5 | Agrave=000C0
  6 | Aring=000C5
  7 | Atilde=000C3
  8 | Auml=000C4
  9 | COPY=000A9
 10 | Ccedil=000C7
 11 | ETH=000D0
 12 | Eacute=000C9
 13 | Ecirc=000CA
 14 | Egrave=000C8
 15 | Euml=000CB
 16 | GT=0003E
 17 | Iacute=000CD
 18 | Icirc=000CE
 19 | Igrave=000CC
 20 | Iuml=000CF
 21 | LT=0003C
 22 | Ntilde=000D1
 23 | Oacute=000D3
 24 | Ocirc=000D4
 25 | Ograve=000D2
 26 | Oslash=000D8
 27 | Otilde=000D5
 28 | Ouml=000D6
 29 | QUOT=00022
 30 | REG=000AE
 31 | THORN=000DE
 32 | Uacute=000DA
 33 | Ucirc=000DB
 34 | Ugrave=000D9
 35 | Uuml=000DC
 36 | Yacute=000DD
 37 | aacute=000E1
 38 | acirc=000E2
 39 | acute=000B4
 40 | aelig=000E6
 41 | agrave=000E0
 42 | amp=00026
 43 | aring=000E5
 44 | atilde=000E3
 45 | auml=000E4
 46 | brvbar=000A6
 47 | ccedil=000E7
 48 | cedil=000B8
 49 | cent=000A2
 50 | copy=000A9
 51 | curren=000A4
 52 | deg=000B0
 53 | divide=000F7
 54 | eacute=000E9
 55 | ecirc=000EA
 56 | egrave=000E8
 57 | eth=000F0
 58 | euml=000EB
 59 | frac12=000BD
 60 | frac14=000BC
 61 | frac34=000BE
 62 | gt=0003E
 63 | iacute=000ED
 64 | icirc=000EE
 65 | iexcl=000A1
 66 | igrave=000EC
 67 | iquest=000BF
 68 | iuml=000EF
 69 | laquo=000AB
 70 | lt=0003C
 71 | macr=000AF
 72 | micro=000B5
 73 | middot=000B7
 74 | nbsp=000A0
 75 | not=000AC
 76 | ntilde=000F1
 77 | oacute=000F3
 78 | ocirc=000F4
 79 | ograve=000F2
 80 | ordf=000AA
 81 | ordm=000BA
 82 | oslash=000F8
 83 | otilde=000F5
 84 | ouml=000F6
 85 | para=000B6
 86 | plusmn=000B1
 87 | pound=000A3
 88 | quot=00022
 89 | raquo=000BB
 90 | reg=000AE
 91 | sect=000A7
 92 | shy=000AD
 93 | sup1=000B9
 94 | sup2=000B2
 95 | sup3=000B3
 96 | szlig=000DF
 97 | thorn=000FE
 98 | times=000D7
 99 | uacute=000FA
100 | ucirc=000FB
101 | ugrave=000F9
102 | uml=000A8
103 | uuml=000FC
104 | yacute=000FD
105 | yen=000A5
106 | yuml=000FF
107 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  HTML document structure nodes.
3 |  */
4 | package org.jsoup.nodes;


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  Contains the main {@link org.jsoup.Jsoup} class, which provides convenient static access to the jsoup functionality. 
3 |  */
4 | package org.jsoup;


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/ITokeniserState.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | /**
 4 |  * @author code4crafter@gmail.com
 5 |  */
 6 | interface ITokeniserState {
 7 | 
 8 |     abstract void read(Tokeniser t, CharacterReader r);
 9 | }
10 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/MiniSoupTokeniserState.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | /**
 4 |  * 词法分析状态机。
 5 |  * States and transition activations for the Tokeniser.
 6 |  */
 7 | enum MiniSoupTokeniserState implements ITokeniserState {
 8 |     /**
 9 |      * 什么层级都没有的状态
10 |      * ⬇
11 |      * <div>test</div>
12 |      *      ⬇
13 |      * <div>test</div>
14 |      */
15 |     Data {
16 |         // in data state, gather characters until a character reference or tag is found
17 |         public void read(Tokeniser t, CharacterReader r) {
18 |             switch (r.current()) {
19 |                 case '<':
20 |                     t.advanceTransition(TagOpen);
21 |                     break;
22 |                 case eof:
23 |                     t.emit(new Token.EOF());
24 |                     break;
25 |                 default:
26 |                     String data = r.consumeToAny('&', '<', nullChar);
27 |                     t.emit(data);
28 |                     break;
29 |             }
30 |         }
31 |     },
32 |     /**
33 |      * ⬇
34 |      * <div>test</div>
35 |      */
36 |     TagOpen {
37 |         // from < in data
38 |         public void read(Tokeniser t, CharacterReader r) {
39 |             switch (r.current()) {
40 |                 case '/':
41 |                     t.advanceTransition(EndTagOpen);
42 |                     break;
43 |                 default:
44 |                     if (r.matchesLetter()) {
45 |                         t.createTagPending(true);
46 |                         t.transition(TagName);
47 |                     } else {
48 |                         t.error(this);
49 |                         t.emit('<'); // char that got us here
50 |                         t.transition(Data);
51 |                     }
52 |                     break;
53 |             }
54 |         }
55 |     },
56 |     /**
57 |      *           ⬇
58 |      * <div>test</div>
59 |      */
60 |     EndTagOpen {
61 |         public void read(Tokeniser t, CharacterReader r) {
62 |             if (r.isEmpty()) {
63 |                 t.eofError(this);
64 |                 t.emit("</");
65 |                 t.transition(Data);
66 |             } else if (r.matches('>')) {
67 |                 t.error(this);
68 |                 t.advanceTransition(Data);
69 |             }
70 |         }
71 |     },
72 |     /**
73 |      *  ⬇
74 |      * <div>test</div>
75 |      */
76 |     TagName {
77 |         // from < or </ in data, will have start or end tag pending
78 |         public void read(Tokeniser t, CharacterReader r) {
79 |             // previous TagOpen state did NOT consume, will have a letter char in current
80 |             String tagName = r.consumeToAny('\t', '\n', '\r', '\f', ' ', '/', '>', nullChar).toLowerCase();
81 |             t.tagPending.appendTagName(tagName);
82 | 
83 |             switch (r.consume()) {
84 |                 case '>':
85 |                     t.emitTagPending();
86 |                     t.transition(Data);
87 |                     break;
88 |             }
89 |         }
90 |     };
91 | 
92 | 
93 |     public abstract void read(Tokeniser t, CharacterReader r);
94 | 
95 |     private static final char nullChar = '\u0000';
96 |     private static final char eof = CharacterReader.EOF;
97 | 
98 | }


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/ParseError.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | /**
 4 |  * A Parse Error records an error in the input HTML that occurs in either the tokenisation or the tree building phase.
 5 |  */
 6 | public class ParseError {
 7 |     private int pos;
 8 |     private String errorMsg;
 9 | 
10 |     ParseError(int pos, String errorMsg) {
11 |         this.pos = pos;
12 |         this.errorMsg = errorMsg;
13 |     }
14 | 
15 |     ParseError(int pos, String errorFormat, Object... args) {
16 |         this.errorMsg = String.format(errorFormat, args);
17 |         this.pos = pos;
18 |     }
19 | 
20 |     /**
21 |      * Retrieve the error message.
22 |      * @return the error message.
23 |      */
24 |     public String getErrorMessage() {
25 |         return errorMsg;
26 |     }
27 | 
28 |     /**
29 |      * Retrieves the offset of the error.
30 |      * @return error offset within input
31 |      */
32 |     public int getPosition() {
33 |         return pos;
34 |     }
35 | 
36 |     @Override
37 |     public String toString() {
38 |         return pos + ": " + errorMsg;
39 |     }
40 | }
41 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/ParseErrorList.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | import java.util.ArrayList;
 4 | 
 5 | /**
 6 |  * parse错误结果容器
 7 |  * A container for ParseErrors.
 8 |  * 
 9 |  * @author Jonathan Hedley
10 |  */
11 | class ParseErrorList extends ArrayList<ParseError>{
12 |     private static final int INITIAL_CAPACITY = 16;
13 |     private final int maxSize;
14 |     
15 |     ParseErrorList(int initialCapacity, int maxSize) {
16 |         super(initialCapacity);
17 |         this.maxSize = maxSize;
18 |     }
19 |     
20 |     boolean canAddError() {
21 |         return size() < maxSize;
22 |     }
23 | 
24 |     int getMaxSize() {
25 |         return maxSize;
26 |     }
27 | 
28 |     static ParseErrorList noTracking() {
29 |         return new ParseErrorList(0, 0);
30 |     }
31 |     
32 |     static ParseErrorList tracking(int maxSize) {
33 |         return new ParseErrorList(INITIAL_CAPACITY, maxSize);
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/Parser.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.parser;
  2 | 
  3 | import org.jsoup.nodes.Document;
  4 | import org.jsoup.nodes.Element;
  5 | import org.jsoup.nodes.Node;
  6 | 
  7 | import java.util.List;
  8 | 
  9 | /**
 10 |  * HTML parser的facade，封装了常用的parse函数。
 11 |  * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the  more convenient parse methods
 12 |  * in {@link org.jsoup.Jsoup}.
 13 |  */
 14 | public class Parser {
 15 |     private static final int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled.
 16 |     
 17 |     private TreeBuilder treeBuilder;
 18 |     private int maxErrors = DEFAULT_MAX_ERRORS;
 19 |     private ParseErrorList errors;
 20 | 
    /**
     * Create a new Parser, using the specified TreeBuilder.
     * Error tracking starts out disabled, because {@code maxErrors} is initialised to {@code DEFAULT_MAX_ERRORS} (0).
     * @param treeBuilder TreeBuilder to use to parse input into Documents.
     */
    public Parser(TreeBuilder treeBuilder) {
        this.treeBuilder = treeBuilder;
    }
 28 |     
 29 |     public Document parseInput(String html, String baseUri) {
 30 |         errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
 31 |         Document doc = treeBuilder.parse(html, baseUri, errors);
 32 |         return doc;
 33 |     }
 34 | 
 35 |     // gets & sets
 36 |     /**
 37 |      * Get the TreeBuilder currently in use.
 38 |      * @return current TreeBuilder.
 39 |      */
 40 |     public TreeBuilder getTreeBuilder() {
 41 |         return treeBuilder;
 42 |     }
 43 | 
 44 |     /**
 45 |      * Update the TreeBuilder used when parsing content.
 46 |      * @param treeBuilder current TreeBuilder
 47 |      * @return this, for chaining
 48 |      */
 49 |     public Parser setTreeBuilder(TreeBuilder treeBuilder) {
 50 |         this.treeBuilder = treeBuilder;
 51 |         return this;
 52 |     }
 53 | 
 54 |     /**
 55 |      * Check if parse error tracking is enabled.
 56 |      * @return current track error state.
 57 |      */
 58 |     public boolean isTrackErrors() {
 59 |         return maxErrors > 0;
 60 |     }
 61 | 
 62 |     /**
 63 |      * Enable or disable parse error tracking for the next parse.
 64 |      * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
 65 |      * @return this, for chaining
 66 |      */
 67 |     public Parser setTrackErrors(int maxErrors) {
 68 |         this.maxErrors = maxErrors;
 69 |         return this;
 70 |     }
 71 | 
 72 |     /**
 73 |      * Retrieve the parse errors, if any, from the last parse.
 74 |      * @return list of parse errors, up to the size of the maximum errors tracked.
 75 |      */
 76 |     public List<ParseError> getErrors() {
 77 |         return errors;
 78 |     }
 79 | 
 80 |     // static parse functions below
 81 |     /**
 82 |      * Parse HTML into a Document.
 83 |      *
 84 |      * @param html HTML to parse
 85 |      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
 86 |      *
 87 |      * @return parsed Document
 88 |      */
 89 |     public static Document parse(String html, String baseUri) {
 90 |         TreeBuilder treeBuilder = new HtmlTreeBuilder();
 91 |         return treeBuilder.parse(html, baseUri, ParseErrorList.noTracking());
 92 |     }
 93 | 
 94 |     /**
 95 |      * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
 96 |      *
 97 |      * @param fragmentHtml the fragment of HTML to parse
 98 |      * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
 99 |      * provides stack context (for implicit element creation).
100 |      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
101 |      *
102 |      * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
103 |      */
104 |     public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
105 |         HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
106 |         return treeBuilder.parseFragment(fragmentHtml, context, baseUri, ParseErrorList.noTracking());
107 |     }
108 | 
109 |     /**
110 |      * Parse a fragment of XML into a list of nodes.
111 |      *
112 |      * @param fragmentXml the fragment of XML to parse
113 |      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
114 |      * @return list of nodes parsed from the input XML.
115 |      */
116 |     public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
117 |         XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
118 |         return treeBuilder.parseFragment(fragmentXml, baseUri, ParseErrorList.noTracking());
119 |     }
120 | 
121 |     /**
122 |      * Parse a fragment of HTML into the {@code body} of a Document.
123 |      *
124 |      * @param bodyHtml fragment of HTML
125 |      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
126 |      *
127 |      * @return Document, with empty head, and HTML parsed into body
128 |      */
129 |     public static Document parseBodyFragment(String bodyHtml, String baseUri) {
130 |         Document doc = Document.createShell(baseUri);
131 |         Element body = doc.body();
132 |         List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
133 |         Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified when re-parented
134 |         for (Node node : nodes) {
135 |             body.appendChild(node);
136 |         }
137 |         return doc;
138 |     }
139 | 
140 |     /**
141 |      * Utility method to unescape HTML entities from a string
142 |      * @param string HTML escaped string
143 |      * @param inAttribute if the string is to be escaped in strict mode (as attributes are)
144 |      * @return an unescaped string
145 |      */
146 |     public static String unescapeEntities(String string, boolean inAttribute) {
147 |         Tokeniser tokeniser = new Tokeniser(new CharacterReader(string), ParseErrorList.noTracking());
148 |         return tokeniser.unescapeEntities(inAttribute);
149 |     }
150 | 
151 |     /**
152 |      * @param bodyHtml HTML to parse
153 |      * @param baseUri baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
154 |      *
155 |      * @return parsed Document
156 |      * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead.
157 |      */
158 |     public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) {
159 |         return parse(bodyHtml, baseUri);
160 |     }
161 |     
162 |     // builders
163 | 
164 |     /**
165 |      * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
166 |      * based on a knowledge of the semantics of the incoming tags.
167 |      * @return a new HTML parser.
168 |      */
169 |     public static Parser htmlParser() {
170 |         return new Parser(new HtmlTreeBuilder());
171 |     }
172 | 
173 |     /**
174 |      * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
175 |      * rather creates a simple tree directly from the input.
176 |      * @return a new simple XML parser.
177 |      */
178 |     public static Parser xmlParser() {
179 |         return new Parser(new XmlTreeBuilder());
180 |     }
181 | }
182 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/TreeBuilder.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | import org.jsoup.helper.DescendableLinkedList;
 4 | import org.jsoup.helper.Validate;
 5 | import org.jsoup.nodes.Document;
 6 | import org.jsoup.nodes.Element;
 7 | 
 8 | import java.util.ArrayList;
 9 | import java.util.List;
10 | 
11 | /**
12 |  * @author Jonathan Hedley
13 |  */
14 | abstract class TreeBuilder {
15 |     CharacterReader reader;
16 |     Tokeniser tokeniser;
17 |     protected Document doc; // current doc we are building into
18 |     protected DescendableLinkedList<Element> stack; // the stack of open elements
19 |     protected String baseUri; // current base uri, for creating new elements
20 |     protected Token currentToken; // currentToken is used only for error tracking.
21 |     protected ParseErrorList errors; // null when not tracking errors
22 | 
23 |     protected void initialiseParse(String input, String baseUri, ParseErrorList errors) {
24 |         Validate.notNull(input, "String input must not be null");
25 |         Validate.notNull(baseUri, "BaseURI must not be null");
26 | 
27 |         doc = new Document(baseUri);
28 |         reader = new CharacterReader(input);
29 |         this.errors = errors;
30 |         tokeniser = new Tokeniser(reader, errors);
31 |         stack = new DescendableLinkedList<Element>();
32 |         this.baseUri = baseUri;
33 |     }
34 | 
35 |     Document parse(String input, String baseUri) {
36 |         return parse(input, baseUri, ParseErrorList.noTracking());
37 |     }
38 | 
39 |     Document parse(String input, String baseUri, ParseErrorList errors) {
40 |         initialiseParse(input, baseUri, errors);
41 |         runParser();
42 |         return doc;
43 |     }
44 | 
45 |     protected void runParser() {
46 |         while (true) {
47 |             Token token = tokeniser.read();
48 |             process(token);
49 | 
50 |             if (token.type == Token.TokenType.EOF)
51 |                 break;
52 |         }
53 |     }
54 | 
55 |     protected abstract boolean process(Token token);
56 | 
57 |     protected Element currentElement() {
58 |         return stack.getLast();
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/XmlTreeBuilder.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.parser;
  2 | 
  3 | import org.jsoup.helper.Validate;
  4 | import org.jsoup.nodes.*;
  5 | 
  6 | import java.util.Iterator;
  7 | import java.util.List;
  8 | 
  9 | /**
 10 |  * @author Jonathan Hedley
 11 |  */
 12 | public class XmlTreeBuilder extends TreeBuilder {
 13 |     @Override
 14 |     protected void initialiseParse(String input, String baseUri, ParseErrorList errors) {
 15 |         super.initialiseParse(input, baseUri, errors);
 16 |         stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack)
 17 |     }
 18 | 
 19 |     @Override
 20 |     protected boolean process(Token token) {
 21 |         // start tag, end tag, doctype, comment, character, eof
 22 |         switch (token.type) {
 23 |             case StartTag:
 24 |                 insert(token.asStartTag());
 25 |                 break;
 26 |             case EndTag:
 27 |                 popStackToClose(token.asEndTag());
 28 |                 break;
 29 |             case Comment:
 30 |                 insert(token.asComment());
 31 |                 break;
 32 |             case Character:
 33 |                 insert(token.asCharacter());
 34 |                 break;
 35 |             case Doctype:
 36 |                 insert(token.asDoctype());
 37 |                 break;
 38 |             case EOF: // could put some normalisation here if desired
 39 |                 break;
 40 |             default:
 41 |                 Validate.fail("Unexpected token type: " + token.type);
 42 |         }
 43 |         return true;
 44 |     }
 45 | 
 46 |     private void insertNode(Node node) {
 47 |         currentElement().appendChild(node);
 48 |     }
 49 | 
 50 |     Element insert(Token.StartTag startTag) {
 51 |         Tag tag = Tag.valueOf(startTag.name());
 52 |         // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html.
 53 |         Element el = new Element(tag, baseUri, startTag.attributes);
 54 |         insertNode(el);
 55 |         if (startTag.isSelfClosing()) {
 56 |             tokeniser.acknowledgeSelfClosingFlag();
 57 |             if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output. see above.
 58 |                 tag.setSelfClosing();
 59 |         } else {
 60 |             stack.add(el);
 61 |         }
 62 |         return el;
 63 |     }
 64 | 
 65 |     void insert(Token.Comment commentToken) {
 66 |         Comment comment = new Comment(commentToken.getData(), baseUri);
 67 |         Node insert = comment;
 68 |         if (commentToken.bogus) { // xml declarations are emitted as bogus comments (which is right for html, but not xml)
 69 |             String data = comment.getData();
 70 |             if (data.length() > 1 && (data.startsWith("!") || data.startsWith("?"))) {
 71 |                 String declaration = data.substring(1);
 72 |                 insert = new XmlDeclaration(declaration, comment.baseUri(), data.startsWith("!"));
 73 |             }
 74 |         }
 75 |         insertNode(insert);
 76 |     }
 77 | 
 78 |     void insert(Token.Character characterToken) {
 79 |         Node node = new TextNode(characterToken.getData(), baseUri);
 80 |         insertNode(node);
 81 |     }
 82 | 
 83 |     void insert(Token.Doctype d) {
 84 |         DocumentType doctypeNode = new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri);
 85 |         insertNode(doctypeNode);
 86 |     }
 87 | 
 88 |     /**
 89 |      * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
 90 |      * found, skips.
 91 |      *
 92 |      * @param endTag
 93 |      */
 94 |     private void popStackToClose(Token.EndTag endTag) {
 95 |         String elName = endTag.name();
 96 |         Element firstFound = null;
 97 | 
 98 |         Iterator<Element> it = stack.descendingIterator();
 99 |         while (it.hasNext()) {
100 |             Element next = it.next();
101 |             if (next.nodeName().equals(elName)) {
102 |                 firstFound = next;
103 |                 break;
104 |             }
105 |         }
106 |         if (firstFound == null)
107 |             return; // not found, skip
108 | 
109 |         it = stack.descendingIterator();
110 |         while (it.hasNext()) {
111 |             Element next = it.next();
112 |             if (next == firstFound) {
113 |                 it.remove();
114 |                 break;
115 |             } else {
116 |                 it.remove();
117 |             }
118 |         }
119 |     }
120 | 
121 |     List<Node> parseFragment(String inputFragment, String baseUri, ParseErrorList errors) {
122 |         initialiseParse(inputFragment, baseUri, errors);
123 |         runParser();
124 |         return doc.childNodes();
125 |     }
126 | }
127 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  Contains the HTML parser, tag specifications, and HTML tokeniser.
3 |  */
4 | package org.jsoup.parser;
5 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/safety/Cleaner.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.safety;
  2 | 
  3 | import org.jsoup.helper.Validate;
  4 | import org.jsoup.nodes.*;
  5 | import org.jsoup.parser.Tag;
  6 | import org.jsoup.select.NodeTraversor;
  7 | import org.jsoup.select.NodeVisitor;
  8 | 
  9 | import java.util.List;
 10 | 
 11 | /**
 12 |  The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
 13 |  that you are expecting; no junk, and no cross-site scripting attacks!
 14 |  <p/>
 15 |  The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain
 16 |  HTML that is allowed by the whitelist.
 17 |  <p/>
 18 |  It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
 19 |  canned white-lists only allow body contained tags.
 20 |  <p/>
 21 |  Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
 22 |  */
 23 | public class Cleaner {
 24 |     private Whitelist whitelist;
 25 | 
 26 |     /**
 27 |      Create a new cleaner, that sanitizes documents using the supplied whitelist.
 28 |      @param whitelist white-list to clean with
 29 |      */
 30 |     public Cleaner(Whitelist whitelist) {
 31 |         Validate.notNull(whitelist);
 32 |         this.whitelist = whitelist;
 33 |     }
 34 | 
 35 |     /**
 36 |      Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist.
 37 |      The original document is not modified. Only elements from the dirt document's <code>body</code> are used.
 38 |      @param dirtyDocument Untrusted base document to clean.
 39 |      @return cleaned document.
 40 |      */
 41 |     public Document clean(Document dirtyDocument) {
 42 |         Validate.notNull(dirtyDocument);
 43 | 
 44 |         Document clean = Document.createShell(dirtyDocument.baseUri());
 45 |         if (dirtyDocument.body() != null) // frameset documents won't have a body. the clean doc will have empty body.
 46 |             copySafeNodes(dirtyDocument.body(), clean.body());
 47 | 
 48 |         return clean;
 49 |     }
 50 | 
 51 |     /**
 52 |      Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes
 53 |      in the input HTML are allowed by the whitelist.
 54 |      <p/>
 55 |      This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully
 56 |      using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document
 57 |      to ensure enforced attributes are set correctly, and that the output is tidied.
 58 |      @param dirtyDocument document to test
 59 |      @return true if no tags or attributes need to be removed; false if they do
 60 |      */
 61 |     public boolean isValid(Document dirtyDocument) {
 62 |         Validate.notNull(dirtyDocument);
 63 | 
 64 |         Document clean = Document.createShell(dirtyDocument.baseUri());
 65 |         int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
 66 |         return numDiscarded == 0;
 67 |     }
 68 | 
 69 |     /**
 70 |      Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
 71 |      */
 72 |     private final class CleaningVisitor implements NodeVisitor {
 73 |         private int numDiscarded = 0;
 74 |         private final Element root;
 75 |         private Element destination; // current element to append nodes to
 76 | 
 77 |         private CleaningVisitor(Element root, Element destination) {
 78 |             this.root = root;
 79 |             this.destination = destination;
 80 |         }
 81 | 
 82 |         public void head(Node source, int depth) {
 83 |             if (source instanceof Element) {
 84 |                 Element sourceEl = (Element) source;
 85 | 
 86 |                 if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
 87 |                     ElementMeta meta = createSafeElement(sourceEl);
 88 |                     Element destChild = meta.el;
 89 |                     destination.appendChild(destChild);
 90 | 
 91 |                     numDiscarded += meta.numAttribsDiscarded;
 92 |                     destination = destChild;
 93 |                 } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
 94 |                     numDiscarded++;
 95 |                 }
 96 |             } else if (source instanceof TextNode) {
 97 |                 TextNode sourceText = (TextNode) source;
 98 |                 TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());
 99 |                 destination.appendChild(destText);
100 |             } else { // else, we don't care about comments, xml proc instructions, etc
101 |                 numDiscarded++;
102 |             }
103 |         }
104 | 
105 |         public void tail(Node source, int depth) {
106 |             if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {
107 |                 destination = destination.parent(); // would have descended, so pop destination stack
108 |             }
109 |         }
110 |     }
111 | 
112 |     private int copySafeNodes(Element source, Element dest) {
113 |         CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
114 |         NodeTraversor traversor = new NodeTraversor(cleaningVisitor);
115 |         traversor.traverse(source);
116 |         return cleaningVisitor.numDiscarded;
117 |     }
118 | 
119 |     private ElementMeta createSafeElement(Element sourceEl) {
120 |         String sourceTag = sourceEl.tagName();
121 |         Attributes destAttrs = new Attributes();
122 |         Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);
123 |         int numDiscarded = 0;
124 | 
125 |         Attributes sourceAttrs = sourceEl.attributes();
126 |         for (Attribute sourceAttr : sourceAttrs) {
127 |             if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
128 |                 destAttrs.put(sourceAttr);
129 |             else
130 |                 numDiscarded++;
131 |         }
132 |         Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
133 |         destAttrs.addAll(enforcedAttrs);
134 | 
135 |         return new ElementMeta(dest, numDiscarded);
136 |     }
137 | 
138 |     private static class ElementMeta {
139 |         Element el;
140 |         int numAttribsDiscarded;
141 | 
142 |         ElementMeta(Element el, int numAttribsDiscarded) {
143 |             this.el = el;
144 |             this.numAttribsDiscarded = numAttribsDiscarded;
145 |         }
146 |     }
147 | 
148 | }
149 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/safety/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  Contains the jsoup HTML cleaner, and whitelist definitions.
3 |  */
4 | package org.jsoup.safety;
5 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/Collector.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.select;
 2 | 
 3 | import org.jsoup.nodes.Element;
 4 | import org.jsoup.nodes.Node;
 5 | 
 6 | /**
 7 |  * Collects a list of elements that match the supplied criteria.
 8 |  *
 9 |  * @author Jonathan Hedley
10 |  */
11 | public class Collector {
12 | 
13 |     private Collector() {
14 |     }
15 | 
16 |     /**
17 |      Build a list of elements, by visiting root and every descendant of root, and testing it against the evaluator.
18 |      @param eval Evaluator to test elements against
19 |      @param root root of tree to descend
20 |      @return list of matches; empty if none
21 |      */
22 |     public static Elements collect (Evaluator eval, Element root) {
23 |         Elements elements = new Elements();
24 |         new NodeTraversor(new Accumulator(root, elements, eval)).traverse(root);
25 |         return elements;
26 |     }
27 | 
28 |     private static class Accumulator implements NodeVisitor {
29 |         private final Element root;
30 |         private final Elements elements;
31 |         private final Evaluator eval;
32 | 
33 |         Accumulator(Element root, Elements elements, Evaluator eval) {
34 |             this.root = root;
35 |             this.elements = elements;
36 |             this.eval = eval;
37 |         }
38 | 
39 |         public void head(Node node, int depth) {
40 |             if (node instanceof Element) {
41 |                 Element el = (Element) node;
42 |                 if (eval.matches(root, el))
43 |                     elements.add(el);
44 |             }
45 |         }
46 | 
47 |         public void tail(Node node, int depth) {
48 |             // void
49 |         }
50 |     }
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/CombiningEvaluator.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.select;
 2 | 
 3 | import org.jsoup.helper.StringUtil;
 4 | import org.jsoup.nodes.Element;
 5 | 
 6 | import java.util.ArrayList;
 7 | import java.util.Arrays;
 8 | import java.util.Collection;
 9 | import java.util.List;
10 | 
11 | /**
12 |  * Base combining (and, or) evaluator.
13 |  */
14 | abstract class CombiningEvaluator extends Evaluator {
15 |     final List<Evaluator> evaluators;
16 | 
17 |     CombiningEvaluator() {
18 |         super();
19 |         evaluators = new ArrayList<Evaluator>();
20 |     }
21 | 
22 |     CombiningEvaluator(Collection<Evaluator> evaluators) {
23 |         this();
24 |         this.evaluators.addAll(evaluators);
25 |     }
26 | 
27 |     Evaluator rightMostEvaluator() {
28 |         return evaluators.size() > 0 ? evaluators.get(evaluators.size() - 1) : null;
29 |     }
30 |     
31 |     void replaceRightMostEvaluator(Evaluator replacement) {
32 |         evaluators.set(evaluators.size() - 1, replacement);
33 |     }
34 | 
35 |     static final class And extends CombiningEvaluator {
36 |         And(Collection<Evaluator> evaluators) {
37 |             super(evaluators);
38 |         }
39 | 
40 |         And(Evaluator... evaluators) {
41 |             this(Arrays.asList(evaluators));
42 |         }
43 | 
44 |         @Override
45 |         public boolean matches(Element root, Element node) {
46 |             for (int i = 0; i < evaluators.size(); i++) {
47 |                 Evaluator s = evaluators.get(i);
48 |                 if (!s.matches(root, node))
49 |                     return false;
50 |             }
51 |             return true;
52 |         }
53 | 
54 |         @Override
55 |         public String toString() {
56 |             return StringUtil.join(evaluators, " ");
57 |         }
58 |     }
59 | 
60 |     static final class Or extends CombiningEvaluator {
61 |         /**
62 |          * Create a new Or evaluator. The initial evaluators are ANDed together and used as the first clause of the OR.
63 |          * @param evaluators initial OR clause (these are wrapped into an AND evaluator).
64 |          */
65 |         Or(Collection<Evaluator> evaluators) {
66 |             super();
67 |             if (evaluators.size() > 1)
68 |                 this.evaluators.add(new And(evaluators));
69 |             else // 0 or 1
70 |                 this.evaluators.addAll(evaluators);
71 |         }
72 | 
73 |         Or() {
74 |             super();
75 |         }
76 | 
77 |         public void add(Evaluator e) {
78 |             evaluators.add(e);
79 |         }
80 | 
81 |         @Override
82 |         public boolean matches(Element root, Element node) {
83 |             for (int i = 0; i < evaluators.size(); i++) {
84 |                 Evaluator s = evaluators.get(i);
85 |                 if (s.matches(root, node))
86 |                     return true;
87 |             }
88 |             return false;
89 |         }
90 | 
91 |         @Override
92 |         public String toString() {
93 |             return String.format(":or%s", evaluators);
94 |         }
95 |     }
96 | }
97 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/NodeTraversor.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.select;
 2 | 
 3 | import org.jsoup.nodes.Node;
 4 | 
 5 | /**
 6 |  * Depth-first node traversor. Use to iterate through all nodes under and including the specified root node.
 7 |  * <p/>
 8 |  * This implementation does not use recursion, so a deep DOM does not risk blowing the stack.
 9 |  */
10 | public class NodeTraversor {
11 |     private NodeVisitor visitor;
12 | 
13 |     /**
14 |      * Create a new traversor.
15 |      * @param visitor a class implementing the {@link NodeVisitor} interface, to be called when visiting each node.
16 |      */
17 |     public NodeTraversor(NodeVisitor visitor) {
18 |         this.visitor = visitor;
19 |     }
20 | 
21 |     /**
22 |      * Start a depth-first traverse of the root and all of its descendants.
23 |      * @param root the root node point to traverse.
24 |      */
25 |     public void traverse(Node root) {
26 |         Node node = root;
27 |         int depth = 0;
28 |         
29 |         while (node != null) {
30 |             visitor.head(node, depth);
31 |             if (node.childNodeSize() > 0) {
32 |                 node = node.childNode(0);
33 |                 depth++;
34 |             } else {
35 |                 while (node.nextSibling() == null && depth > 0) {
36 |                     visitor.tail(node, depth);
37 |                     node = node.parent();
38 |                     depth--;
39 |                 }
40 |                 visitor.tail(node, depth);
41 |                 if (node == root)
42 |                     break;
43 |                 node = node.nextSibling();
44 |             }
45 |         }
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/NodeVisitor.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.select;
 2 | 
 3 | import org.jsoup.nodes.Node;
 4 | 
 5 | /**
 6 |  * Node visitor interface. Provide an implementing class to {@link NodeTraversor} to iterate through nodes.
 7 |  * <p/>
 8 |  * This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first
 9 |  * seen, and the tail method when all of the node's children have been visited. As an example, head can be used to
10 |  * create a start tag for a node, and tail to create the end tag.
11 |  */
12 | public interface NodeVisitor {
13 |     /**
14 |      * Callback for when a node is first visited.
15 |      *
16 |      * @param node the node being visited.
17 |      * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
18 |      * of that will have depth 1.
19 |      */
20 |     public void head(Node node, int depth);
21 | 
22 |     /**
23 |      * Callback for when a node is last visited, after all of its descendants have been visited.
24 |      *
25 |      * @param node the node being visited.
26 |      * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
27 |      * of that will have depth 1.
28 |      */
29 |     public void tail(Node node, int depth);
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/StructuralEvaluator.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.select;
  2 | 
  3 | import org.jsoup.nodes.Element;
  4 | 
  5 | /**
  6 |  * Base structural evaluator.
  7 |  */
  8 | abstract class StructuralEvaluator extends Evaluator {
  9 |     Evaluator evaluator;
 10 | 
 11 |     static class Root extends Evaluator {
 12 |         public boolean matches(Element root, Element element) {
 13 |             return root == element;
 14 |         }
 15 |     }
 16 | 
 17 |     static class Has extends StructuralEvaluator {
 18 |         public Has(Evaluator evaluator) {
 19 |             this.evaluator = evaluator;
 20 |         }
 21 | 
 22 |         public boolean matches(Element root, Element element) {
 23 |             for (Element e : element.getAllElements()) {
 24 |                 if (e != element && evaluator.matches(root, e))
 25 |                     return true;
 26 |             }
 27 |             return false;
 28 |         }
 29 | 
 30 |         public String toString() {
 31 |             return String.format(":has(%s)", evaluator);
 32 |         }
 33 |     }
 34 | 
 35 |     static class Not extends StructuralEvaluator {
 36 |         public Not(Evaluator evaluator) {
 37 |             this.evaluator = evaluator;
 38 |         }
 39 | 
 40 |         public boolean matches(Element root, Element node) {
 41 |             return !evaluator.matches(root, node);
 42 |         }
 43 | 
 44 |         public String toString() {
 45 |             return String.format(":not%s", evaluator);
 46 |         }
 47 |     }
 48 | 
 49 |     static class Parent extends StructuralEvaluator {
 50 |         public Parent(Evaluator evaluator) {
 51 |             this.evaluator = evaluator;
 52 |         }
 53 | 
 54 |         public boolean matches(Element root, Element element) {
 55 |             if (root == element)
 56 |                 return false;
 57 | 
 58 |             Element parent = element.parent();
 59 |             while (parent != root) {
 60 |                 if (evaluator.matches(root, parent))
 61 |                     return true;
 62 |                 parent = parent.parent();
 63 |             }
 64 |             return false;
 65 |         }
 66 | 
 67 |         public String toString() {
 68 |             return String.format(":parent%s", evaluator);
 69 |         }
 70 |     }
 71 | 
 72 |     static class ImmediateParent extends StructuralEvaluator {
 73 |         public ImmediateParent(Evaluator evaluator) {
 74 |             this.evaluator = evaluator;
 75 |         }
 76 | 
 77 |         public boolean matches(Element root, Element element) {
 78 |             if (root == element)
 79 |                 return false;
 80 | 
 81 |             Element parent = element.parent();
 82 |             return parent != null && evaluator.matches(root, parent);
 83 |         }
 84 | 
 85 |         public String toString() {
 86 |             return String.format(":ImmediateParent%s", evaluator);
 87 |         }
 88 |     }
 89 | 
 90 |     static class PreviousSibling extends StructuralEvaluator {
 91 |         public PreviousSibling(Evaluator evaluator) {
 92 |             this.evaluator = evaluator;
 93 |         }
 94 | 
 95 |         public boolean matches(Element root, Element element) {
 96 |             if (root == element)
 97 |                 return false;
 98 | 
 99 |             Element prev = element.previousElementSibling();
100 | 
101 |             while (prev != null) {
102 |                 if (evaluator.matches(root, prev))
103 |                     return true;
104 | 
105 |                 prev = prev.previousElementSibling();
106 |             }
107 |             return false;
108 |         }
109 | 
110 |         public String toString() {
111 |             return String.format(":prev*%s", evaluator);
112 |         }
113 |     }
114 | 
115 |     static class ImmediatePreviousSibling extends StructuralEvaluator {
116 |         public ImmediatePreviousSibling(Evaluator evaluator) {
117 |             this.evaluator = evaluator;
118 |         }
119 | 
120 |         public boolean matches(Element root, Element element) {
121 |             if (root == element)
122 |                 return false;
123 | 
124 |             Element prev = element.previousElementSibling();
125 |             return prev != null && evaluator.matches(root, prev);
126 |         }
127 | 
128 |         public String toString() {
129 |             return String.format(":prev%s", evaluator);
130 |         }
131 |     }
132 | }
133 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  Packages to support the CSS-style element selector.
3 |  */
4 | package org.jsoup.select;


--------------------------------------------------------------------------------
/src/main/java/us/codecraft/learning/automata/ABStateMachine.java:
--------------------------------------------------------------------------------
 1 | package us.codecraft.learning.automata;
 2 | 
 3 | /**
 4 |  * @author code4crafter@gmail.com
 5 |  */
 6 | public interface ABStateMachine {
 7 | 
 8 |     void process(StringReader reader) throws StringReader.EOFException;
 9 | }
10 | 


--------------------------------------------------------------------------------
/src/main/java/us/codecraft/learning/automata/StateModelABStateMachine.java:
--------------------------------------------------------------------------------
 1 | package us.codecraft.learning.automata;
 2 | 
 3 | /**
 4 |  * @author code4crafter@gmail.com
 5 |  */
 6 | public class StateModelABStateMachine implements ABStateMachine {
 7 | 
 8 |     State state;
 9 | 
10 |     StringBuilder accum;
11 | 
12 |     enum State {
13 |         Init {
14 |             @Override
15 |             public void process(StateModelABStateMachine stateModelABStateMachine, StringReader reader) throws StringReader.EOFException {
16 |                 char ch = reader.read();
17 |                 if (ch == 'a') {
18 |                     stateModelABStateMachine.state = AfterA;
19 |                     stateModelABStateMachine.accum.append(ch);
20 |                 }
21 |             }
22 |         },
23 |         Accept {
24 |             @Override
25 |             public void process(StateModelABStateMachine stateModelABStateMachine, StringReader reader) throws StringReader.EOFException {
26 |                 System.out.println("find " + stateModelABStateMachine.accum.toString());
27 |                 stateModelABStateMachine.accum = new StringBuilder();
28 |                 stateModelABStateMachine.state = Init;
29 |                 reader.unread();
30 |             }
31 |         },
32 |         AfterA {
33 |             @Override
34 |             public void process(StateModelABStateMachine stateModelABStateMachine, StringReader reader) throws StringReader.EOFException {
35 |                 char ch = reader.read();
36 |                 if (ch == 'b') {
37 |                     stateModelABStateMachine.accum.append(ch);
38 |                     stateModelABStateMachine.state = AfterB;
39 |                 } else {
40 |                     stateModelABStateMachine.state = Accept;
41 |                 }
42 |             }
43 |         },
44 |         AfterB {
45 |             @Override
46 |             public void process(StateModelABStateMachine stateModelABStateMachine, StringReader reader) throws StringReader.EOFException {
47 |                 char ch = reader.read();
48 |                 if (ch == 'b') {
49 |                     stateModelABStateMachine.accum.append(ch);
50 |                     stateModelABStateMachine.state = AfterB;
51 |                 } else {
52 |                     stateModelABStateMachine.state = Accept;
53 |                 }
54 |             }
55 |         };
56 | 
57 |         public void process(StateModelABStateMachine stateModelABStateMachine, StringReader reader) throws StringReader.EOFException {
58 |         }
59 |     }
60 | 
61 |     public void process(StringReader reader) throws StringReader.EOFException {
62 |         state.process(this, reader);
63 |     }
64 | 
65 |     public static void main(String[] args) {
66 |         ABStateMachine abStateMachine = new StateModelABStateMachine();
67 |         String text = "abbbababbbaa";
68 |         StringReader reader = new StringReader(text);
69 |         try {
70 |             while (true) {
71 |                 abStateMachine.process(reader);
72 |             }
73 |         } catch (StringReader.EOFException e) {
74 |         }
75 |     }
76 | }
77 | 


--------------------------------------------------------------------------------
/src/main/java/us/codecraft/learning/automata/StringReader.java:
--------------------------------------------------------------------------------
 1 | package us.codecraft.learning.automata;
 2 | 
 3 | /**
 4 |  * @author code4crafter@gmail.com
 5 |  */
 6 | public class StringReader {
 7 | 
 8 |     class EOFException extends Exception {
 9 | 
10 |     }
11 | 
12 |     private String string;
13 | 
14 |     private int index;
15 | 
16 |     public StringReader(String string) {
17 |         this.string = string;
18 |     }
19 | 
20 |     public char read() throws EOFException {
21 |         if (index < string.length() - 1) {
22 |             return string.charAt(index++);
23 |         } else {
24 |             throw new EOFException();
25 |         }
26 |     }
27 | 
28 |     public void unread() {
29 |         index--;
30 |         if (index < 0) {
31 |             index = 0;
32 |         }
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/us/codecraft/learning/automata/SwitchABStateMachine.java:
--------------------------------------------------------------------------------
 1 | package us.codecraft.learning.automata;
 2 | 
 3 | /**
 4 |  * @author code4crafter@gmail.com
 5 |  */
 6 | public class SwitchABStateMachine implements ABStateMachine {
 7 | 
 8 |     enum State {
 9 |         Init, Accept, AfterA, AfterB;
10 |     }
11 | 
12 |     private StringBuilder accum = new StringBuilder();
13 | 
14 |     private State state = State.Init;
15 | 
16 |     public void process(StringReader reader) throws StringReader.EOFException {
17 |         char ch;
18 |         switch (state) {
19 |             case Init:
20 |                 ch = reader.read();
21 |                 if (ch == 'a') {
22 |                     state = State.AfterA;
23 |                     accum.append(ch);
24 |                 }
25 |                 break;
26 |             case AfterA:
27 |                 ch = reader.read();
28 |                 if (ch == 'b') {
29 |                     accum.append(ch);
30 |                     state = State.AfterB;
31 |                 } else {
32 |                     state = State.Accept;
33 |                 }
34 |                 break;
35 |             case AfterB:
36 |                 ch = reader.read();
37 |                 if (ch == 'b') {
38 |                     accum.append(ch);
39 |                     state = State.AfterB;
40 |                 } else {
41 |                     state = State.Accept;
42 |                 }
43 |                 break;
44 |             case Accept:
45 |                 System.out.println("find " + accum.toString());
46 |                 accum = new StringBuilder();
47 |                 state = State.Init;
48 |                 reader.unread();
49 |                 break;
50 |         }
51 |     }
52 | 
53 |     public static void main(String[] args) {
54 |         ABStateMachine abStateMachine = new SwitchABStateMachine();
55 |         String text = "abbbababbbaa";
56 |         StringReader reader = new StringReader(text);
57 |         try {
58 |             while (true){
59 |                 abStateMachine.process(reader);
60 |             }
61 |         } catch (StringReader.EOFException e) {
62 |         }
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/src/main/java/us/codecraft/learning/parser/PageErrorChecker.java:
--------------------------------------------------------------------------------
 1 | package us.codecraft.learning.parser;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.jsoup.parser.ParseError;
 5 | import org.jsoup.parser.Parser;
 6 | 
 7 | import java.io.IOException;
 8 | import java.util.List;
 9 | 
10 | /**
11 |  * @author code4crafter@gmail.com
12 |  */
13 | public class PageErrorChecker {
14 | 
15 |     public static List<ParseError> check(String url) throws IOException {
16 |         Parser parser = Parser.htmlParser();
17 |         parser.setTrackErrors(100);
18 |         String body = Jsoup.connect(url).userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36")
19 |                 .execute().body();
20 |         parser.parseInput(body, url);
21 |         List<ParseError> errors = parser.getErrors();
22 |         return errors;
23 |     }
24 | 
25 |     public static void main(String[] args) throws IOException {
26 |         List<ParseError> check = check("http://www.dianping.com");
27 |         System.out.println(check);
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/java/us/codecraft/learning/parser/ParserCorrectorTest.java:
--------------------------------------------------------------------------------
 1 | package us.codecraft.learning.parser;
 2 | 
 3 | import org.jsoup.nodes.Document;
 4 | import org.jsoup.parser.ParseError;
 5 | import org.jsoup.parser.Parser;
 6 | 
 7 | import java.util.List;
 8 | 
 9 | /**
10 |  * @author code4crafter@gmail.com
11 |  */
12 | public class ParserCorrectorTest {
13 | 
14 |     public static void main(String[] args) {
15 |         String htmlWithDivUnclosed = "<body>\n" +
16 |                 " <textarea>\n" +
17 |                 "        &lt;!-- Text --&gt;\n" +
18 |                 "        xxx\n" +
19 |                 "    </textarea> \n" +
20 |                 " <div> \n" +
21 |                 " <div>\n" +
22 |                 "  <table> \n" +
23 |                 "   <!-- InTable --> \n" +
24 |                 "   <!-- InTableText --> xxx \n" +
25 |                 "   <tbody> \n" +
26 |                 "    <tr> \n" +
27 |                 "     <!-- InRow --> \n" +
28 |                 "     <td> \n" +
29 |                 "      <!-- InCell --> </td> \n" +
30 |                 "    </tr> \n" +
31 |                 "   </tbody> \n" +
32 |                 "  </table> \n" +
33 |                 " </div> \n" +
34 |                 "</body>";
35 |         Parser parser = Parser.htmlParser();
36 |         parser.setTrackErrors(100);
37 |         Document document = parser.parseInput(htmlWithDivUnclosed, "");
38 |         List<ParseError> errors = parser.getErrors();
39 |         System.out.println(errors);
40 | 
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/src/main/java/us/codecraft/learning/select/SelectorTest.java:
--------------------------------------------------------------------------------
 1 | package us.codecraft.learning.select;
 2 | 
 3 | import org.jsoup.nodes.Document;
 4 | import org.jsoup.parser.Parser;
 5 | import org.jsoup.select.Elements;
 6 | 
 7 | /**
 8 |  * @author code4crafter@gmail.com
 9 |  */
10 | public class SelectorTest {
11 | 
12 |     public static void main(String[] args) {
13 |         String html = "<body>\n" +
14 |                 " <textarea>\n" +
15 |                 "        &lt;!-- Text --&gt;\n" +
16 |                 "        xxx\n" +
17 |                 "    </textarea> \n" +
18 |                 " <div> \n" +
19 |                 "  <table> \n" +
20 |                 "   <!-- InTable --> \n" +
21 |                 "   <!-- InTableText --> xxx \n" +
22 |                 "   <tbody> \n" +
23 |                 "    <tr> \n" +
24 |                 "     <!-- InRow --> \n" +
25 |                 "     <td> \n" +
26 |                 "      <!-- InCell --> </td> \n" +
27 |                 "    </tr> \n" +
28 |                 "   </tbody> \n" +
29 |                 "  </table> \n" +
30 |                 " </div> \n" +
31 |                 "</body>";
32 |         Parser parser = Parser.htmlParser();
33 |         Document document = parser.parseInput(html, "");
34 |         Elements select = document.select("body div");
35 |         System.out.println(select);
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/javadoc/overview.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |     <title>jsoup Javadoc overview</title>
 5 | </head>
 6 | <body>
 7 | <h1>jsoup: Java HTML parser that makes sense of real-world HTML soup.</h1>
 8 | 
 9 | <p><b>jsoup</b> is a Java library for working with real-world HTML. It provides a very convenient API
10 | for extracting and manipulating data, using the best of DOM, CSS, and jQuery-like methods.</p>
11 | 
12 | <p>jsoup implements the <a href="http://whatwg.org/html">WHATWG HTML</a> specification, and parses HTML to the same DOM
13 | as modern browsers do.</p>
14 | 
15 | <ul>
16 | <li>parse HTML from a URL, file, or string
17 | <li>find and extract data, using DOM traversal or CSS selectors
18 | <li>manipulate the HTML elements, attributes, and text
19 | <li>clean user-submitted content against a safe white-list, to prevent XSS
20 | <li>output tidy HTML
21 | </ul>
22 | 
23 | <p>jsoup is designed to deal with all varieties of HTML found in the wild; from pristine and validating,
24 | to invalid tag-soup; jsoup will create a sensible parse tree.</p>
25 | 
26 | <p>See <a href="http://jsoup.org/"><b>jsoup.org</b></a> for downloads, documentation, and examples...</p>
27 | 
28 | @author <a href="http://jonathanhedley.com/">Jonathan Hedley</a>
29 | 
30 | </body>
31 | </html>
32 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/TextUtil.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup;
 2 | 
 3 | /**
 4 |  Text utils to ease testing
 5 | 
 6 |  @author Jonathan Hedley, jonathan@hedley.net */
 7 | public class TextUtil {
 8 |     public static String stripNewlines(String text) {
 9 |         text = text.replaceAll("\\n\\s*", "");
10 |         return text;
11 |     }
12 | }
13 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/helper/DataUtilTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.helper;
 2 | 
 3 | import static org.junit.Assert.assertEquals;
 4 | 
 5 | import org.jsoup.nodes.Document;
 6 | import org.jsoup.parser.Parser;
 7 | import org.junit.Test;
 8 | 
 9 | import java.nio.ByteBuffer;
10 | import java.nio.charset.Charset;
11 | 
12 | public class DataUtilTest {
13 |     @Test
14 |     public void testCharset() {
15 |         assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html;charset=utf-8 "));
16 |         assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset=UTF-8"));
17 |         assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1"));
18 |         assertEquals(null, DataUtil.getCharsetFromContentType("text/html"));
19 |         assertEquals(null, DataUtil.getCharsetFromContentType(null));
20 |         assertEquals(null, DataUtil.getCharsetFromContentType("text/html;charset=Unknown"));
21 |     }
22 | 
23 |     @Test public void testQuotedCharset() {
24 |         assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html; charset=\"utf-8\""));
25 |         assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html;charset=\"UTF-8\""));
26 |         assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=\"ISO-8859-1\""));
27 |         assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset=\"Unsupported\""));
28 |     }
29 |     
30 |     @Test public void discardsSpuriousByteOrderMark() {
31 |         String html = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
32 |         ByteBuffer buffer = Charset.forName("UTF-8").encode(html);
33 |         Document doc = DataUtil.parseByteData(buffer, "UTF-8", "http://foo.com/", Parser.htmlParser());
34 |         assertEquals("One", doc.head().text());
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/helper/HttpConnectionTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.helper;
  2 | 
  3 | import static org.junit.Assert.*;
  4 | import org.junit.Test;
  5 | import org.jsoup.Connection;
  6 | 
  7 | import java.io.IOException;
  8 | import java.util.*;
  9 | import java.net.URL;
 10 | import java.net.MalformedURLException;
 11 | 
 12 | public class HttpConnectionTest {
 13 |     /* most actual network http connection tests are in integration */
 14 | 
 15 |     @Test(expected=IllegalArgumentException.class) public void throwsExceptionOnParseWithoutExecute() throws IOException {
 16 |         Connection con = HttpConnection.connect("http://example.com");
 17 |         con.response().parse();
 18 |     }
 19 | 
 20 |     @Test(expected=IllegalArgumentException.class) public void throwsExceptionOnBodyWithoutExecute() throws IOException {
 21 |         Connection con = HttpConnection.connect("http://example.com");
 22 |         con.response().body();
 23 |     }
 24 | 
 25 |     @Test(expected=IllegalArgumentException.class) public void throwsExceptionOnBodyAsBytesWithoutExecute() throws IOException {
 26 |         Connection con = HttpConnection.connect("http://example.com");
 27 |         con.response().bodyAsBytes();
 28 |     }
 29 | 
 30 |     @Test public void caseInsensitiveHeaders() {
 31 |         Connection.Response res = new HttpConnection.Response();
 32 |         Map<String, String> headers = res.headers();
 33 |         headers.put("Accept-Encoding", "gzip");
 34 |         headers.put("content-type", "text/html");
 35 |         headers.put("refErrer", "http://example.com");
 36 | 
 37 |         assertTrue(res.hasHeader("Accept-Encoding"));
 38 |         assertTrue(res.hasHeader("accept-encoding"));
 39 |         assertTrue(res.hasHeader("accept-Encoding"));
 40 | 
 41 |         assertEquals("gzip", res.header("accept-Encoding"));
 42 |         assertEquals("text/html", res.header("Content-Type"));
 43 |         assertEquals("http://example.com", res.header("Referrer"));
 44 | 
 45 |         res.removeHeader("Content-Type");
 46 |         assertFalse(res.hasHeader("content-type"));
 47 | 
 48 |         res.header("accept-encoding", "deflate");
 49 |         assertEquals("deflate", res.header("Accept-Encoding"));
 50 |         assertEquals("deflate", res.header("accept-Encoding"));
 51 |     }
 52 | 
 53 |     @Test public void ignoresEmptySetCookies() {
 54 |         // prep http response header map
 55 |         Map<String, List<String>> headers = new HashMap<String, List<String>>();
 56 |         headers.put("Set-Cookie", Collections.<String>emptyList());
 57 |         HttpConnection.Response res = new HttpConnection.Response();
 58 |         res.processResponseHeaders(headers);
 59 |         assertEquals(0, res.cookies().size());
 60 |     }
 61 | 
 62 |     @Test public void ignoresEmptyCookieNameAndVals() {
 63 |         // prep http response header map
 64 |         Map<String, List<String>> headers = new HashMap<String, List<String>>();
 65 |         List<String> cookieStrings = new ArrayList<String>();
 66 |         cookieStrings.add(null);
 67 |         cookieStrings.add("");
 68 |         cookieStrings.add("one");
 69 |         cookieStrings.add("two=");
 70 |         cookieStrings.add("three=;");
 71 |         cookieStrings.add("four=data; Domain=.example.com; Path=/");
 72 | 
 73 |         headers.put("Set-Cookie", cookieStrings);
 74 |         HttpConnection.Response res = new HttpConnection.Response();
 75 |         res.processResponseHeaders(headers);
 76 |         assertEquals(4, res.cookies().size());
 77 |         assertEquals("", res.cookie("one"));
 78 |         assertEquals("", res.cookie("two"));
 79 |         assertEquals("", res.cookie("three"));
 80 |         assertEquals("data", res.cookie("four"));
 81 |     }
 82 | 
 83 |     @Test public void connectWithUrl() throws MalformedURLException {
 84 |         Connection con = HttpConnection.connect(new URL("http://example.com"));
 85 |         assertEquals("http://example.com", con.request().url().toExternalForm());
 86 |     }
 87 | 
 88 |     @Test(expected=IllegalArgumentException.class) public void throwsOnMalformedUrl() {
 89 |         Connection con = HttpConnection.connect("bzzt");
 90 |     }
 91 | 
 92 |     @Test public void userAgent() {
 93 |         Connection con = HttpConnection.connect("http://example.com/");
 94 |         con.userAgent("Mozilla");
 95 |         assertEquals("Mozilla", con.request().header("User-Agent"));
 96 |     }
 97 | 
 98 |     @Test public void timeout() {
 99 |         Connection con = HttpConnection.connect("http://example.com/");
100 |         con.timeout(1000);
101 |         assertEquals(1000, con.request().timeout());
102 |     }
103 | 
104 |     @Test public void referrer() {
105 |         Connection con = HttpConnection.connect("http://example.com/");
106 |         con.referrer("http://foo.com");
107 |         assertEquals("http://foo.com", con.request().header("Referer"));
108 |     }
109 | 
110 |     @Test public void method() {
111 |         Connection con = HttpConnection.connect("http://example.com/");
112 |         assertEquals(Connection.Method.GET, con.request().method());
113 |         con.method(Connection.Method.POST);
114 |         assertEquals(Connection.Method.POST, con.request().method());
115 |     }
116 | 
117 |     @Test(expected=IllegalArgumentException.class) public void throwsOnOdddData() {
118 |         Connection con = HttpConnection.connect("http://example.com/");
119 |         con.data("Name", "val", "what");
120 |     }
121 | 
122 |     @Test public void data() {
123 |         Connection con = HttpConnection.connect("http://example.com/");
124 |         con.data("Name", "Val", "Foo", "bar");
125 |         Collection<Connection.KeyVal> values = con.request().data();
126 |         Object[] data =  values.toArray();
127 |         Connection.KeyVal one = (Connection.KeyVal) data[0];
128 |         Connection.KeyVal two = (Connection.KeyVal) data[1];
129 |         assertEquals("Name", one.key());
130 |         assertEquals("Val", one.value());
131 |         assertEquals("Foo", two.key());
132 |         assertEquals("bar", two.value());
133 |     }
134 | 
135 |     @Test public void cookie() {
136 |         Connection con = HttpConnection.connect("http://example.com/");
137 |         con.cookie("Name", "Val");
138 |         assertEquals("Val", con.request().cookie("Name"));
139 |     }
140 | }


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/helper/StringUtilTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.helper;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.junit.Test;
 5 | 
 6 | import java.util.Arrays;
 7 | 
 8 | import static org.junit.Assert.assertEquals;
 9 | import static org.junit.Assert.assertFalse;
10 | import static org.junit.Assert.assertTrue;
11 | 
12 | public class StringUtilTest {
13 | 
14 |     @Test public void join() {
15 |         assertEquals("", StringUtil.join(Arrays.<String>asList(""), " "));
16 |         assertEquals("one", StringUtil.join(Arrays.<String>asList("one"), " "));
17 |         assertEquals("one two three", StringUtil.join(Arrays.<String>asList("one", "two", "three"), " "));
18 |     }
19 | 
20 |     @Test public void padding() {
21 |         assertEquals("", StringUtil.padding(0));
22 |         assertEquals(" ", StringUtil.padding(1));
23 |         assertEquals("  ", StringUtil.padding(2));
24 |         assertEquals("               ", StringUtil.padding(15));
25 |     }
26 | 
27 |     @Test public void isBlank() {
28 |         assertTrue(StringUtil.isBlank(null));
29 |         assertTrue(StringUtil.isBlank(""));
30 |         assertTrue(StringUtil.isBlank("      "));
31 |         assertTrue(StringUtil.isBlank("   \r\n  "));
32 | 
33 |         assertFalse(StringUtil.isBlank("hello"));
34 |         assertFalse(StringUtil.isBlank("   hello   "));
35 |     }
36 | 
37 |     @Test public void isNumeric() {
38 |         assertFalse(StringUtil.isNumeric(null));
39 |         assertFalse(StringUtil.isNumeric(" "));
40 |         assertFalse(StringUtil.isNumeric("123 546"));
41 |         assertFalse(StringUtil.isNumeric("hello"));
42 |         assertFalse(StringUtil.isNumeric("123.334"));
43 | 
44 |         assertTrue(StringUtil.isNumeric("1"));
45 |         assertTrue(StringUtil.isNumeric("1234"));
46 |     }
47 | 
48 |     @Test public void isWhitespace() {
49 |         assertTrue(StringUtil.isWhitespace('\t'));
50 |         assertTrue(StringUtil.isWhitespace('\n'));
51 |         assertTrue(StringUtil.isWhitespace('\r'));
52 |         assertTrue(StringUtil.isWhitespace('\f'));
53 |         assertTrue(StringUtil.isWhitespace(' '));
54 |         
55 |         assertFalse(StringUtil.isWhitespace('\u00a0'));
56 |         assertFalse(StringUtil.isWhitespace('\u2000'));
57 |         assertFalse(StringUtil.isWhitespace('\u3000'));
58 |     }
59 | 
60 |     @Test public void normaliseWhiteSpace() {
61 |         assertEquals(" ", StringUtil.normaliseWhitespace("    \r \n \r\n"));
62 |         assertEquals(" hello there ", StringUtil.normaliseWhitespace("   hello   \r \n  there    \n"));
63 |         assertEquals("hello", StringUtil.normaliseWhitespace("hello"));
64 |         assertEquals("hello there", StringUtil.normaliseWhitespace("hello\nthere"));
65 |     }
66 | 
67 |     @Test public void normaliseWhiteSpaceModified() {
68 |         String check1 = "Hello there";
69 |         String check2 = "Hello\nthere";
70 |         String check3 = "Hello  there";
71 | 
72 |         // does not create new string no mods done
73 |         assertTrue(check1 == StringUtil.normaliseWhitespace(check1));
74 |         assertTrue(check2 != StringUtil.normaliseWhitespace(check2));
75 |         assertTrue(check3 != StringUtil.normaliseWhitespace(check3));
76 |     }
77 | 
78 |     @Test public void normaliseWhiteSpaceHandlesHighSurrogates() {
79 |         String test71540chars = "\ud869\udeb2\u304b\u309a  1";
80 |         String test71540charsExpectedSingleWhitespace = "\ud869\udeb2\u304b\u309a 1";
81 | 
82 |         assertEquals(test71540charsExpectedSingleWhitespace, StringUtil.normaliseWhitespace(test71540chars));
83 |         String extractedText = Jsoup.parse(test71540chars).text();
84 |         assertEquals(test71540charsExpectedSingleWhitespace, extractedText);
85 |     }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/integration/Benchmark.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.integration;
 2 | 
 3 | import java.util.Date;
 4 | 
 5 | /**
 6 |  Does an A/B test on two methods, and prints out how long each took.
 7 | 
 8 |  @author Jonathan Hedley, jonathan@hedley.net */
 9 | public class Benchmark {
10 |     public static void run(Runnable a, Runnable b, int count) {
11 |         long aMillis;
12 |         long bMillis;
13 | 
14 |         print("Running test A (x%d)", count);
15 |         aMillis = time(a, count);
16 |         print("Running test B");
17 |         bMillis = time(b, count);
18 | 
19 |         print("\nResults:");
20 |         print("A: %.2fs", aMillis / 1000f);
21 |         print("B: %.2fs", bMillis / 1000f);
22 |         print("\nB ran in %.2f %% time of A\n", (bMillis *1f / aMillis * 1f) * 100f);
23 |     }
24 | 
25 |     private static long time(Runnable test, int count) {
26 |         Date start = new Date();
27 |         for (int i = 0; i < count; i++) {
28 |             test.run();
29 |         }
30 |         Date end = new Date();
31 |         return end.getTime() - start.getTime();
32 |     }
33 | 
34 |     private static void print(String msgFormat, Object... msgParams) {
35 |         System.out.println(String.format(msgFormat, msgParams));
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/AttributeTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import org.junit.Test;
 4 | 
 5 | import static org.junit.Assert.assertEquals;
 6 | 
 7 | public class AttributeTest {
 8 |     @Test public void html() { // the value is entity-escaped on output, and toString() mirrors html()
 9 |         Attribute keyValue = new Attribute("key", "value &");
10 |         assertEquals("key=\"value &amp;\"", keyValue.html());
11 |         assertEquals(keyValue.html(), keyValue.toString());
12 |     }
13 |     // code point 135361 (U+210C1) lies outside the BMP, so Java holds it as a surrogate pair
14 |     @Test public void testWithSupplementaryCharacterInAttributeKeyAndValue() {
15 |         String supplementary = new String(Character.toChars(135361));
16 |         Attribute astral = new Attribute(supplementary, "A" + supplementary + "B");
17 |         assertEquals(supplementary + "=\"A" + supplementary + "B\"", astral.html());
18 |         assertEquals(astral.html(), astral.toString());
19 |     }
20 | }
21 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/AttributesTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import org.junit.Test;
 6 | 
 7 | /**
 8 |  * Tests for Attributes: key lookup, the data-* view, and HTML serialisation.
 9 |  *
10 |  * @author Jonathan Hedley
11 |  */
12 | public class AttributesTest {
13 |     @Test public void html() {
14 |         Attributes attrs = new Attributes();
15 |         attrs.put("Tot", "a&p");
16 |         attrs.put("Hello", "There");
17 |         attrs.put("data-name", "Jsoup");
18 |         // keys added in mixed case are found both by their lower-case form and as written
19 |         assertEquals(3, attrs.size());
20 |         assertTrue(attrs.hasKey("tot"));
21 |         assertTrue(attrs.hasKey("Hello"));
22 |         assertTrue(attrs.hasKey("data-name"));
23 |         assertEquals(1, attrs.dataset().size());
24 |         assertEquals("Jsoup", attrs.dataset().get("name"));
25 |         assertEquals("a&p", attrs.get("tot"));
26 |         // serialised keys are lower-case and in insertion order, with the values escaped
27 |         assertEquals(" tot=\"a&amp;p\" hello=\"There\" data-name=\"Jsoup\"", attrs.html());
28 |         assertEquals(attrs.html(), attrs.toString());
29 |     }
30 | 
31 | }
32 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/DocumentTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.jsoup.TextUtil;
 5 | import org.junit.Test;
 6 | 
 7 | import static org.junit.Assert.*;
 8 | 
 9 | /**
10 |  Tests for Document: text and title accessors, output settings, structure normalisation and cloning.
11 | 
12 |  @author Jonathan Hedley, jonathan@hedley.net */
13 | public class DocumentTest {
14 |     @Test public void setTextPreservesDocumentStructure() {
15 |         Document parsed = Jsoup.parse("<p>Hello</p>");
16 |         parsed.text("Replaced");
17 |         assertEquals("Replaced", parsed.text());
18 |         assertEquals("Replaced", parsed.body().text());
19 |         assertEquals(1, parsed.select("head").size()); // the head element is still there
20 |     }
21 | 
22 |     @Test public void testTitles() {
23 |         // with no title element: reads as empty, and setting one creates it
24 |         Document untitled = Jsoup.parse("<p>Hello</p>");
25 |         assertEquals("", untitled.title());
26 |         untitled.title("Hello");
27 |         assertEquals("Hello", untitled.title());
28 |         assertEquals("Hello", untitled.select("title").first().text());
29 |         // with two title elements: the first one is read, and is the one written
30 |         Document titled = Jsoup.parse("<title>First</title><title>Ignore</title><p>Hello</p>");
31 |         assertEquals("First", titled.title());
32 |         titled.title("Hello");
33 |         assertEquals("Hello", titled.title());
34 |         assertEquals("Hello", titled.select("title").first().text());
35 |         // runs of whitespace and line breaks inside the title collapse to single spaces
36 |         Document spacedTitle = Jsoup.parse("<title>   Hello\nthere   \n   now   \n");
37 |         assertEquals("Hello there now", spacedTitle.title());
38 |     }
39 | 
40 |     @Test public void testOutputEncoding() {
41 |         Document greek = Jsoup.parse("<p title=π>π & < > </p>");
42 |         // default is utf-8, where π is written out as-is
43 |         assertEquals("<p title=\"π\">π &amp; &lt; &gt; </p>", greek.body().html());
44 |         assertEquals("UTF-8", greek.outputSettings().charset().displayName());
45 |         // ascii cannot encode π; in the base escape mode it becomes a numeric reference
46 |         greek.outputSettings().charset("ascii");
47 |         assertEquals(Entities.EscapeMode.base, greek.outputSettings().escapeMode());
48 |         assertEquals("<p title=\"&#x3c0;\">&#x3c0; &amp; &lt; &gt; </p>", greek.body().html());
49 |         // the extended escape mode has a named entity for it
50 |         greek.outputSettings().escapeMode(Entities.EscapeMode.extended);
51 |         assertEquals("<p title=\"&pi;\">&pi; &amp; &lt; &gt; </p>", greek.body().html());
52 |     }
53 |     // in xhtml mode &times; is not written by name: it comes out as the literal character
54 |     @Test public void testXhtmlReferences() {
55 |         Document entities = Jsoup.parse("&lt; &gt; &amp; &quot; &apos; &times;");
56 |         entities.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
57 |         assertEquals("&lt; &gt; &amp; &quot; &apos; ×", entities.body().html());
58 |     }
59 |     // content that is misplaced in the head, or after the closing body tag, ends up inside body
60 |     @Test public void testNormalisesStructure() {
61 |         Document untidy = Jsoup.parse("<html><head><script>one</script><noscript><p>two</p></noscript></head><body><p>three</p></body><p>four</p></html>");
62 |         assertEquals("<html><head><script>one</script><noscript></noscript></head><body><p>two</p><p>three</p><p>four</p></body></html>", TextUtil.stripNewlines(untidy.html()));
63 |     }
64 |     // editing a clone must leave the document it was cloned from untouched
65 |     @Test public void testClone() {
66 |         Document source = Jsoup.parse("<title>Hello</title> <p>One<p>Two");
67 |         Document copy = source.clone();
68 | 
69 |         assertEquals("<html><head><title>Hello</title> </head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(copy.html()));
70 |         copy.title("Hello there");
71 |         copy.select("p").first().text("One more").attr("id", "1");
72 |         assertEquals("<html><head><title>Hello there</title> </head><body><p id=\"1\">One more</p><p>Two</p></body></html>", TextUtil.stripNewlines(copy.html()));
73 |         assertEquals("<html><head><title>Hello</title> </head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(source.html()));
74 |     }
75 |     // the doctype declaration must be carried over to the clone as well
76 |     @Test public void testClonesDeclarations() {
77 |         Document source = Jsoup.parse("<!DOCTYPE html><html><head><title>Doctype test");
78 |         Document copy = source.clone();
79 | 
80 |         assertEquals(source.html(), copy.html());
81 |         assertEquals("<!DOCTYPE html><html><head><title>Doctype test</title></head><body></body></html>",
82 |                 TextUtil.stripNewlines(copy.html()));
83 |     }
84 | 
85 | }
86 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/DocumentTypeTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import org.junit.Test;
 4 | 
 5 | import static org.junit.Assert.*;
 6 | 
 7 | /**
 8 |  * Tests for the DocumentType node: constructor validation and the generated declaration markup.
 9 |  *
10 |  * @author Jonathan Hedley, http://jonathanhedley.com/
11 |  */
12 | public class DocumentTypeTest {
13 |     @Test(expected = IllegalArgumentException.class)
14 |     public void constructorValidationThrowsExceptionOnBlankName() {
15 |         new DocumentType("","", "", ""); // empty name
16 |     }
17 | 
18 |     @Test(expected = IllegalArgumentException.class)
19 |     public void constructorValidationThrowsExceptionOnNulls() {
20 |         new DocumentType("html", null, null, ""); // null public and system ids
21 |     }
22 | 
23 |     @Test
24 |     public void constructorValidationOkWithBlankPublicAndSystemIds() {
25 |         new DocumentType("html","", "",""); // passes as long as nothing is thrown
26 |     }
27 | 
28 |     @Test public void outerHtmlGeneration() {
29 |         DocumentType nameOnly = new DocumentType("html", "", "", "");
30 |         assertEquals("<!DOCTYPE html>", nameOnly.outerHtml());
31 |         // a public id is introduced by the PUBLIC keyword
32 |         DocumentType withPublicId = new DocumentType("html", "-//IETF//DTD HTML//", "", "");
33 |         assertEquals("<!DOCTYPE html PUBLIC \"-//IETF//DTD HTML//\">", withPublicId.outerHtml());
34 |         // a system id on its own is written without any keyword
35 |         DocumentType withSystemId = new DocumentType("html", "", "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd", "");
36 |         assertEquals("<!DOCTYPE html \"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd\">", withSystemId.outerHtml());
37 | 
38 |         DocumentType withBoth = new DocumentType("notHtml", "--public", "--system", "");
39 |         assertEquals("<!DOCTYPE notHtml PUBLIC \"--public\" \"--system\">", withBoth.outerHtml());
40 |     }
41 | }
42 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/EntitiesTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.junit.Test;
 5 | 
 6 | import static org.junit.Assert.*;
 7 | 
 8 | import java.nio.charset.Charset;
 9 | 
10 | public class EntitiesTest {
11 |     @Test public void escape() {
12 |         String plain = "Hello &<> Å å π 新 there ¾ ©";
13 |         Charset ascii = Charset.forName("ascii"), utf8 = Charset.forName("UTF-8");
14 |         String asciiBase = Entities.escape(plain, ascii.newEncoder(), Entities.EscapeMode.base);
15 |         String asciiExtended = Entities.escape(plain, ascii.newEncoder(), Entities.EscapeMode.extended);
16 |         String asciiXhtml = Entities.escape(plain, ascii.newEncoder(), Entities.EscapeMode.xhtml);
17 |         String utf8Base = Entities.escape(plain, utf8.newEncoder(), Entities.EscapeMode.base);
18 |         String utf8Xhtml = Entities.escape(plain, utf8.newEncoder(), Entities.EscapeMode.xhtml);
19 |         assertEquals("Hello &amp;&lt;&gt; &Aring; &aring; &#x3c0; &#x65b0; there &frac34; &copy;", asciiBase);
20 |         assertEquals("Hello &amp;&lt;&gt; &angst; &aring; &pi; &#x65b0; there &frac34; &copy;", asciiExtended);
21 |         assertEquals("Hello &amp;&lt;&gt; &#xc5; &#xe5; &#x3c0; &#x65b0; there &#xbe; &#xa9;", asciiXhtml);
22 |         assertEquals("Hello &amp;&lt;&gt; &Aring; &aring; π 新 there &frac34; &copy;", utf8Base);
23 |         assertEquals("Hello &amp;&lt;&gt; Å å π 新 there ¾ ©", utf8Xhtml);
24 |         // note the capital A-ring comes out as &Aring; in base mode but as &angst; in extended mode
25 | 
26 |         // each escaped form must unescape back to the original text
27 |         assertEquals(plain, Entities.unescape(asciiBase));
28 |         assertEquals(plain, Entities.unescape(asciiExtended));
29 |         assertEquals(plain, Entities.unescape(asciiXhtml));
30 |         assertEquals(plain, Entities.unescape(utf8Base));
31 |         assertEquals(plain, Entities.unescape(utf8Xhtml));
32 |     }
33 |     // a supplementary code point is escaped as one reference for ascii, and left as-is for UTF-8
34 |     @Test public void escapeSupplementaryCharacter(){
35 |         String astral = new String(Character.toChars(135361));
36 |         String asAscii = Entities.escape(astral, Charset.forName("ascii").newEncoder(), Entities.EscapeMode.base);
37 |         assertEquals("&#x210c1;", asAscii);
38 |         String asUtf8 = Entities.escape(astral, Charset.forName("UTF-8").newEncoder(), Entities.EscapeMode.base);
39 |         assertEquals(astral, asUtf8);
40 |     }
41 |     // references lacking the closing ';' are decoded for some names only: &LT and &reg are, &angst is not
42 |     @Test public void unescape() {
43 |         String escaped = "Hello &amp;&LT&gt; &reg &angst; &angst &#960; &#960 &#x65B0; there &! &frac34; &copy; &COPY;";
44 |         assertEquals("Hello &<> ® Å &angst π π 新 there &! ¾ © ©", Entities.unescape(escaped));
45 |         // things that only look like references are left untouched
46 |         assertEquals("&0987654321; &unknown", Entities.unescape("&0987654321; &unknown"));
47 |     }
48 |     // for attributes, enforce strict unescaping (must look like &#xxx; , not just &#xxx); the default is lenient
49 |     @Test public void strictUnescape() {
50 |         String query = "Hello &amp= &amp;";
51 |         assertEquals("Hello &amp= &", Entities.unescape(query, true));
52 |         assertEquals("Hello &= &", Entities.unescape(query));
53 |         assertEquals("Hello &= &", Entities.unescape(query, false));
54 |     }
55 | 
56 |     // the case of an entity name matters: &Uuml; and &uuml; are different characters
57 |     @Test public void caseSensitive() {
58 |         String raw = "Ü ü & &";
59 |         assertEquals("&Uuml; &uuml; &amp; &amp;", Entities.escape(raw, Charset.forName("ascii").newEncoder(), Entities.EscapeMode.extended));
60 | 
61 |         String encoded = "&Uuml; &uuml; &amp; &AMP";
62 |         assertEquals("Ü ü & &", Entities.unescape(encoded));
63 |     }
64 |     // numeric references for backslash and dollar (both special in java.util.regex replacement strings) decode literally
65 |     @Test public void quoteReplacements() {
66 |         String numericRefs = "&#92; &#36;";
67 |         String literal = "\\ $";
68 | 
69 |         assertEquals(literal, Entities.unescape(numericRefs));
70 |     }
71 |     // entity names that end in a digit survive a parse and re-serialise round trip
72 |     @Test public void letterDigitEntities() {
73 |         String markup = "<p>&sup1;&sup2;&sup3;&frac14;&frac12;&frac34;</p>";
74 |         Document parsed = Jsoup.parse(markup);
75 |         Element para = parsed.select("p").first();
76 |         assertEquals("&sup1;&sup2;&sup3;&frac14;&frac12;&frac34;", para.html());
77 |         assertEquals("¹²³¼½¾", para.text());
78 |     }
79 |     // query-string parameters whose names start like an entity name must not be decoded
80 |     @Test public void noSpuriousDecodes() {
81 |         String url = "http://www.foo.com?a=1&num_rooms=1&children=0&int=VA&b=2";
82 |         assertEquals(url, Entities.unescape(url));
83 |     }
84 | }
85 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/FormElementTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.nodes;
  2 | 
  3 | import org.jsoup.Connection;
  4 | import org.jsoup.Jsoup;
  5 | import org.junit.Test;
  6 | 
  7 | import java.io.IOException;
  8 | import java.util.Collection;
  9 | import java.util.List;
 10 | 
 11 | import static org.junit.Assert.*;
 12 | 
 13 | /**
 14 |  * Tests for FormElement
 15 |  *
 16 |  * @author Jonathan Hedley
 17 |  */
 18 | public class FormElementTest {
 19 |     @Test public void hasAssociatedControls() {
 20 |         //"button", "fieldset", "input", "keygen", "object", "output", "select", "textarea"
 21 |         String html = "<form id=1><button id=1><fieldset id=2 /><input id=3><keygen id=4><object id=5><output id=6>" +
 22 |                 "<select id=7><option></select><textarea id=8><p id=9>";
 23 |         Document doc = Jsoup.parse(html);
 24 | 
 25 |         FormElement form = (FormElement) doc.select("form").first();
 26 |         assertEquals(8, form.elements().size());
 27 |     }
 28 | 
 29 |     @Test public void createsFormData() {
 30 |         String html = "<form><input name='one' value='two'><select name='three'><option value='not'>" +
 31 |                 "<option value='four' selected><option value='five' selected><textarea name=six>seven</textarea></form>";
 32 |         Document doc = Jsoup.parse(html);
 33 |         FormElement form = (FormElement) doc.select("form").first();
 34 |         List<Connection.KeyVal> data = form.formData();
 35 | 
 36 |         assertEquals(4, data.size());
 37 |         assertEquals("one=two", data.get(0).toString());
 38 |         assertEquals("three=four", data.get(1).toString());
 39 |         assertEquals("three=five", data.get(2).toString());
 40 |         assertEquals("six=seven", data.get(3).toString());
 41 |     }
 42 | 
 43 |     @Test public void createsSubmitableConnection() {
 44 |         String html = "<form action='/search'><input name='q'></form>";
 45 |         Document doc = Jsoup.parse(html, "http://example.com/");
 46 |         doc.select("[name=q]").attr("value", "jsoup");
 47 | 
 48 |         FormElement form = ((FormElement) doc.select("form").first());
 49 |         Connection con = form.submit();
 50 | 
 51 |         assertEquals(Connection.Method.GET, con.request().method());
 52 |         assertEquals("http://example.com/search", con.request().url().toExternalForm());
 53 |         List<Connection.KeyVal> dataList = (List<Connection.KeyVal>) con.request().data();
 54 |         assertEquals("q=jsoup", dataList.get(0).toString());
 55 | 
 56 |         doc.select("form").attr("method", "post");
 57 |         Connection con2 = form.submit();
 58 |         assertEquals(Connection.Method.POST, con2.request().method());
 59 |     }
 60 | 
 61 |     @Test public void actionWithNoValue() {
 62 |         String html = "<form><input name='q'></form>";
 63 |         Document doc = Jsoup.parse(html, "http://example.com/");
 64 |         FormElement form = ((FormElement) doc.select("form").first());
 65 |         Connection con = form.submit();
 66 | 
 67 |         assertEquals("http://example.com/", con.request().url().toExternalForm());
 68 |     }
 69 | 
 70 |     @Test public void actionWithNoBaseUri() {
 71 |         String html = "<form><input name='q'></form>";
 72 |         Document doc = Jsoup.parse(html);
 73 |         FormElement form = ((FormElement) doc.select("form").first());
 74 | 
 75 | 
 76 |         boolean threw = false;
 77 |         try {
 78 |             Connection con = form.submit();
 79 |         } catch (IllegalArgumentException e) {
 80 |             threw = true;
 81 |             assertEquals("Could not determine a form action URL for submit. Ensure you set a base URI when parsing.",
 82 |                     e.getMessage());
 83 |         }
 84 |         assertTrue(threw);
 85 |     }
 86 | 
 87 |     @Test public void formsAddedAfterParseAreFormElements() {
 88 |         Document doc = Jsoup.parse("<body />");
 89 |         doc.body().html("<form action='http://example.com/search'><input name='q' value='search'>");
 90 |         Element formEl = doc.select("form").first();
 91 |         assertTrue(formEl instanceof FormElement);
 92 | 
 93 |         FormElement form = (FormElement) formEl;
 94 |         assertEquals(1, form.elements().size());
 95 |     }
 96 | 
 97 |     @Test public void controlsAddedAfterParseAreLinkedWithForms() {
 98 |         Document doc = Jsoup.parse("<body />");
 99 |         doc.body().html("<form />");
100 | 
101 |         Element formEl = doc.select("form").first();
102 |         formEl.append("<input name=foo value=bar>");
103 | 
104 |         assertTrue(formEl instanceof FormElement);
105 |         FormElement form = (FormElement) formEl;
106 |         assertEquals(1, form.elements().size());
107 | 
108 |         List<Connection.KeyVal> data = form.formData();
109 |         assertEquals("foo=bar", data.get(0).toString());
110 |     }
111 | }
112 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/TextNodeTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.jsoup.TextUtil;
 5 | import org.junit.Test;
 6 | 
 7 | import static org.junit.Assert.*;
 8 | 
 9 | /**
10 |  Test TextNodes: blank detection, reading and replacing text, and splitting a node in two.
11 | 
12 |  @author Jonathan Hedley, jonathan@hedley.net */
13 | public class TextNodeTest {
14 |     @Test public void testBlank() {
15 |         TextNode empty = new TextNode("", "");
16 |         TextNode spaces = new TextNode("     ", "");
17 |         TextNode spacesAndNewlines = new TextNode("  \n\n   ", "");
18 |         TextNode word = new TextNode("Hello", "");
19 |         TextNode paddedWord = new TextNode("  \nHello ", "");
20 |         // only whitespace (or nothing at all) counts as blank
21 |         assertTrue(empty.isBlank());
22 |         assertTrue(spaces.isBlank());
23 |         assertTrue(spacesAndNewlines.isBlank());
24 |         assertFalse(word.isBlank());
25 |         assertFalse(paddedWord.isBlank());
26 |     }
27 | 
28 |     @Test public void testTextBean() {
29 |         Document doc = Jsoup.parse("<p>One <span>two &amp;</span> three &amp;</p>");
30 |         Element para = doc.select("p").first();
31 |         // the element and its text node both report the unescaped text
32 |         Element inner = doc.select("span").first();
33 |         assertEquals("two &", inner.text());
34 |         TextNode innerText = (TextNode) inner.childNode(0);
35 |         assertEquals("two &", innerText.text());
36 |         // child 2 of the p is the text that follows the span
37 |         TextNode trailing = (TextNode) para.childNode(2);
38 |         assertEquals(" three &", trailing.text());
39 | 
40 |         trailing.text(" POW!");
41 |         assertEquals("One <span>two &amp;</span> POW!", TextUtil.stripNewlines(para.html()));
42 |         // the node's content can also be written through its "text" attribute
43 |         trailing.attr("text", "kablam &");
44 |         assertEquals("kablam &", trailing.text());
45 |         assertEquals("One <span>two &amp;</span>kablam &amp;", TextUtil.stripNewlines(para.html()));
46 |     }
47 |     // splitText(6) keeps the first six chars in the node and returns the rest as a new node under the same parent
48 |     @Test public void testSplitText() {
49 |         Document doc = Jsoup.parse("<div>Hello there</div>");
50 |         Element div = doc.select("div").first();
51 |         TextNode head = (TextNode) div.childNode(0);
52 |         TextNode tail = head.splitText(6);
53 |         assertEquals("Hello ", head.getWholeText());
54 |         assertEquals("there", tail.getWholeText());
55 |         tail.text("there!");
56 |         assertEquals("Hello there!", div.text());
57 |         assertSame(head.parent(), tail.parent());
58 |     }
59 | 
60 |     @Test public void testSplitAnEmbolden() {
61 |         Document doc = Jsoup.parse("<div>Hello there</div>");
62 |         Element div = doc.select("div").first();
63 |         TextNode head = (TextNode) div.childNode(0);
64 |         TextNode tail = head.splitText(6);
65 |         tail.wrap("<b></b>");
66 |         // the html goes through TextUtil.stripNewlines first, so line breaks around the new <b> are not compared
67 |         assertEquals("Hello <b>there</b>", TextUtil.stripNewlines(div.html()));
68 |     }
69 |     // a supplementary (non-BMP) character must come out of a text node unescaped and intact
70 |     @Test public void testWithSupplementaryCharacter(){
71 |         String astral = new String(Character.toChars(135361));
72 |         TextNode text = Jsoup.parse(astral).body().textNodes().get(0);
73 |         assertEquals(astral, text.outerHtml().trim());
74 |     }
75 | }
76 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/parser/AttributeParseTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.jsoup.nodes.Attributes;
 5 | import org.jsoup.nodes.Element;
 6 | import org.jsoup.select.Elements;
 7 | import org.junit.Test;
 8 | 
 9 | import static org.junit.Assert.*;
10 | 
11 | /**
12 |  Test suite for attribute parser: sloppy spacing and quoting, line breaks, and entity handling in values.
13 | 
14 |  @author Jonathan Hedley, jonathan@hedley.net */
15 | public class AttributeParseTest {
16 | 
17 |     @Test public void parsesRoughAttributeString() {
18 |         String markup = "<a id=\"123\" class=\"baz = 'bar'\" style = 'border: 2px'qux zim foo = 12 mux=18 />";
19 |         // expected: id=123, class=baz = 'bar', style=border: 2px, qux (empty), zim (empty), foo=12, mux=18
20 | 
21 |         Element anchor = Jsoup.parse(markup).getElementsByTag("a").get(0);
22 |         Attributes parsed = anchor.attributes();
23 |         assertEquals(7, parsed.size());
24 |         assertEquals("123", parsed.get("id"));
25 |         assertEquals("baz = 'bar'", parsed.get("class"));
26 |         assertEquals("border: 2px", parsed.get("style"));
27 |         assertEquals("", parsed.get("qux"));
28 |         assertEquals("", parsed.get("zim"));
29 |         assertEquals("12", parsed.get("foo"));
30 |         assertEquals("18", parsed.get("mux"));
31 |     }
32 |     // CR LF between tokens acts as a separator, and is kept when it sits inside a quoted value
33 |     @Test public void handlesNewLinesAndReturns() {
34 |         String markup = "<a\r\nfoo='bar\r\nqux'\r\nbar\r\n=\r\ntwo>One</a>";
35 |         Element anchor = Jsoup.parse(markup).select("a").first();
36 |         assertEquals(2, anchor.attributes().size());
37 |         assertEquals("bar\r\nqux", anchor.attr("foo")); // currently preserves newlines in quoted attributes. todo confirm if should.
38 |         assertEquals("two", anchor.attr("bar"));
39 |     }
40 |     // a tag with nothing after its name has no attributes
41 |     @Test public void parsesEmptyString() {
42 |         String markup = "<a />";
43 |         Element anchor = Jsoup.parse(markup).getElementsByTag("a").get(0);
44 |         Attributes parsed = anchor.attributes();
45 |         assertEquals(0, parsed.size());
46 |     }
47 |     // a leading '=' is taken as part of the attribute name, and the value is empty
48 |     @Test public void canStartWithEq() {
49 |         String markup = "<a =empty />";
50 |         Element anchor = Jsoup.parse(markup).getElementsByTag("a").get(0);
51 |         Attributes parsed = anchor.attributes();
52 |         assertEquals(1, parsed.size());
53 |         assertTrue(parsed.hasKey("=empty"));
54 |         assertEquals("", parsed.get("=empty"));
55 |     }
56 |     // inside attribute values, &lt without its ';' stays as written, while &lt; is decoded
57 |     @Test public void strictAttributeUnescapes() {
58 |         String markup = "<a id=1 href='?foo=bar&mid&lt=true'>One</a> <a id=2 href='?foo=bar&lt;qux&lg=1'>Two</a>";
59 |         Elements anchors = Jsoup.parse(markup).select("a");
60 |         assertEquals("?foo=bar&mid&lt=true", anchors.first().attr("href"));
61 |         assertEquals("?foo=bar<qux&lg=1", anchors.last().attr("href"));
62 |     }
63 |     // ampersands that merely separate query parameters must pass through unchanged
64 |     @Test public void moreAttributeUnescapes() {
65 |         String markup = "<a href='&wr_id=123&mid-size=true&ok=&wr'>Check</a>";
66 |         Elements anchors = Jsoup.parse(markup).select("a");
67 |         assertEquals("&wr_id=123&mid-size=true&ok=&wr", anchors.first().attr("href"));
68 |     }
69 | }
70 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/parser/CharacterReaderTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.parser;
  2 | 
  3 | import org.junit.Test;
  4 | 
  5 | import static org.junit.Assert.*;
  6 | 
  7 | /**
  8 |  * Test suite for character reader.
  9 |  *
 10 |  * @author Jonathan Hedley, jonathan@hedley.net
 11 |  */
 12 | public class CharacterReaderTest {
 13 | 
 14 |     @Test public void consume() {
 15 |         CharacterReader r = new CharacterReader("one");
 16 |         assertEquals(0, r.pos());
 17 |         assertEquals('o', r.current());
 18 |         assertEquals('o', r.consume());
 19 |         assertEquals(1, r.pos());
 20 |         assertEquals('n', r.current());
 21 |         assertEquals(1, r.pos());
 22 |         assertEquals('n', r.consume());
 23 |         assertEquals('e', r.consume());
 24 |         assertTrue(r.isEmpty());
 25 |         assertEquals(CharacterReader.EOF, r.consume());
 26 |         assertTrue(r.isEmpty());
 27 |         assertEquals(CharacterReader.EOF, r.consume());
 28 |     }
 29 | 
 30 |     @Test public void unconsume() {
 31 |         CharacterReader r = new CharacterReader("one");
 32 |         assertEquals('o', r.consume());
 33 |         assertEquals('n', r.current());
 34 |         r.unconsume();
 35 |         assertEquals('o', r.current());
 36 | 
 37 |         assertEquals('o', r.consume());
 38 |         assertEquals('n', r.consume());
 39 |         assertEquals('e', r.consume());
 40 |         assertTrue(r.isEmpty());
 41 |         r.unconsume();
 42 |         assertFalse(r.isEmpty());
 43 |         assertEquals('e', r.current());
 44 |         assertEquals('e', r.consume());
 45 |         assertTrue(r.isEmpty());
 46 | 
 47 |         assertEquals(CharacterReader.EOF, r.consume());
 48 |         r.unconsume();
 49 |         assertTrue(r.isEmpty());
 50 |         assertEquals(CharacterReader.EOF, r.current());
 51 |     }
 52 | 
 53 |     @Test public void mark() {
 54 |         CharacterReader r = new CharacterReader("one");
 55 |         r.consume();
 56 |         r.mark();
 57 |         assertEquals('n', r.consume());
 58 |         assertEquals('e', r.consume());
 59 |         assertTrue(r.isEmpty());
 60 |         r.rewindToMark();
 61 |         assertEquals('n', r.consume());
 62 |     }
 63 | 
 64 |     @Test public void consumeToEnd() {
 65 |         String in = "one two three";
 66 |         CharacterReader r = new CharacterReader(in);
 67 |         String toEnd = r.consumeToEnd();
 68 |         assertEquals(in, toEnd);
 69 |         assertTrue(r.isEmpty());
 70 |     }
 71 | 
 72 |     @Test public void nextIndexOfChar() {
 73 |         String in = "blah blah";
 74 |         CharacterReader r = new CharacterReader(in);
 75 | 
 76 |         assertEquals(-1, r.nextIndexOf('x'));
 77 |         assertEquals(3, r.nextIndexOf('h'));
 78 |         String pull = r.consumeTo('h');
 79 |         assertEquals("bla", pull);
 80 |         r.consume();
 81 |         assertEquals(2, r.nextIndexOf('l'));
 82 |         assertEquals(" blah", r.consumeToEnd());
 83 |         assertEquals(-1, r.nextIndexOf('x'));
 84 |     }
 85 | 
 86 |     @Test public void nextIndexOfString() {
 87 |         String in = "One Two something Two Three Four";
 88 |         CharacterReader r = new CharacterReader(in);
 89 | 
 90 |         assertEquals(-1, r.nextIndexOf("Foo"));
 91 |         assertEquals(4, r.nextIndexOf("Two"));
 92 |         assertEquals("One Two ", r.consumeTo("something"));
 93 |         assertEquals(10, r.nextIndexOf("Two"));
 94 |         assertEquals("something Two Three Four", r.consumeToEnd());
 95 |         assertEquals(-1, r.nextIndexOf("Two"));
 96 |     }
 97 | 
 98 |     @Test public void consumeToChar() {
 99 |         CharacterReader r = new CharacterReader("One Two Three");
100 |         assertEquals("One ", r.consumeTo('T'));
101 |         assertEquals("", r.consumeTo('T')); // on Two
102 |         assertEquals('T', r.consume());
103 |         assertEquals("wo ", r.consumeTo('T'));
104 |         assertEquals('T', r.consume());
105 |         assertEquals("hree", r.consumeTo('T')); // consume to end
106 |     }
107 | 
108 |     @Test public void consumeToString() {
109 |         CharacterReader r = new CharacterReader("One Two Two Four");
110 |         assertEquals("One ", r.consumeTo("Two"));
111 |         assertEquals('T', r.consume());
112 |         assertEquals("wo ", r.consumeTo("Two"));
113 |         assertEquals('T', r.consume());
114 |         assertEquals("wo Four", r.consumeTo("Qux"));
115 |     }
116 | 
117 |     @Test public void advance() {
118 |         CharacterReader r = new CharacterReader("One Two Three");
119 |         assertEquals('O', r.consume());
120 |         r.advance();
121 |         assertEquals('e', r.consume());
122 |     }
123 | 
124 |     @Test public void consumeToAny() {
125 |         CharacterReader r = new CharacterReader("One &bar; qux");
126 |         assertEquals("One ", r.consumeToAny('&', ';'));
127 |         assertTrue(r.matches('&'));
128 |         assertTrue(r.matches("&bar;"));
129 |         assertEquals('&', r.consume());
130 |         assertEquals("bar", r.consumeToAny('&', ';'));
131 |         assertEquals(';', r.consume());
132 |         assertEquals(" qux", r.consumeToAny('&', ';'));
133 |     }
134 | 
135 |     @Test public void consumeLetterSequence() {
136 |         CharacterReader r = new CharacterReader("One &bar; qux");
137 |         assertEquals("One", r.consumeLetterSequence());
138 |         assertEquals(" &", r.consumeTo("bar;"));
139 |         assertEquals("bar", r.consumeLetterSequence());
140 |         assertEquals("; qux", r.consumeToEnd());
141 |     }
142 | 
143 |     @Test public void consumeLetterThenDigitSequence() {
144 |         CharacterReader r = new CharacterReader("One12 Two &bar; qux");
145 |         assertEquals("One12", r.consumeLetterThenDigitSequence());
146 |         assertEquals(' ', r.consume());
147 |         assertEquals("Two", r.consumeLetterThenDigitSequence());
148 |         assertEquals(" &bar; qux", r.consumeToEnd());
149 |     }
150 | 
151 |     @Test public void matches() {
152 |         CharacterReader r = new CharacterReader("One Two Three");
153 |         assertTrue(r.matches('O'));
154 |         assertTrue(r.matches("One Two Three"));
155 |         assertTrue(r.matches("One"));
156 |         assertFalse(r.matches("one"));
157 |         assertEquals('O', r.consume());
158 |         assertFalse(r.matches("One"));
159 |         assertTrue(r.matches("ne Two Three"));
160 |         assertFalse(r.matches("ne Two Three Four"));
161 |         assertEquals("ne Two Three", r.consumeToEnd());
162 |         assertFalse(r.matches("ne"));
163 |     }
164 | 
165 |     // matchesIgnoreCase disregards case; the char overload of matches() checked alongside it stays case-sensitive.
166 |     @Test public void matchesIgnoreCase() {
167 |         final CharacterReader reader = new CharacterReader("One Two Three");
168 |         assertTrue(reader.matchesIgnoreCase("O"));
169 |         assertTrue(reader.matchesIgnoreCase("o"));
170 |         assertTrue(reader.matches('O'));
171 |         assertFalse(reader.matches('o'));
172 |         assertTrue(reader.matchesIgnoreCase("One Two Three"));
173 |         assertTrue(reader.matchesIgnoreCase("ONE two THREE"));
174 |         assertTrue(reader.matchesIgnoreCase("One"));
175 |         assertTrue(reader.matchesIgnoreCase("one"));
176 |         assertEquals('O', reader.consume());
177 |         assertFalse(reader.matchesIgnoreCase("One"));
178 |         assertTrue(reader.matchesIgnoreCase("NE Two Three"));
179 |         assertFalse(reader.matchesIgnoreCase("ne Two Three Four"));
180 |         assertEquals("ne Two Three", reader.consumeToEnd());
181 |         assertFalse(reader.matchesIgnoreCase("ne"));
182 |     }
183 | 
184 |     @Test public void containsIgnoreCase() {
185 |         final CharacterReader reader = new CharacterReader("One TWO three");
186 |         assertTrue(reader.containsIgnoreCase("two"));
187 |         assertTrue(reader.containsIgnoreCase("three"));
188 |         // Quirk: "one" is reported as absent, because the scan only looks for consistently-cased forms and the input has "One".
189 |         assertFalse(reader.containsIgnoreCase("one"));
190 |     }
191 | 
192 |     @Test public void matchesAny() { // true only while the current character is one of the candidates
193 |         final char[] delimiters = {' ', '\n', '\t'};
194 |         final CharacterReader reader = new CharacterReader("One\nTwo\tThree");
195 |         assertFalse(reader.matchesAny(delimiters));
196 |         assertEquals("One", reader.consumeToAny(delimiters));
197 |         assertTrue(reader.matchesAny(delimiters));
198 |         assertEquals('\n', reader.consume());
199 |         assertFalse(reader.matchesAny(delimiters));
200 |     }
201 | 
202 | }
203 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/parser/TagTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | import org.junit.Test;
 4 | import static org.junit.Assert.*;
 5 | 
 6 | /**
 7 |  Tests for Tag lookup through Tag.valueOf and for the formatting flags of known and unknown tags.
 8 |  @author Jonathan Hedley, jonathan@hedley.net */
 9 | public class TagTest {
10 |     /** Names that differ only in case resolve to equal tags. */
11 |     @Test public void isCaseInsensitive() {
12 |         assertEquals(Tag.valueOf("P"), Tag.valueOf("p"));
13 |     }
14 | 
15 |     /** Surrounding whitespace in the name is ignored. */
16 |     @Test public void trims() {
17 |         assertEquals(Tag.valueOf("p"), Tag.valueOf(" p "));
18 |     }
19 | 
20 |     /** Two lookups of a known tag are equal and are the very same instance. */
21 |     @Test public void equality() {
22 |         Tag first = Tag.valueOf("p");
23 |         Tag second = Tag.valueOf("p");
24 |         assertEquals(first, second);
25 |         assertSame(first, second); // identity check that reports both values on failure
26 |     }
27 | 
28 |     /** div is a block tag and is formatted as a block. */
29 |     @Test public void divSemantics() {
30 |         Tag div = Tag.valueOf("div");
31 |         assertTrue(div.isBlock());
32 |         assertTrue(div.formatAsBlock());
33 |     }
34 | 
35 |     /** p is a block tag but is not formatted as a block. */
36 |     @Test public void pSemantics() {
37 |         Tag p = Tag.valueOf("p");
38 |         assertTrue(p.isBlock());
39 |         assertFalse(p.formatAsBlock());
40 |     }
41 | 
42 |     /** img is inline and self-closing, not a block. */
43 |     @Test public void imgSemantics() {
44 |         Tag img = Tag.valueOf("img");
45 |         assertTrue(img.isInline());
46 |         assertTrue(img.isSelfClosing());
47 |         assertFalse(img.isBlock());
48 |     }
49 | 
50 |     /** A tag name that is not predefined is still case-insensitive, and is inline yet formatted as a block. */
51 |     @Test public void defaultSemantics() {
52 |         Tag foo = Tag.valueOf("foo"); // not defined
53 |         assertEquals(foo, Tag.valueOf("FOO"));
54 |         assertTrue(foo.isInline());
55 |         assertTrue(foo.formatAsBlock());
56 |     }
57 | 
58 |     /** A null name is rejected with IllegalArgumentException. */
59 |     @Test(expected = IllegalArgumentException.class) public void valueOfChecksNotNull() {
60 |         Tag.valueOf(null);
61 |     }
62 | 
63 |     /** A whitespace-only name is rejected with IllegalArgumentException. */
64 |     @Test(expected = IllegalArgumentException.class) public void valueOfChecksNotEmpty() {
65 |         Tag.valueOf(" ");
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/parser/TokenQueueTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | import org.junit.Test;
 4 | import static org.junit.Assert.*;
 5 | 
 6 | /**
 7 |  * Token queue tests: balanced-bracket chomping, unescaping, case-insensitive search and re-queuing.
 8 |  */
 9 | public class TokenQueueTest {
10 | 
11 |     /** The chomped text spans the outer bracket pair, nested pair included; the closing bracket is consumed. */
12 |     @Test public void chompBalanced() {
13 |         TokenQueue queue = new TokenQueue(":contains(one (two) three) four");
14 |         assertEquals(":contains", queue.consumeTo("("));
15 |         assertEquals("one (two) three", queue.chompBalanced('(', ')'));
16 |         assertEquals(" four", queue.remainder());
17 |     }
18 | 
19 |     /**
20 |      * Backslash-escaped brackets do not affect the balance and come back still escaped;
21 |      * TokenQueue.unescape then strips the backslashes.
22 |      */
23 |     @Test public void chompEscapedBalanced() {
24 |         TokenQueue queue = new TokenQueue(":contains(one (two) \\( \\) \\) three) four");
25 |         assertEquals(":contains", queue.consumeTo("("));
26 |         String escaped = queue.chompBalanced('(', ')');
27 |         assertEquals("one (two) \\( \\) \\) three", escaped);
28 |         assertEquals("one (two) ( ) ) three", TokenQueue.unescape(escaped));
29 |         assertEquals(" four", queue.remainder());
30 |     }
31 | 
32 |     /** With no closing bracket at all, everything after the opening bracket is returned. */
33 |     @Test public void chompBalancedMatchesAsMuchAsPossible() {
34 |         TokenQueue queue = new TokenQueue("unbalanced(something(or another");
35 |         queue.consumeTo("(");
36 |         assertEquals("something(or another", queue.chompBalanced('(', ')'));
37 |     }
38 | 
39 |     /** Escaped brackets lose their backslash, and a doubled backslash becomes a single one. */
40 |     @Test public void unescape() {
41 |         assertEquals("one ( ) \\", TokenQueue.unescape("one \\( \\) \\\\"));
42 |     }
43 | 
44 |     /**
45 |      * The text before the sequence is returned whatever the case of the match;
46 |      * when the sequence never occurs, the whole input is returned.
47 |      */
48 |     @Test public void chompToIgnoreCase() {
49 |         TokenQueue closed = new TokenQueue("<textarea>one < two </TEXTarea>");
50 |         assertEquals("<textarea>one < two ", closed.chompToIgnoreCase("</textarea"));
51 | 
52 |         TokenQueue unclosed = new TokenQueue("<textarea> one two < three </oops>");
53 |         assertEquals("<textarea> one two < three </oops>", unclosed.chompToIgnoreCase("</textarea"));
54 |     }
55 | 
56 |     /** addFirst puts text in front of whatever has not been consumed yet. */
57 |     @Test public void addFirst() {
58 |         TokenQueue queue = new TokenQueue("One Two");
59 |         queue.consumeWord();
60 |         queue.addFirst("Three");
61 |         assertEquals("Three Two", queue.remainder());
62 |     }
63 | }
62 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.parser;
  2 | 
  3 | import org.jsoup.Jsoup;
  4 | import org.jsoup.TextUtil;
  5 | import org.jsoup.helper.StringUtil;
  6 | import org.jsoup.nodes.Document;
  7 | import org.jsoup.nodes.Element;
  8 | import org.jsoup.nodes.Node;
  9 | import org.jsoup.nodes.TextNode;
 10 | import org.junit.Ignore;
 11 | import org.junit.Test;
 12 | 
 13 | import java.io.File;
 14 | import java.io.FileInputStream;
 15 | import java.io.IOException;
 16 | import java.io.InputStream;
 17 | import java.net.URISyntaxException;
 18 | import java.util.List;
 19 | 
 20 | import static org.junit.Assert.assertEquals;
 21 | import static org.junit.Assert.assertNotSame;
 22 | 
 23 | /**
 24 |  * Tests XmlTreeBuilder.
 25 |  *
 26 |  * @author Jonathan Hedley
 27 |  */
 28 | public class XmlTreeBuilderTest {
 29 |     @Test
 30 |     public void testSimpleXmlParse() {
 31 |         String xml = "<doc id=2 href='/bar'>Foo <br /><link>One</link><link>Two</link></doc>";
 32 |         XmlTreeBuilder tb = new XmlTreeBuilder();
 33 |         Document doc = tb.parse(xml, "http://foo.com/");
 34 |         assertEquals("<doc id=\"2\" href=\"/bar\">Foo <br /><link>One</link><link>Two</link></doc>",
 35 |                 TextUtil.stripNewlines(doc.html()));
 36 |         assertEquals(doc.getElementById("2").absUrl("href"), "http://foo.com/bar");
 37 |     }
 38 | 
 39 |     @Test
 40 |     public void testPopToClose() {
 41 |         // test: </val> closes Two, </bar> ignored
 42 |         String xml = "<doc><val>One<val>Two</val></bar>Three</doc>";
 43 |         XmlTreeBuilder tb = new XmlTreeBuilder();
 44 |         Document doc = tb.parse(xml, "http://foo.com/");
 45 |         assertEquals("<doc><val>One<val>Two</val>Three</val></doc>",
 46 |                 TextUtil.stripNewlines(doc.html()));
 47 |     }
 48 | 
 49 |     @Test
 50 |     public void testCommentAndDocType() {
 51 |         String xml = "<!DOCTYPE html><!-- a comment -->One <qux />Two";
 52 |         XmlTreeBuilder tb = new XmlTreeBuilder();
 53 |         Document doc = tb.parse(xml, "http://foo.com/");
 54 |         assertEquals("<!DOCTYPE html><!-- a comment -->One <qux />Two",
 55 |                 TextUtil.stripNewlines(doc.html()));
 56 |     }
 57 | 
 58 |     @Test
 59 |     public void testSupplyParserToJsoupClass() {
 60 |         String xml = "<doc><val>One<val>Two</val></bar>Three</doc>";
 61 |         Document doc = Jsoup.parse(xml, "http://foo.com/", Parser.xmlParser());
 62 |         assertEquals("<doc><val>One<val>Two</val>Three</val></doc>",
 63 |                 TextUtil.stripNewlines(doc.html()));
 64 |     }
 65 | 
 66 |     @Ignore
 67 |     @Test
 68 |     public void testSupplyParserToConnection() throws IOException {
 69 |         String xmlUrl = "http://direct.infohound.net/tools/jsoup-xml-test.xml";
 70 | 
 71 |         // parse with both xml and html parser, ensure different
 72 |         Document xmlDoc = Jsoup.connect(xmlUrl).parser(Parser.xmlParser()).get();
 73 |         Document htmlDoc = Jsoup.connect(xmlUrl).get();
 74 | 
 75 |         assertEquals("<doc><val>One<val>Two</val>Three</val></doc>",
 76 |                 TextUtil.stripNewlines(xmlDoc.html()));
 77 |         assertNotSame(htmlDoc, xmlDoc);
 78 |         assertEquals(1, htmlDoc.select("head").size()); // html parser normalises
 79 |         assertEquals(0, xmlDoc.select("head").size()); // xml parser does not
 80 |     }
 81 | 
 82 |     @Test
 83 |     public void testSupplyParserToDataStream() throws IOException, URISyntaxException {
 84 |         File xmlFile = new File(XmlTreeBuilder.class.getResource("/htmltests/xml-test.xml").toURI());
 85 |         InputStream inStream = new FileInputStream(xmlFile);
 86 |         Document doc = Jsoup.parse(inStream, null, "http://foo.com", Parser.xmlParser());
 87 |         assertEquals("<doc><val>One<val>Two</val>Three</val></doc>",
 88 |                 TextUtil.stripNewlines(doc.html()));
 89 |     }
 90 | 
 91 |     @Test
 92 |     public void testDoesNotForceSelfClosingKnownTags() {
 93 |         // html will force "<br>one</br>" to "<br />One<br />". XML should be stay "<br>one</br> -- don't recognise tag.
 94 |         Document htmlDoc = Jsoup.parse("<br>one</br>");
 95 |         assertEquals("<br />one\n<br />", htmlDoc.body().html());
 96 | 
 97 |         Document xmlDoc = Jsoup.parse("<br>one</br>", "", Parser.xmlParser());
 98 |         assertEquals("<br>one</br>", xmlDoc.html());
 99 |     }
100 | 
101 |     @Test public void handlesXmlDeclarationAsDeclaration() {
102 |         String html = "<?xml encoding='UTF-8' ?><body>One</body><!-- comment -->";
103 |         Document doc = Jsoup.parse(html, "", Parser.xmlParser());
104 |         assertEquals("<?xml encoding='UTF-8' ?> <body> One </body> <!-- comment -->",
105 |                 StringUtil.normaliseWhitespace(doc.outerHtml()));
106 |         assertEquals("#declaration", doc.childNode(0).nodeName());
107 |         assertEquals("#comment", doc.childNode(2).nodeName());
108 |     }
109 | 
110 |     @Test public void xmlFragment() {
111 |         String xml = "<one src='/foo/' />Two<three><four /></three>";
112 |         List<Node> nodes = Parser.parseXmlFragment(xml, "http://example.com/");
113 |         assertEquals(3, nodes.size());
114 | 
115 |         assertEquals("http://example.com/foo/", nodes.get(0).absUrl("src"));
116 |         assertEquals("one", nodes.get(0).nodeName());
117 |         assertEquals("Two", ((TextNode)nodes.get(1)).text());
118 |     }
119 | }
120 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/select/CssTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.select;
  2 | 
  3 | import static org.junit.Assert.*;
  4 | 
  5 | import org.jsoup.Jsoup;
  6 | import org.jsoup.nodes.Document;
  7 | import org.jsoup.parser.Tag;
  8 | import org.junit.Before;
  9 | import org.junit.BeforeClass;
 10 | import org.junit.Test;
 11 | 
 12 | public class CssTest {
 13 | 	// Structural pseudo-class selectors (:nth-child, :first-of-type, :empty, :root, ...) run against one fixed document.
 14 | 	private Document html = null;
 15 | 	private static String htmlString;
 16 | 
 17 | 	/** Builds the fixture markup once. Numbers are appended directly: String.format's %d output follows the default locale. */
 18 | 	@BeforeClass public static void initClass() {
 19 | 		StringBuilder sb = new StringBuilder("<html><head></head><body>");
 20 | 
 21 | 		sb.append("<div id='pseudo'>");
 22 | 		for (int i = 1; i <= 10; i++) {
 23 | 			sb.append("<p>").append(i).append("</p>");
 24 | 		}
 25 | 		sb.append("</div>");
 26 | 
 27 | 		sb.append("<div id='type'>");
 28 | 		for (int i = 1; i <= 10; i++) {
 29 | 			sb.append("<p>").append(i).append("</p>");
 30 | 			sb.append("<span>").append(i).append("</span>");
 31 | 			sb.append("<em>").append(i).append("</em>");
 32 | 		}
 33 | 		sb.append("</div>");
 34 | 
 35 | 		sb.append("<span id='onlySpan'><br /></span>");
 36 | 		sb.append("<p class='empty'><!-- Comment only is still empty! --></p>");
 37 | 
 38 | 		sb.append("<div id='only'>");
 39 | 		sb.append("Some text before the <em>only</em> child in this div");
 40 | 		sb.append("</div>");
 41 | 
 42 | 		sb.append("</body></html>");
 43 | 		htmlString = sb.toString();
 44 | 	}
 45 | 
 46 | 	/** Parses a fresh copy of the fixture before every test. */
 47 | 	@Before public void init() {
 48 | 		html  = Jsoup.parse(htmlString);
 49 | 	}
 50 | 
 51 | 	/** :first-child matches the first paragraph of #pseudo; html:first-child is expected to match nothing. */
 52 | 	@Test public void firstChild() {
 53 | 		check(html.select("#pseudo :first-child"), "1");
 54 | 		check(html.select("html:first-child"));
 55 | 	}
 56 | 
 57 | 	/** :last-child matches the tenth paragraph of #pseudo; html:last-child is expected to match nothing. */
 58 | 	@Test public void lastChild() {
 59 | 		check(html.select("#pseudo :last-child"), "10");
 60 | 		check(html.select("html:last-child"));
 61 | 	}
 62 | 
 63 | 	/** :nth-child(i) with a plain index selects exactly the i-th paragraph. */
 64 | 	@Test public void nthChild_simple() {
 65 | 		for(int i = 1; i <=10; i++) {
 66 | 			check(html.select("#pseudo :nth-child(" + i + ")"), String.valueOf(i));
 67 | 		}
 68 | 	}
 69 | 
 70 | 	/** :nth-last-child(i) counts from the end: with ten paragraphs
 71 | 	 *  it selects paragraph 11 - i. */
 72 | 	@Test public void nthLastChild_simple() {
 73 | 		for(int i = 1; i <=10; i++) {
 74 | 			check(html.select("#pseudo :nth-last-child(" + i + ")"), String.valueOf(11-i));
 75 | 		}
 76 | 	}
 77 | 
 78 | 	/** p:nth-of-type(i) selects the i-th paragraph even though spans and ems are interleaved. */
 79 | 	@Test public void nthOfType_simple() {
 80 | 		for(int i = 1; i <=10; i++) {
 81 | 			check(html.select("#type p:nth-of-type(" + i + ")"), String.valueOf(i));
 82 | 		}
 83 | 	}
 84 | 
 85 | 	/** Without a tag name, :nth-last-of-type(i) yields one p, one span and one em, each the i-th from the end. */
 86 | 	@Test public void nthLastOfType_simple() {
 87 | 		for(int i = 1; i <=10; i++) {
 88 | 			check(html.select("#type :nth-last-of-type(" + i + ")"), String.valueOf(11-i),String.valueOf(11-i),String.valueOf(11-i));
 89 | 		}
 90 | 	}
 91 | 
 92 | 	/** :nth-child with an+b expressions, the odd/even keywords and signed plain indexes; a negative index matches nothing. */
 93 | 	@Test public void nthChild_advanced() {
 94 | 		check(html.select("#pseudo :nth-child(-5)"));
 95 | 		check(html.select("#pseudo :nth-child(odd)"), "1", "3", "5", "7", "9");
 96 | 		check(html.select("#pseudo :nth-child(2n-1)"), "1", "3", "5", "7", "9");
 97 | 		check(html.select("#pseudo :nth-child(2n+1)"), "1", "3", "5", "7", "9");
 98 | 		check(html.select("#pseudo :nth-child(2n+3)"), "3", "5", "7", "9");
 99 | 		check(html.select("#pseudo :nth-child(even)"), "2", "4", "6", "8", "10");
100 | 		check(html.select("#pseudo :nth-child(2n)"), "2", "4", "6", "8", "10");
101 | 		check(html.select("#pseudo :nth-child(3n-1)"), "2", "5", "8");
102 | 		check(html.select("#pseudo :nth-child(-2n+5)"), "1", "3", "5");
103 | 		check(html.select("#pseudo :nth-child(+5)"), "5");
104 | 	}
105 | 
106 | 	/** The same an+b, odd/even and signed-index forms, applied per tag name through :nth-of-type. */
107 | 	@Test public void nthOfType_advanced() {
108 | 		check(html.select("#type :nth-of-type(-5)"));
109 | 		check(html.select("#type p:nth-of-type(odd)"), "1", "3", "5", "7", "9");
110 | 		check(html.select("#type em:nth-of-type(2n-1)"), "1", "3", "5", "7", "9");
111 | 		check(html.select("#type p:nth-of-type(2n+1)"), "1", "3", "5", "7", "9");
112 | 		check(html.select("#type span:nth-of-type(2n+3)"), "3", "5", "7", "9");
113 | 		check(html.select("#type p:nth-of-type(even)"), "2", "4", "6", "8", "10");
114 | 		check(html.select("#type p:nth-of-type(2n)"), "2", "4", "6", "8", "10");
115 | 		check(html.select("#type p:nth-of-type(3n-1)"), "2", "5", "8");
116 | 		check(html.select("#type p:nth-of-type(-2n+5)"), "1", "3", "5");
117 | 		check(html.select("#type :nth-of-type(+5)"), "5", "5", "5");
118 | 	}
119 | 
120 | 	/** :nth-last-child with an+b expressions: positions count from the end,
121 | 	 *  while the matches are still listed in document order. */
122 | 	@Test public void nthLastChild_advanced() {
123 | 		check(html.select("#pseudo :nth-last-child(-5)"));
124 | 		check(html.select("#pseudo :nth-last-child(odd)"), "2", "4", "6", "8", "10");
125 | 		check(html.select("#pseudo :nth-last-child(2n-1)"), "2", "4", "6", "8", "10");
126 | 		check(html.select("#pseudo :nth-last-child(2n+1)"), "2", "4", "6", "8", "10");
127 | 		check(html.select("#pseudo :nth-last-child(2n+3)"), "2", "4", "6", "8");
128 | 		check(html.select("#pseudo :nth-last-child(even)"), "1", "3", "5", "7", "9");
129 | 		check(html.select("#pseudo :nth-last-child(2n)"), "1", "3", "5", "7", "9");
130 | 		check(html.select("#pseudo :nth-last-child(3n-1)"), "3", "6", "9");
131 | 
132 | 		check(html.select("#pseudo :nth-last-child(-2n+5)"), "6", "8", "10");
133 | 		check(html.select("#pseudo :nth-last-child(+5)"), "6");
134 | 	}
135 | 
136 | 	/** :nth-last-of-type with the same expressions, counted from the end among siblings of the same tag. */
137 | 	@Test public void nthLastOfType_advanced() {
138 | 		check(html.select("#type :nth-last-of-type(-5)"));
139 | 		check(html.select("#type p:nth-last-of-type(odd)"), "2", "4", "6", "8", "10");
140 | 		check(html.select("#type em:nth-last-of-type(2n-1)"), "2", "4", "6", "8", "10");
141 | 		check(html.select("#type p:nth-last-of-type(2n+1)"), "2", "4", "6", "8", "10");
142 | 		check(html.select("#type span:nth-last-of-type(2n+3)"), "2", "4", "6", "8");
143 | 		check(html.select("#type p:nth-last-of-type(even)"), "1", "3", "5", "7", "9");
144 | 		check(html.select("#type p:nth-last-of-type(2n)"), "1", "3", "5", "7", "9");
145 | 		check(html.select("#type p:nth-last-of-type(3n-1)"), "3", "6", "9");
146 | 
147 | 		check(html.select("#type span:nth-last-of-type(-2n+5)"), "6", "8", "10");
148 | 		check(html.select("#type :nth-last-of-type(+5)"), "6", "6", "6");
149 | 	}
150 | 
151 | 	/** :first-of-type, leaving out #only: the first p of #pseudo plus the first p, span and em of #type. */
152 | 	@Test public void firstOfType() {
153 | 		check(html.select("div:not(#only) :first-of-type"), "1", "1", "1", "1");
154 | 	}
155 | 
156 | 	/** :last-of-type, leaving out #only: the last p of #pseudo plus the last p, span and em of #type. */
157 | 	@Test public void lastOfType() {
158 | 		check(html.select("div:not(#only) :last-of-type"), "10", "10", "10", "10");
159 | 	}
160 | 
161 | 	/** :empty matches head, the br, and the paragraph that holds nothing but a comment. */
162 | 	@Test public void empty() {
163 | 		final Elements sel = html.select(":empty");
164 | 		assertEquals(3, sel.size());
165 | 		assertEquals("head", sel.get(0).tagName());
166 | 		assertEquals("br", sel.get(1).tagName());
167 | 		assertEquals("p", sel.get(2).tagName());
168 | 	}
169 | 
170 | 	/** :only-child: the br inside the span, and the em in #only, where the surrounding text does not count as a sibling. */
171 | 	@Test public void onlyChild() {
172 | 		final Elements sel = html.select("span :only-child");
173 | 		assertEquals(1, sel.size());
174 | 		assertEquals("br", sel.get(0).tagName());
175 | 
176 | 		check(html.select("#only :only-child"), "only");
177 | 	}
178 | 
179 | 	/** :only-of-type over the whole document yields six elements, checked here in the order they are returned. */
180 | 	@Test public void onlyOfType() {
181 | 		final Elements sel = html.select(":only-of-type");
182 | 		assertEquals(6, sel.size());
183 | 		assertEquals("head", sel.get(0).tagName());
184 | 		assertEquals("body", sel.get(1).tagName());
185 | 		assertEquals("span", sel.get(2).tagName());
186 | 		assertEquals("br", sel.get(3).tagName());
187 | 		assertEquals("p", sel.get(4).tagName());
188 | 		assertTrue(sel.get(4).hasClass("empty"));
189 | 		assertEquals("em", sel.get(5).tagName());
190 | 	}
191 | 
192 | 	/** Asserts that result holds exactly one element per expected string and that each element's own text equals it, in order. */
193 | 	protected void check(Elements result, String...expectedContent ) {
194 | 		assertEquals("Number of elements", expectedContent.length, result.size());
195 | 		for (int i = 0; i < expectedContent.length; i++) {
196 | 			assertNotNull(result.get(i));
197 | 			assertEquals("Expected element",expectedContent[i], result.get(i).ownText());
198 | 		}
199 | 	}
200 | 
201 | 	/** :root is the html element when selecting on the document, and body when selecting on the Elements holding body. */
202 | 	@Test public void root() {
203 | 		Elements sel = html.select(":root");
204 | 		assertEquals(1, sel.size());
205 | 		assertNotNull(sel.get(0));
206 | 		assertEquals(Tag.valueOf("html"), sel.get(0).tag());
207 | 
208 | 		Elements sel2 = html.select("body").select(":root");
209 | 		assertEquals(1, sel2.size());
210 | 		assertNotNull(sel2.get(0));
211 | 		assertEquals(Tag.valueOf("body"), sel2.get(0).tag());
212 | 	}
213 | 
214 | }
215 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/select/QueryParserTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.select;
 2 | 
 3 | import org.junit.Test;
 4 | import static org.junit.Assert.*;
 5 | 
 6 | /**
 7 |  * Tests for the Selector Query Parser.
 8 |  * @author Jonathan Hedley
 9 |  */
10 | public class QueryParserTest {
11 |     /** A comma-separated selector list parses to one top-level Or whose members are two-part Ands. */
12 |     @Test public void testOrGetsCorrectPrecedence() {
13 |         // tests that a selector "a b, c d, e f" evals to (a AND b) OR (c AND d) OR (e AND f)
14 |         // top level or, three child ands
15 |         Evaluator eval = QueryParser.parse("a b, c d, e f");
16 |         assertTrue(eval instanceof CombiningEvaluator.Or);
17 |         CombiningEvaluator.Or or = (CombiningEvaluator.Or) eval;
18 |         assertEquals(3, or.evaluators.size());
19 |         for (Evaluator innerEval: or.evaluators) {
20 |             assertTrue(innerEval instanceof CombiningEvaluator.And);
21 |             CombiningEvaluator.And and = (CombiningEvaluator.And) innerEval;
22 |             assertEquals(2, and.evaluators.size());
23 |             assertTrue(and.evaluators.get(0) instanceof Evaluator.Tag);
24 |             assertTrue(and.evaluators.get(1) instanceof StructuralEvaluator.Parent);
25 |         }
26 |     }
27 | 
28 |     /** Combinators on either side of a comma stay within their own alternative; both branches are checked for text and size. */
29 |     @Test public void testParsesMultiCorrectly() {
30 |         Evaluator eval = QueryParser.parse(".foo > ol, ol > li + li");
31 |         assertTrue(eval instanceof CombiningEvaluator.Or);
32 |         CombiningEvaluator.Or or = (CombiningEvaluator.Or) eval;
33 |         assertEquals(2, or.evaluators.size());
34 | 
35 |         CombiningEvaluator.And andLeft = (CombiningEvaluator.And) or.evaluators.get(0);
36 |         CombiningEvaluator.And andRight = (CombiningEvaluator.And) or.evaluators.get(1);
37 |         assertEquals("ol :ImmediateParent.foo", andLeft.toString());
38 |         assertEquals(2, andLeft.evaluators.size());
39 |         assertEquals("li :prevli :ImmediateParentol", andRight.toString());
40 |         assertEquals(2, andRight.evaluators.size()); // the right branch gets its own size check
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/src/test/resources/htmltests/README:
--------------------------------------------------------------------------------
 1 | Note
 2 | ====
 3 | 
 4 | The HTML files in this directory (htmltests) are intended to be used for testing the Jsoup parser and improving its
 5 | interoperability with real-world published HTML. These files are not distributed in the core Jsoup library.
 6 | 
 7 | These files remain the copyright of the original owner.
 8 | 
 9 | If you are the copyright holder and do not wish your works to be used in this manner, please contact Jonathan Hedley
10 | (jonathan@hedley.net) and your works will be removed from this test-suite.
11 | 
12 | Sources
13 | ========
14 | 
15 | * yahoo-article-1.html    http://news.yahoo.com/s/nm/20100831/bs_nm/us_gm_china 1-Sep-2010
16 | * smh-biz-article-1.html  http://www.smh.com.au/business/the-boards-next-fear-the-female-quota-20100106-lteq.html
17 | * news-com-au-home.html   http://www.news.com.au/	11-Jan-2010
18 | * google-ipod.html		  http://www.google.com/search?hl=en&q=ipod&aq=f&oq=&aqi=g10	11-Jan-2010
19 | * yahoo-jp.html			  http://www.yahoo.co.jp/index.html	12-Jan-2010
20 | * baidu-cn-home.html	  http://www.baidu.com/ 15-Jul-2010
21 | * nyt-article-1.html      http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp
22 | 


--------------------------------------------------------------------------------
/src/test/resources/htmltests/baidu-cn-home.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/src/test/resources/htmltests/baidu-cn-home.html


--------------------------------------------------------------------------------
/src/test/resources/htmltests/baidu-variant.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/src/test/resources/htmltests/baidu-variant.html


--------------------------------------------------------------------------------
/src/test/resources/htmltests/meta-charset-1.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/src/test/resources/htmltests/meta-charset-1.html


--------------------------------------------------------------------------------
/src/test/resources/htmltests/meta-charset-2.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/src/test/resources/htmltests/meta-charset-2.html


--------------------------------------------------------------------------------
/src/test/resources/htmltests/meta-charset-3.html:
--------------------------------------------------------------------------------
1 | <html>
2 | <head></head>
3 | <body>新</body>
4 | </html>


--------------------------------------------------------------------------------
/src/test/resources/htmltests/thumb.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code4craft/jsoup-learning/2c0580fdd895cabedeb5eee14241bf511270dc61/src/test/resources/htmltests/thumb.jpg


--------------------------------------------------------------------------------
/src/test/resources/htmltests/xml-test.xml:
--------------------------------------------------------------------------------
1 | <doc><val>One<val>Two</val>Three</val></doc>
2 | 


--------------------------------------------------------------------------------