├── .gitignore
├── .travis.yml
├── CHANGES
├── LICENSE
├── README.md
├── common.png
├── jsoup01.png
├── pom.xml
└── src
    ├── main
        ├── java
        │   └── org
        │   │   └── jsoup
        │   │       ├── Connection.java
        │   │       ├── HttpStatusException.java
        │   │       ├── Jsoup.java
        │   │       ├── SerializationException.java
        │   │       ├── UnsupportedMimeTypeException.java
        │   │       ├── examples
        │   │           ├── HtmlToPlainText.java
        │   │           ├── ListLinks.java
        │   │           └── package-info.java
        │   │       ├── helper
        │   │           ├── DataUtil.java
        │   │           ├── DescendableLinkedList.java
        │   │           ├── HttpConnection.java
        │   │           ├── StringUtil.java
        │   │           ├── Validate.java
        │   │           └── W3CDom.java
        │   │       ├── nodes
        │   │           ├── Attribute.java
        │   │           ├── Attributes.java
        │   │           ├── BooleanAttribute.java
        │   │           ├── Comment.java
        │   │           ├── DataNode.java
        │   │           ├── Document.java
        │   │           ├── DocumentType.java
        │   │           ├── Element.java
        │   │           ├── Entities.java
        │   │           ├── FormElement.java
        │   │           ├── Node.java
        │   │           ├── TextNode.java
        │   │           ├── XmlDeclaration.java
        │   │           ├── entities-base.properties
        │   │           ├── entities-full.properties
        │   │           ├── entities-xhtml.properties
        │   │           └── package-info.java
        │   │       ├── package-info.java
        │   │       ├── parser
        │   │           ├── CharacterReader.java
        │   │           ├── HtmlTreeBuilder.java
        │   │           ├── HtmlTreeBuilderState.java
        │   │           ├── ParseError.java
        │   │           ├── ParseErrorList.java
        │   │           ├── ParseSettings.java
        │   │           ├── Parser.java
        │   │           ├── Tag.java
        │   │           ├── Token.java
        │   │           ├── TokenQueue.java
        │   │           ├── Tokeniser.java
        │   │           ├── TokeniserState.java
        │   │           ├── TreeBuilder.java
        │   │           ├── XmlTreeBuilder.java
        │   │           └── package-info.java
        │   │       ├── safety
        │   │           ├── Cleaner.java
        │   │           ├── Whitelist.java
        │   │           └── package-info.java
        │   │       └── select
        │   │           ├── Collector.java
        │   │           ├── CombiningEvaluator.java
        │   │           ├── Elements.java
        │   │           ├── Evaluator.java
        │   │           ├── NodeTraversor.java
        │   │           ├── NodeVisitor.java
        │   │           ├── QueryParser.java
        │   │           ├── Selector.java
        │   │           ├── StructuralEvaluator.java
        │   │           └── package-info.java
        └── javadoc
        │   └── overview.html
    └── test
        ├── java
            └── org
            │   └── jsoup
            │       ├── TextUtil.java
            │       ├── helper
            │           ├── DataUtilTest.java
            │           ├── HttpConnectionTest.java
            │           ├── StringUtilTest.java
            │           └── W3CDomTest.java
            │       ├── integration
            │           ├── Benchmark.java
            │           ├── ParseTest.java
            │           └── UrlConnectTest.java
            │       ├── nodes
            │           ├── AttributeTest.java
            │           ├── AttributesTest.java
            │           ├── BuildEntities.java
            │           ├── DocumentTest.java
            │           ├── DocumentTypeTest.java
            │           ├── ElementTest.java
            │           ├── EntitiesTest.java
            │           ├── FormElementTest.java
            │           ├── NodeTest.java
            │           └── TextNodeTest.java
            │       ├── parser
            │           ├── AttributeParseTest.java
            │           ├── CharacterReaderTest.java
            │           ├── HtmlParserTest.java
            │           ├── ParserSettingsTest.java
            │           ├── TagTest.java
            │           ├── TokenQueueTest.java
            │           └── XmlTreeBuilderTest.java
            │       ├── safety
            │           └── CleanerTest.java
            │       └── select
            │           ├── CssTest.java
            │           ├── ElementsTest.java
            │           ├── QueryParserTest.java
            │           └── SelectorTest.java
        └── resources
            ├── bomtests
                ├── bom_utf16be.html
                ├── bom_utf16le.html
                ├── bom_utf32be.html
                └── bom_utf32le.html
            └── htmltests
                ├── README
                ├── baidu-cn-home.html
                ├── baidu-variant.html
                ├── google-ipod.html
                ├── meta-charset-1.html
                ├── meta-charset-2.html
                ├── meta-charset-3.html
                ├── namespaces.xhtml
                ├── news-com-au-home.html
                ├── nyt-article-1.html
                ├── smh-biz-article-1.html
                ├── table-invalid-elements.html
                ├── thumb.jpg
                ├── xml-charset.xml
                ├── xml-test.xml
                ├── yahoo-article-1.html
                └── yahoo-jp.html


/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea/
 2 | jsoup.iml
 3 | jsoup.ipr
 4 | jsoup.iws
 5 | target/
 6 | .classpath
 7 | .project
 8 | .settings/
 9 | *Thrash*
10 | 
11 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: java
 2 | 
 3 | jdk:
 4 |     - openjdk6
 5 |     - openjdk7
 6 |     - oraclejdk7
 7 |     - oraclejdk8
 8 | 
 9 | cache:
10 |     directories:
11 |         - $HOME/.m2
12 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License
 2 | 
 3 | © 2009-2017, Jonathan Hedley <jonathan@hedley.net>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 开源项目Jsoup使用简介
  2 | 
  3 | jsoup 是一款 Java 的 HTML 解析器，可通过 DOM、CSS 选择器以及类似于 jQuery 的操作方法来提取和操作 HTML 文档数据。
  4 | 
  5 | 
  6 | 开源地址：[https://github.com/open-android/Jsoup](https://github.com/open-android/Jsoup "开源项目地址")
  7 | 
  8 | * [配套视频](https://www.boxuegu.com/web/html/video.html?courseId=172&sectionId=8a2c9bed5a3a4c7e015a4aa700eb0a2a&chapterId=8a2c9bed5a3a4c7e015a4aa767150a2b&vId=8a2c9bed5a3a4c7e015a4aa7ad870a2c&videoId=D9C78456B7F047A79C33DC5901307461)
  9 | 
 10 | * 爱生活,爱学习,更爱做代码的搬运工,分类查找更方便请下载黑马助手app
 11 | 
 12 | ![黑马助手.png](http://upload-images.jianshu.io/upload_images/4037105-f777f1214328dcc4.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
 13 | 
 14 | # 使用效果
 15 | ![](jsoup01.png)
 16 | 
 17 | ## 使用步骤
 18 | 
 19 | ### 1. 在project的build.gradle添加如下代码(如下图)
 20 | 
 21 | 	allprojects {
 22 | 	    repositories {
 23 | 	        ...
 24 | 	        maven { url "https://jitpack.io" }
 25 | 	    }
 26 | 	}
 27 | 
 28 | ![](common.png)
 29 | 
 30 | ### 2. 在Module的build.gradle添加依赖
 31 | 	
 32 | 	compile 'com.github.open-android:Jsoup:jsoup-1.10.2'
 33 | 
 34 | ### 3. 演示步骤
 35 | 
 36 | * a.测试用html内容如下
 37 | 
 38 | 
 39 | 		<html>
 40 | 		 <head>
 41 | 		  <title>First parse</title>
 42 | 		 </head>
 43 | 		 <body>
 44 | 		  <p align="center">attribute parse</p>
 45 | 		  <p>text parse</p>
 46 | 		 </body>
 47 | 		</html>
 48 | 
 49 | * b.将演示代码复制到Activity的onCreate方法中
 50 | 
 51 |         //测试用html字符串
 52 |         String html = "<html><head><title>First parse</title></head>"
 53 |                 + "<body><p align=\"center\">attribute parse</p>"
 54 |                 + "<p>text parse</p></body></html>";
 55 | 
 56 |         //Jsoup解析获得Document对象
 57 |         Document doc = Jsoup.parse(html);
 58 | 
 59 |         System.out.println("解析出来的html:\n"+doc.toString());
 60 | 
 61 | 
 62 |         //获得head元素对象
 63 |         Element head = doc.head();
 64 | 
 65 |         //DOM方式获得第一个title元素
 66 |         Element title = head.getElementsByTag("title").first();
 67 | 
 68 |         //获得title元素中文本
 69 |         String text = title.text();
 70 |         System.out.println("title标签中文本: " + text);
 71 | 
 72 | 
 73 |         //---------------------------------------
 74 | 
 75 | 
 76 |         //获得body元素对象
 77 |         Element body = doc.body();
 78 | 
 79 |         //选择器语法查找p元素
 80 |         Elements lists = body.select("p");
 81 | 
 82 |         //遍历所有p元素，输出p元素文本
 83 |         for(Element p : lists){
 84 |             System.out.println("p元素文本: " + p.text());
 85 |         }
 86 | 
 87 | 
 88 |         //选择器语法查找第一个拥有align属性的p元素
 89 |         Element pElement = body.select("p[align]").first();
 90 | 
 91 |         //获得p元素align属性值
 92 |         String align = pElement.attr("align");
 93 |         System.out.println("p元素align属性值: " + align);
 94 | 
 95 | 
 96 | > 注意：如果要解析指定 URL 的网页，需要在 AndroidManifest.xml 中添加网络访问权限：
 97 | > `<uses-permission android:name="android.permission.INTERNET" />`
 98 | 
 99 | * 欢迎关注微信公众号
100 | 
101 | ![](http://upload-images.jianshu.io/upload_images/4037105-8f737b5104dd0b5d.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
102 | 


--------------------------------------------------------------------------------
/common.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/common.png


--------------------------------------------------------------------------------
/jsoup01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/jsoup01.png


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 |   <name>jsoup Java HTML Parser</name>
  5 | 
  6 |   <groupId>org.jsoup</groupId>
  7 |   <artifactId>jsoup</artifactId>
  8 |   <version>1.10.3-SNAPSHOT</version>
  9 |   <description>jsoup is a Java library for working with real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and jquery-like methods. jsoup implements the WHATWG HTML5 specification, and parses HTML to the same DOM as modern browsers do.</description>
 10 |   <url>https://jsoup.org/</url>
 11 |   <inceptionYear>2009</inceptionYear>
 12 |   <issueManagement>
 13 |   	<system>GitHub</system>
 14 |   	<url>http://github.com/jhy/jsoup/issues</url>
 15 |   </issueManagement>
 16 |   <licenses>
 17 |   	<license>
 18 |   		<name>The MIT License</name>
 19 |   		<url>https://jsoup.org/license</url>
 20 |   		<distribution>repo</distribution>
 21 |   	</license>
 22 |   </licenses>
 23 |   <scm>
 24 |   	<url>https://github.com/jhy/jsoup</url>
 25 |     <connection>scm:git:https://github.com/jhy/jsoup.git</connection>
 26 |     <!-- <developerConnection>scm:git:git@github.com:jhy/jsoup.git</developerConnection> -->
 27 |     <tag>HEAD</tag>
 28 |   </scm>
 29 |   <organization>
 30 |   	<name>Jonathan Hedley</name>
 31 |   	<url>http://jonathanhedley.com/</url>
 32 |   </organization>
 33 | 
 34 |   <build>
 35 |     <plugins>
 36 |       <plugin>
 37 |         <groupId>org.apache.maven.plugins</groupId>
 38 |         <artifactId>maven-compiler-plugin</artifactId>
 39 |         <version>3.5.1</version>
 40 |         <configuration>
 41 |           <source>1.5</source>
 42 |           <target>1.5</target>
 43 |           <encoding>UTF-8</encoding>
 44 |         </configuration>
 45 |       </plugin>
 46 |       <plugin>
 47 |       	<!-- this plugin allows us to ensure Java 5 API compatibility -->
 48 |         <groupId>org.codehaus.mojo</groupId>
 49 |         <artifactId>animal-sniffer-maven-plugin</artifactId>
 50 |         <version>1.15</version>
 51 |         <executions>
 52 |           <execution>
 53 |             <id>animal-sniffer</id>
 54 |             <phase>compile</phase>
 55 |             <goals>
 56 |               <goal>check</goal>
 57 |             </goals>
 58 |             <configuration>
 59 |               <signature>
 60 |                 <groupId>org.codehaus.mojo.signature</groupId>
 61 |                 <artifactId>java15</artifactId>
 62 |                 <version>1.0</version>
 63 |               </signature>
 64 |             </configuration>
 65 |           </execution>
 66 |         </executions>
 67 |       </plugin>
 68 |       <plugin>
 69 |         <groupId>org.apache.maven.plugins</groupId>
 70 |         <artifactId>maven-javadoc-plugin</artifactId>
 71 |         <version>2.10.4</version>
 72 |         <configuration>
 73 |           <additionalparam>-Xdoclint:none</additionalparam>
 74 |         </configuration>
 75 |         <executions>
 76 |           <execution>
 77 |             <id>attach-javadoc</id>
 78 |             <phase>verify</phase>
 79 |             <goals>
 80 |               <goal>jar</goal>
 81 |             </goals>
 82 |           </execution>
 83 |         </executions>
 84 |       </plugin>
 85 |       <plugin>
 86 |         <groupId>org.apache.maven.plugins</groupId>
 87 |         <artifactId>maven-source-plugin</artifactId>
 88 |         <version>3.0.1</version>
 89 |         <configuration>
 90 |         </configuration>
 91 |         <executions>
 92 |           <execution>
 93 |             <id>attach-sources</id>
 94 |             <phase>verify</phase>
 95 |             <goals>
 96 |               <goal>jar</goal>
 97 |             </goals>
 98 |           </execution>
 99 |         </executions>
100 |       </plugin>
101 |       <plugin>
102 |         <groupId>org.apache.maven.plugins</groupId>
103 |         <artifactId>maven-jar-plugin</artifactId>
104 |         <version>3.0.2</version>
105 |         <configuration>
106 |           <archive>
107 |             <manifestFile>${project.build.outputDirectory}/META-INF/MANIFEST.MF</manifestFile>
108 |           </archive>
109 |         </configuration>
110 |       </plugin>
111 |       <plugin>
112 |         <groupId>org.apache.felix</groupId>
113 |         <artifactId>maven-bundle-plugin</artifactId>
114 |         <version>2.5.4</version>
115 |         <executions>
116 |           <execution>
117 |             <id>bundle-manifest</id>
118 |             <phase>process-classes</phase>
119 |             <goals>
120 |               <goal>manifest</goal>
121 |             </goals>
122 |           </execution>
123 |         </executions>
124 |         <configuration>
125 |           <instructions>
126 |             <Bundle-DocURL>https://jsoup.org/</Bundle-DocURL>
127 |           </instructions>
128 |         </configuration>
129 |       </plugin>
130 |       <plugin>
131 |         <groupId>org.apache.maven.plugins</groupId>
132 |         <artifactId>maven-resources-plugin</artifactId>
133 |         <version>3.0.1</version>
134 |       </plugin>
135 |       <plugin>
136 |         <artifactId>maven-release-plugin</artifactId>
137 |         <version>2.5.3</version>
138 |       </plugin>
139 |     </plugins>
140 |     <resources>
141 |       <resource>
142 |         <directory>src/main/java</directory>
143 |         <includes>
144 |           <include>**/*.properties</include>
145 |         </includes>
146 |       </resource>
147 |       <resource>
148 |         <directory>./</directory>
149 |         <targetPath>META-INF/</targetPath>
150 |         <filtering>false</filtering>
151 |         <includes>
152 |           <include>LICENSE</include>
153 |           <include>README.md</include>
154 |           <include>CHANGES</include>
155 |         </includes>
156 |       </resource>
157 |     </resources>
158 |   </build>
159 | 
160 |   <distributionManagement>
161 |     <snapshotRepository>
162 |       <id>sonatype-nexus-snapshots</id>
163 |       <name>Sonatype Nexus Snapshots</name>
164 |       <url>https://oss.sonatype.org/content/repositories/snapshots</url>
165 |     </snapshotRepository>
166 |     <repository>
167 |       <id>sonatype-nexus-staging</id>
168 |       <name>Nexus Release Repository</name>
169 |       <url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
170 |     </repository>
171 |   </distributionManagement>
172 | 
173 |   <profiles>
174 |     <profile>
175 |       <id>release-sign-artifacts</id>
176 |       <activation>
177 |         <property>
178 |           <name>performRelease</name>
179 |           <value>true</value>
180 |         </property>
181 |       </activation>
182 |       <build>
183 |         <plugins>
184 |           <plugin>
185 |             <groupId>org.apache.maven.plugins</groupId>
186 |             <artifactId>maven-gpg-plugin</artifactId>
187 |             <executions>
188 |               <execution>
189 |                 <id>sign-artifacts</id>
190 |                 <phase>verify</phase>
191 |                 <goals>
192 |                   <goal>sign</goal>
193 |                 </goals>
194 |               </execution>
195 |             </executions>
196 |           </plugin>
197 |         </plugins>
198 |       </build>
199 |     </profile>
200 |   </profiles>
201 | 
202 |   <dependencies>
203 | 
204 |     <dependency>
205 |       <!-- junit -->
206 |       <groupId>junit</groupId>
207 |       <artifactId>junit</artifactId>
208 |       <version>4.12</version>
209 |       <scope>test</scope>
210 |     </dependency>
211 | 
212 |     <dependency>
213 |       <!-- gson, to fetch entities from w3.org -->
214 |       <groupId>com.google.code.gson</groupId>
215 |       <artifactId>gson</artifactId>
216 |       <version>2.7</version>
217 |       <scope>test</scope>
218 |     </dependency>
219 | 
220 |   </dependencies>
221 | 
222 |   <dependencyManagement>
223 |   	<dependencies>
224 |   	</dependencies>
225 |   </dependencyManagement>
226 | 
227 |   <properties>
228 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
229 |   </properties>
230 | 
231 |   <developers>
232 |     <developer>
233 |       <id>jhy</id>
234 |       <name>Jonathan Hedley</name>
235 |       <email>jonathan@hedley.net</email>
236 |       <roles>
237 |         <role>Lead Developer</role>
238 |       </roles>
239 |       <timezone>+11</timezone>
240 |     </developer>
241 |   </developers>
242 | 
243 | </project>
244 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/HttpStatusException.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | /**
 6 |  * Signals that a HTTP request resulted in a not OK HTTP response.
 7 |  */
 8 | public class HttpStatusException extends IOException {
 9 |     private int statusCode;
10 |     private String url;
11 | 
12 |     public HttpStatusException(String message, int statusCode, String url) {
13 |         super(message);
14 |         this.statusCode = statusCode;
15 |         this.url = url;
16 |     }
17 | 
18 |     public int getStatusCode() {
19 |         return statusCode;
20 |     }
21 | 
22 |     public String getUrl() {
23 |         return url;
24 |     }
25 | 
26 |     @Override
27 |     public String toString() {
28 |         return super.toString() + ". Status=" + statusCode + ", URL=" + url;
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/SerializationException.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup;
 2 | 
 3 | /**
 4 |  * Unchecked exception raised when serializing a DOM element fails. It usually wraps an
 5 |  * {@link java.io.IOException}, which may be thrown when the output stream is inaccessible.
 6 |  */
 7 | public final class SerializationException extends RuntimeException {
 8 |     /**
 9 |      * Creates a serialization exception carrying neither a message nor a cause.
10 |      */
11 |     public SerializationException() {
12 |         // nothing to pass on: the superclass no-arg constructor runs implicitly
13 |     }
14 | 
15 |     /**
16 |      * Creates a serialization exception with a message but no cause.
17 |      *
18 |      * @param message the error message (may be <code>null</code>)
19 |      */
20 |     public SerializationException(String message) {
21 |         super(message);
22 |     }
23 | 
24 |     /**
25 |      * Creates a serialization exception around a cause. The message becomes
26 |      * <code>(cause==null ? null : cause.toString())</code>, which typically holds the class and message of
27 |      * <code>cause</code>.
28 |      *
29 |      * @param cause the underlying cause (may be <code>null</code>)
30 |      */
31 |     public SerializationException(Throwable cause) {
32 |         super(cause);
33 |     }
34 | 
35 |     /**
36 |      * Creates a serialization exception with both a message and a cause.
37 |      *
38 |      * @param message the error message
39 |      * @param cause   the underlying cause
40 |      */
41 |     public SerializationException(String message, Throwable cause) {
42 |         super(message, cause);
43 |     }
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/UnsupportedMimeTypeException.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | /**
 6 |  * Thrown to signal that an HTTP response came back with a mime type that is not supported.
 7 |  * Carries the offending mime type and the URL alongside the usual exception message.
 8 |  */
 9 | public class UnsupportedMimeTypeException extends IOException {
10 |     private String mimeType;
11 |     private String url;
12 | 
13 |     /**
14 |      * Builds the exception from a message plus the details of the rejected response.
15 |      *
16 |      * @param message  description handed to the {@link IOException} superclass
17 |      * @param mimeType mime type later reported by {@link #getMimeType()}
18 |      * @param url      URL later reported by {@link #getUrl()}
19 |      */
20 |     public UnsupportedMimeTypeException(String message, String mimeType, String url) {
21 |         super(message);
22 |         this.url = url;
23 |         this.mimeType = mimeType;
24 |     }
25 | 
26 |     /**
27 |      * @return the URL supplied at construction
28 |      */
29 |     public String getUrl() {
30 |         return this.url;
31 |     }
32 | 
33 |     /**
34 |      * @return the mime type supplied at construction
35 |      */
36 |     public String getMimeType() {
37 |         return this.mimeType;
38 |     }
39 | 
40 |     /**
41 |      * @return the inherited description followed by the mime type and URL
42 |      */
43 |     @Override
44 |     public String toString() {
45 |         return String.format("%s. Mimetype=%s, URL=%s", super.toString(), mimeType, url);
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/examples/HtmlToPlainText.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.examples;
  2 | 
  3 | import org.jsoup.Jsoup;
  4 | import org.jsoup.helper.StringUtil;
  5 | import org.jsoup.helper.Validate;
  6 | import org.jsoup.nodes.Document;
  7 | import org.jsoup.nodes.Element;
  8 | import org.jsoup.nodes.Node;
  9 | import org.jsoup.nodes.TextNode;
 10 | import org.jsoup.select.Elements;
 11 | import org.jsoup.select.NodeTraversor;
 12 | import org.jsoup.select.NodeVisitor;
 13 | 
 14 | import java.io.IOException;
 15 | 
 16 | /**
 17 |  * HTML to plain-text. This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted
 18 |  * plain-text. That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a
 19 |  * scrape.
 20 |  * <p>
 21 |  * Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend.
 22 |  * </p>
 23 |  * <p>
 24 |  * To invoke from the command line, assuming you've downloaded the jsoup jar to your current directory:</p>
 25 |  * <p><code>java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]</code></p>
 26 |  * where <i>url</i> is the URL to fetch, and <i>selector</i> is an optional CSS selector.
 27 |  * When a selector is given, each matching element is formatted and printed separately.
 28 |  * @author Jonathan Hedley, jonathan@hedley.net
 29 |  */
 30 | public class HtmlToPlainText {
 31 |     private static final String userAgent = "Mozilla/5.0 (jsoup)"; // passed to .userAgent(...) when fetching in main
 32 |     private static final int timeout = 5 * 1000; // passed to .timeout(...) in main; presumably milliseconds - TODO confirm
 33 |     // Entry point: fetches args[0] and prints it as plain text; args[1], if given, is a CSS selector limiting the output.
 34 |     public static void main(String... args) throws IOException {
 35 |         Validate.isTrue(args.length == 1 || args.length == 2, "usage: java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]");
 36 |         final String url = args[0];
 37 |         final String selector = args.length == 2 ? args[1] : null;
 38 | 
 39 |         // fetch the specified URL and parse to a HTML DOM
 40 |         Document doc = Jsoup.connect(url).userAgent(userAgent).timeout(timeout).get();
 41 | 
 42 |         HtmlToPlainText formatter = new HtmlToPlainText();
 43 | 
 44 |         if (selector != null) {
 45 |             Elements elements = doc.select(selector); // get each element that matches the CSS selector
 46 |             for (Element element : elements) {
 47 |                 String plainText = formatter.getPlainText(element); // format that element to plain text
 48 |                 System.out.println(plainText);
 49 |             }
 50 |         } else { // format the whole doc
 51 |             String plainText = formatter.getPlainText(doc);
 52 |             System.out.println(plainText);
 53 |         }
 54 |     }
 55 | 
 56 |     /**
 57 |      * Format an Element (and everything beneath it) to plain-text by walking it with a FormattingVisitor.
 58 |      * @param element the root element to format
 59 |      * @return formatted text
 60 |      */
 61 |     public String getPlainText(Element element) {
 62 |         FormattingVisitor formatter = new FormattingVisitor();
 63 |         NodeTraversor traversor = new NodeTraversor(formatter);
 64 |         traversor.traverse(element); // walk the DOM, and call .head() and .tail() for each node
 65 | 
 66 |         return formatter.toString();
 67 |     }
 68 | 
 69 |     // the formatting rules, implemented in a depth-first DOM traverse (head() on entry, tail() once children are done)
 70 |     private class FormattingVisitor implements NodeVisitor {
 71 |         private static final int maxWidth = 80; // column limit that append() wraps at
 72 |         private int width = 0; // characters counted since the last wrap or newline reset
 73 |         private StringBuilder accum = new StringBuilder(); // holds the accumulated text
 74 | 
 75 |         // hit when the node is first seen: emits text content, or a prefix for list items, terms and block starts
 76 |         public void head(Node node, int depth) {
 77 |             String name = node.nodeName();
 78 |             if (node instanceof TextNode)
 79 |                 append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.
 80 |             else if (name.equals("li"))
 81 |                 append("\n * ");
 82 |             else if (name.equals("dt"))
 83 |                 append("  ");
 84 |             else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr"))
 85 |                 append("\n");
 86 |         }
 87 | 
 88 |         // hit when all of the node's children (if any) have been visited: closes blocks, and shows link targets
 89 |         public void tail(Node node, int depth) {
 90 |             String name = node.nodeName();
 91 |             if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5"))
 92 |                 append("\n");
 93 |             else if (name.equals("a"))
 94 |                 append(String.format(" <%s>", node.absUrl("href"))); // the node's absUrl("href"), in angle brackets
 95 |         }
 96 | 
 97 |         // appends text to the string builder with a simple word wrap method (wraps once width would pass maxWidth)
 98 |         private void append(String text) {
 99 |             if (text.startsWith("\n"))
100 |                 width = 0; // reset counter if starts with a newline. only from formats above, not in natural text
101 |             if (text.equals(" ") &&
102 |                     (accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n")))
103 |                 return; // don't accumulate long runs of empty spaces
104 | 
105 |             if (text.length() + width > maxWidth) { // won't fit, needs to wrap
106 |                 String words[] = text.split("\\s+");
107 |                 for (int i = 0; i < words.length; i++) {
108 |                     String word = words[i];
109 |                     boolean last = i == words.length - 1;
110 |                     if (!last) // insert a space if not the last word
111 |                         word = word + " ";
112 |                     if (word.length() + width > maxWidth) { // wrap and reset counter
113 |                         accum.append("\n").append(word);
114 |                         width = word.length();
115 |                     } else {
116 |                         accum.append(word);
117 |                         width += word.length();
118 |                     }
119 |                 }
120 |             } else { // fits as is, without need to wrap text
121 |                 accum.append(text);
122 |                 width += text.length();
123 |             }
124 |         }
125 | 
126 |         @Override
127 |         public String toString() {
128 |             return accum.toString(); // everything appended so far, as one string
129 |         }
130 |     }
131 | }
132 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/examples/ListLinks.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.examples;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.jsoup.helper.Validate;
 5 | import org.jsoup.nodes.Document;
 6 | import org.jsoup.nodes.Element;
 7 | import org.jsoup.select.Elements;
 8 | 
 9 | import java.io.IOException;
10 | 
11 | /**
12 |  * Example program that fetches a URL and lists the media, imports and links found in it.
13 |  */
14 | public class ListLinks {
15 |     /**
16 |      * Fetches the URL given as the single argument and prints a report of its media, imports and links.
17 |      *
18 |      * @param args exactly one element: the URL to fetch
19 |      * @throws IOException declared for the fetch performed via {@code Jsoup.connect(url).get()}
20 |      */
21 |     public static void main(String[] args) throws IOException {
22 |         Validate.isTrue(args.length == 1, "usage: supply url to fetch");
23 |         final String target = args[0];
24 |         print("Fetching %s...", target);
25 | 
26 |         Document page = Jsoup.connect(target).get();
27 |         Elements anchors = page.select("a[href]");
28 |         Elements sources = page.select("[src]");
29 |         Elements linkTags = page.select("link[href]");
30 | 
31 |         // anything carrying a src attribute; images get their dimensions and alt text as well
32 |         print("\nMedia: (%d)", sources.size());
33 |         for (Element item : sources) {
34 |             String tag = item.tagName();
35 |             String address = item.attr("abs:src");
36 |             if (!tag.equals("img")) {
37 |                 print(" * %s: <%s>", tag, address);
38 |                 continue;
39 |             }
40 |             print(" * %s: <%s> %sx%s (%s)",
41 |                     tag, address, item.attr("width"), item.attr("height"),
42 |                     trim(item.attr("alt"), 20));
43 |         }
44 | 
45 |         // link elements with an href, shown with their rel attribute
46 |         print("\nImports: (%d)", linkTags.size());
47 |         for (Element item : linkTags) {
48 |             print(" * %s <%s> (%s)", item.tagName(), item.attr("abs:href"), item.attr("rel"));
49 |         }
50 | 
51 |         // anchors with an href, shown with their shortened text
52 |         print("\nLinks: (%d)", anchors.size());
53 |         for (Element anchor : anchors) {
54 |             print(" * a: <%s>  (%s)", anchor.attr("abs:href"), trim(anchor.text(), 35));
55 |         }
56 |     }
57 | 
58 |     // formats msg with args and writes it to standard output as one line
59 |     private static void print(String msg, Object... args) {
60 |         String line = String.format(msg, args);
61 |         System.out.println(line);
62 |     }
63 | 
64 |     // returns s unchanged when it fits in width; otherwise its first width-1 characters followed by "."
65 |     private static String trim(String s, int width) {
66 |         if (s.length() <= width)
67 |             return s;
68 |         return s.substring(0, width - 1) + ".";
69 |     }
70 | }
71 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/examples/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  Contains example programs and use of jsoup. See the <a href="https://jsoup.org/cookbook/">jsoup cookbook</a>.
3 |  */
4 | package org.jsoup.examples;


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/helper/DescendableLinkedList.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.helper;
 2 | 
 3 | import java.util.Iterator;
 4 | import java.util.LinkedList;
 5 | import java.util.ListIterator;
 6 | 
 7 | /**
 8 |  * Provides a descending iterator and other 1.6 methods to allow support on the 1.5 JRE.
 9 |  * @param <E> Type of elements
10 |  */
11 | public class DescendableLinkedList<E> extends LinkedList<E> {
12 | 
13 |     /**
14 |      * Create a new DescendableLinkedList.
15 |      */
16 |     public DescendableLinkedList() {
17 |         super();
18 |     }
19 | 
20 |     /**
21 |      * Add a new element to the start of the list.
22 |      * @param e element to add
23 |      */
24 |     public void push(E e) {
25 |         addFirst(e);
26 |     }
27 | 
28 |     /**
29 |      * Look at the last element, if there is one.
30 |      * @return the last element, or null
31 |      */
32 |     public E peekLast() {
33 |         return size() == 0 ? null : getLast();
34 |     }
35 | 
36 |     /**
37 |      * Remove and return the last element, if there is one
38 |      * @return the last element, or null
39 |      */
40 |     public E pollLast() {
41 |         return size() == 0 ? null : removeLast();
42 |     }
43 | 
44 |     /**
45 |      * Get an iterator that starts and the end of the list and works towards the start.
46 |      * @return an iterator that starts and the end of the list and works towards the start.
47 |      */
48 |     public Iterator<E> descendingIterator() {
49 |         return new DescendingIterator<E>(size());
50 |     }
51 | 
52 |     private class DescendingIterator<E> implements Iterator<E> {
53 |         private final ListIterator<E> iter;
54 | 
55 |         @SuppressWarnings("unchecked")
56 |         private DescendingIterator(int index) {
57 |             iter = (ListIterator<E>) listIterator(index);
58 |         }
59 | 
60 |         /**
61 |          * Check if there is another element on the list.
62 |          * @return if another element
63 |          */
64 |         public boolean hasNext() {
65 |             return iter.hasPrevious();
66 |         }
67 | 
68 |         /**
69 |          * Get the next element.
70 |          * @return the next element.
71 |          */
72 |         public E next() {
73 |             return iter.previous();
74 |         }
75 | 
76 |         /**
77 |          * Remove the current element.
78 |          */
79 |         public void remove() {
80 |             iter.remove();
81 |         }
82 |     }
83 | }
84 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/helper/StringUtil.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.helper;
  2 | 
  3 | import java.net.MalformedURLException;
  4 | import java.net.URL;
  5 | import java.util.Arrays;
  6 | import java.util.Collection;
  7 | import java.util.Iterator;
  8 | 
  9 | /**
 10 |  * A minimal String utility class. Designed for internal jsoup use only.
 11 |  */
 12 | public final class StringUtil {
 13 |     // memoised padding up to 10
 14 |     private static final String[] padding = {"", " ", "  ", "   ", "    ", "     ", "      ", "       ", "        ", "         ", "          "};
 15 | 
 16 |     /**
 17 |      * Join a collection of strings by a separator
 18 |      * @param strings collection of string objects
 19 |      * @param sep string to place between strings
 20 |      * @return joined string
 21 |      */
 22 |     public static String join(Collection strings, String sep) {
 23 |         return join(strings.iterator(), sep);
 24 |     }
 25 | 
 26 |     /**
 27 |      * Join a collection of strings by a separator
 28 |      * @param strings iterator of string objects
 29 |      * @param sep string to place between strings
 30 |      * @return joined string
 31 |      */
 32 |     public static String join(Iterator strings, String sep) {
 33 |         if (!strings.hasNext())
 34 |             return "";
 35 | 
 36 |         String start = strings.next().toString();
 37 |         if (!strings.hasNext()) // only one, avoid builder
 38 |             return start;
 39 | 
 40 |         StringBuilder sb = new StringBuilder(64).append(start);
 41 |         while (strings.hasNext()) {
 42 |             sb.append(sep);
 43 |             sb.append(strings.next());
 44 |         }
 45 |         return sb.toString();
 46 |     }
 47 | 
 48 |     /**
 49 |      * Returns space padding
 50 |      * @param width amount of padding desired
 51 |      * @return string of spaces * width
 52 |      */
 53 |     public static String padding(int width) {
 54 |         if (width < 0)
 55 |             throw new IllegalArgumentException("width must be > 0");
 56 | 
 57 |         if (width < padding.length)
 58 |             return padding[width];
 59 | 
 60 |         char[] out = new char[width];
 61 |         for (int i = 0; i < width; i++)
 62 |             out[i] = ' ';
 63 |         return String.valueOf(out);
 64 |     }
 65 | 
 66 |     /**
 67 |      * Tests if a string is blank: null, empty, or only whitespace (" ", \r\n, \t, etc)
 68 |      * @param string string to test
 69 |      * @return if string is blank
 70 |      */
 71 |     public static boolean isBlank(String string) {
 72 |         if (string == null || string.length() == 0)
 73 |             return true;
 74 | 
 75 |         int l = string.length();
 76 |         for (int i = 0; i < l; i++) {
 77 |             if (!StringUtil.isWhitespace(string.codePointAt(i)))
 78 |                 return false;
 79 |         }
 80 |         return true;
 81 |     }
 82 | 
 83 |     /**
 84 |      * Tests if a string is numeric, i.e. contains only digit characters
 85 |      * @param string string to test
 86 |      * @return true if only digit chars, false if empty or null or contains non-digit chars
 87 |      */
 88 |     public static boolean isNumeric(String string) {
 89 |         if (string == null || string.length() == 0)
 90 |             return false;
 91 | 
 92 |         int l = string.length();
 93 |         for (int i = 0; i < l; i++) {
 94 |             if (!Character.isDigit(string.codePointAt(i)))
 95 |                 return false;
 96 |         }
 97 |         return true;
 98 |     }
 99 | 
100 |     /**
101 |      * Tests if a code point is "whitespace" as defined in the HTML spec.
102 |      * @param c code point to test
103 |      * @return true if code point is whitespace, false otherwise
104 |      */
105 |     public static boolean isWhitespace(int c){
106 |         return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r';
107 |     }
108 | 
109 |     /**
110 |      * Normalise the whitespace within this string; multiple spaces collapse to a single, and all whitespace characters
111 |      * (e.g. newline, tab) convert to a simple space
112 |      * @param string content to normalise
113 |      * @return normalised string
114 |      */
115 |     public static String normaliseWhitespace(String string) {
116 |         StringBuilder sb = new StringBuilder(string.length());
117 |         appendNormalisedWhitespace(sb, string, false);
118 |         return sb.toString();
119 |     }
120 | 
121 |     /**
122 |      * After normalizing the whitespace within a string, appends it to a string builder.
123 |      * @param accum builder to append to
124 |      * @param string string to normalize whitespace within
125 |      * @param stripLeading set to true if you wish to remove any leading whitespace
126 |      */
127 |     public static void appendNormalisedWhitespace(StringBuilder accum, String string, boolean stripLeading) {
128 |         boolean lastWasWhite = false;
129 |         boolean reachedNonWhite = false;
130 | 
131 |         int len = string.length();
132 |         int c;
133 |         for (int i = 0; i < len; i+= Character.charCount(c)) {
134 |             c = string.codePointAt(i);
135 |             if (isWhitespace(c)) {
136 |                 if ((stripLeading && !reachedNonWhite) || lastWasWhite)
137 |                     continue;
138 |                 accum.append(' ');
139 |                 lastWasWhite = true;
140 |             }
141 |             else {
142 |                 accum.appendCodePoint(c);
143 |                 lastWasWhite = false;
144 |                 reachedNonWhite = true;
145 |             }
146 |         }
147 |     }
148 | 
149 |     public static boolean in(String needle, String... haystack) {
150 |         for (String hay : haystack) {
151 |             if (hay.equals(needle))
152 |             return true;
153 |         }
154 |         return false;
155 |     }
156 | 
157 |     public static boolean inSorted(String needle, String[] haystack) {
158 |         return Arrays.binarySearch(haystack, needle) >= 0;
159 |     }
160 | 
161 |     /**
162 |      * Create a new absolute URL, from a provided existing absolute URL and a relative URL component.
163 |      * @param base the existing absolute base URL
164 |      * @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned)
165 |      * @return the resolved absolute URL
166 |      * @throws MalformedURLException if an error occurred generating the URL
167 |      */
168 |     public static URL resolve(URL base, String relUrl) throws MalformedURLException {
169 |         // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
170 |         if (relUrl.startsWith("?"))
171 |             relUrl = base.getPath() + relUrl;
172 |         // workaround: //example.com + ./foo = //example.com/./foo, not //example.com/foo
173 |         if (relUrl.indexOf('.') == 0 && base.getFile().indexOf('/') != 0) {
174 |             base = new URL(base.getProtocol(), base.getHost(), base.getPort(), "/" + base.getFile());
175 |         }
176 |         return new URL(base, relUrl);
177 |     }
178 | 
179 |     /**
180 |      * Create a new absolute URL, from a provided existing absolute URL and a relative URL component.
181 |      * @param baseUrl the existing absolute base URL
182 |      * @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned)
183 |      * @return an absolute URL if one was able to be generated, or the empty string if not
184 |      */
185 |     public static String resolve(final String baseUrl, final String relUrl) {
186 |         URL base;
187 |         try {
188 |             try {
189 |                 base = new URL(baseUrl);
190 |             } catch (MalformedURLException e) {
191 |                 // the base is unsuitable, but the attribute/rel may be abs on its own, so try that
192 |                 URL abs = new URL(relUrl);
193 |                 return abs.toExternalForm();
194 |             }
195 |             return resolve(base, relUrl).toExternalForm();
196 |         } catch (MalformedURLException e) {
197 |             return "";
198 |         }
199 | 
200 |     }
201 | }
202 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/helper/Validate.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.helper;
  2 | 
  3 | /**
  4 |  * Simple validation methods. Designed for jsoup internal use
  5 |  */
  6 | public final class Validate {
  7 |     
  8 |     private Validate() {}
  9 | 
 10 |     /**
 11 |      * Validates that the object is not null
 12 |      * @param obj object to test
 13 |      */
 14 |     public static void notNull(Object obj) {
 15 |         if (obj == null)
 16 |             throw new IllegalArgumentException("Object must not be null");
 17 |     }
 18 | 
 19 |     /**
 20 |      * Validates that the object is not null
 21 |      * @param obj object to test
 22 |      * @param msg message to output if validation fails
 23 |      */
 24 |     public static void notNull(Object obj, String msg) {
 25 |         if (obj == null)
 26 |             throw new IllegalArgumentException(msg);
 27 |     }
 28 | 
 29 |     /**
 30 |      * Validates that the value is true
 31 |      * @param val object to test
 32 |      */
 33 |     public static void isTrue(boolean val) {
 34 |         if (!val)
 35 |             throw new IllegalArgumentException("Must be true");
 36 |     }
 37 | 
 38 |     /**
 39 |      * Validates that the value is true
 40 |      * @param val object to test
 41 |      * @param msg message to output if validation fails
 42 |      */
 43 |     public static void isTrue(boolean val, String msg) {
 44 |         if (!val)
 45 |             throw new IllegalArgumentException(msg);
 46 |     }
 47 | 
 48 |     /**
 49 |      * Validates that the value is false
 50 |      * @param val object to test
 51 |      */
 52 |     public static void isFalse(boolean val) {
 53 |         if (val)
 54 |             throw new IllegalArgumentException("Must be false");
 55 |     }
 56 | 
 57 |     /**
 58 |      * Validates that the value is false
 59 |      * @param val object to test
 60 |      * @param msg message to output if validation fails
 61 |      */
 62 |     public static void isFalse(boolean val, String msg) {
 63 |         if (val)
 64 |             throw new IllegalArgumentException(msg);
 65 |     }
 66 | 
 67 |     /**
 68 |      * Validates that the array contains no null elements
 69 |      * @param objects the array to test
 70 |      */
 71 |     public static void noNullElements(Object[] objects) {
 72 |         noNullElements(objects, "Array must not contain any null objects");
 73 |     }
 74 | 
 75 |     /**
 76 |      * Validates that the array contains no null elements
 77 |      * @param objects the array to test
 78 |      * @param msg message to output if validation fails
 79 |      */
 80 |     public static void noNullElements(Object[] objects, String msg) {
 81 |         for (Object obj : objects)
 82 |             if (obj == null)
 83 |                 throw new IllegalArgumentException(msg);
 84 |     }
 85 | 
 86 |     /**
 87 |      * Validates that the string is not empty
 88 |      * @param string the string to test
 89 |      */
 90 |     public static void notEmpty(String string) {
 91 |         if (string == null || string.length() == 0)
 92 |             throw new IllegalArgumentException("String must not be empty");
 93 |     }
 94 | 
 95 |     /**
 96 |      * Validates that the string is not empty
 97 |      * @param string the string to test
 98 |      * @param msg message to output if validation fails
 99 |      */
100 |     public static void notEmpty(String string, String msg) {
101 |         if (string == null || string.length() == 0)
102 |             throw new IllegalArgumentException(msg);
103 |     }
104 | 
105 |     /**
106 |      Cause a failure.
107 |      @param msg message to output.
108 |      */
109 |     public static void fail(String msg) {
110 |         throw new IllegalArgumentException(msg);
111 |     }
112 | }
113 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/helper/W3CDom.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.helper;
  2 | 
  3 | import org.jsoup.nodes.Attribute;
  4 | import org.jsoup.nodes.Attributes;
  5 | import org.jsoup.select.NodeTraversor;
  6 | import org.jsoup.select.NodeVisitor;
  7 | import org.w3c.dom.Comment;
  8 | import org.w3c.dom.Document;
  9 | import org.w3c.dom.Element;
 10 | import org.w3c.dom.Text;
 11 | 
 12 | import javax.xml.parsers.DocumentBuilder;
 13 | import javax.xml.parsers.DocumentBuilderFactory;
 14 | import javax.xml.parsers.ParserConfigurationException;
 15 | import javax.xml.transform.Transformer;
 16 | import javax.xml.transform.TransformerException;
 17 | import javax.xml.transform.TransformerFactory;
 18 | import javax.xml.transform.dom.DOMSource;
 19 | import javax.xml.transform.stream.StreamResult;
 20 | import java.io.StringWriter;
 21 | import java.util.HashMap;
 22 | 
 23 | /**
 24 |  * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
 25 |  * for integration with toolsets that use the W3C DOM.
 26 |  */
 27 | public class W3CDom {
 28 |     protected DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
 29 | 
 30 |     /**
 31 |      * Convert a jsoup Document to a W3C Document.
 32 |      * @param in jsoup doc
 33 |      * @return w3c doc
 34 |      */
 35 |     public Document fromJsoup(org.jsoup.nodes.Document in) {
 36 |         Validate.notNull(in);
 37 |         DocumentBuilder builder;
 38 |         try {
 39 |         	//set the factory to be namespace-aware
 40 |         	factory.setNamespaceAware(true);
 41 |             builder = factory.newDocumentBuilder();
 42 |             Document out = builder.newDocument();
 43 |             convert(in, out);
 44 |             return out;
 45 |         } catch (ParserConfigurationException e) {
 46 |             throw new IllegalStateException(e);
 47 |         }
 48 |     }
 49 | 
 50 |     /**
 51 |      * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output document
 52 |      * before converting.
 53 |      * @param in jsoup doc
 54 |      * @param out w3c doc
 55 |      * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Document)
 56 |      */
 57 |     public void convert(org.jsoup.nodes.Document in, Document out) {
 58 |         if (!StringUtil.isBlank(in.location()))
 59 |             out.setDocumentURI(in.location());
 60 | 
 61 |         org.jsoup.nodes.Element rootEl = in.child(0); // skip the #root node
 62 |         NodeTraversor traversor = new NodeTraversor(new W3CBuilder(out));
 63 |         traversor.traverse(rootEl);
 64 |     }
 65 | 
 66 |     /**
 67 |      * Implements the conversion by walking the input.
 68 |      */
 69 |     protected static class W3CBuilder implements NodeVisitor {
 70 |         private static final String xmlnsKey = "xmlns";
 71 |         private static final String xmlnsPrefix = "xmlns:";
 72 | 
 73 |         private final Document doc;
 74 |         private final HashMap<String, String> namespaces = new HashMap<String, String>(); // prefix => urn
 75 |         private Element dest;
 76 | 
 77 |         public W3CBuilder(Document doc) {
 78 |             this.doc = doc;
 79 |         }
 80 | 
 81 |         public void head(org.jsoup.nodes.Node source, int depth) {
 82 |             if (source instanceof org.jsoup.nodes.Element) {
 83 |                 org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
 84 | 
 85 |                 String prefix = updateNamespaces(sourceEl);
 86 |                 String namespace = namespaces.get(prefix);
 87 | 
 88 |                 Element el = doc.createElementNS(namespace, sourceEl.tagName());
 89 |                 copyAttributes(sourceEl, el);
 90 |                 if (dest == null) { // sets up the root
 91 |                     doc.appendChild(el);
 92 |                 } else {
 93 |                     dest.appendChild(el);
 94 |                 }
 95 |                 dest = el; // descend
 96 |             } else if (source instanceof org.jsoup.nodes.TextNode) {
 97 |                 org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
 98 |                 Text text = doc.createTextNode(sourceText.getWholeText());
 99 |                 dest.appendChild(text);
100 |             } else if (source instanceof org.jsoup.nodes.Comment) {
101 |                 org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
102 |                 Comment comment = doc.createComment(sourceComment.getData());
103 |                 dest.appendChild(comment);
104 |             } else if (source instanceof org.jsoup.nodes.DataNode) {
105 |                 org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
106 |                 Text node = doc.createTextNode(sourceData.getWholeData());
107 |                 dest.appendChild(node);
108 |             } else {
109 |                 // unhandled
110 |             }
111 |         }
112 | 
113 |         public void tail(org.jsoup.nodes.Node source, int depth) {
114 |             if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) {
115 |                 dest = (Element) dest.getParentNode(); // undescend. cromulent.
116 |             }
117 |         }
118 | 
119 |         private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
120 |             for (Attribute attribute : source.attributes()) {
121 |                 // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.]
122 |                 String key = attribute.getKey().replaceAll("[^-a-zA-Z0-9_:.]", "");
123 |                 if (key.matches("[a-zA-Z_:]{1}[-a-zA-Z0-9_:.]*"))
124 |                     el.setAttribute(key, attribute.getValue());
125 |             }
126 |         }
127 | 
128 |         /**
129 |          * Finds any namespaces defined in this element. Returns any tag prefix.
130 |          */
131 |         private String updateNamespaces(org.jsoup.nodes.Element el) {
132 |             // scan the element for namespace declarations
133 |             // like: xmlns="blah" or xmlns:prefix="blah"
134 |             Attributes attributes = el.attributes();
135 |             for (Attribute attr : attributes) {
136 |                 String key = attr.getKey();
137 |                 String prefix;
138 |                 if (key.equals(xmlnsKey)) {
139 |                     prefix = "";
140 |                 } else if (key.startsWith(xmlnsPrefix)) {
141 |                     prefix = key.substring(xmlnsPrefix.length());
142 |                 } else {
143 |                     continue;
144 |                 }
145 |                 namespaces.put(prefix, attr.getValue());
146 |             }
147 | 
148 |             // get the element prefix if any
149 |             int pos = el.tagName().indexOf(":");
150 |             return pos > 0 ? el.tagName().substring(0, pos) : "";
151 |         }
152 | 
153 |     }
154 | 
155 |     /**
156 |      * Serialize a W3C document to a String.
157 |      * @param doc Document
158 |      * @return Document as string
159 |      */
160 |     public String asString(Document doc) {
161 |         try {
162 |             DOMSource domSource = new DOMSource(doc);
163 |             StringWriter writer = new StringWriter();
164 |             StreamResult result = new StreamResult(writer);
165 |             TransformerFactory tf = TransformerFactory.newInstance();
166 |             Transformer transformer = tf.newTransformer();
167 |             transformer.transform(domSource, result);
168 |             return writer.toString();
169 |         } catch (TransformerException e) {
170 |             throw new IllegalStateException(e);
171 |         }
172 |     }
173 | }
174 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/Attribute.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.nodes;
  2 | 
  3 | import org.jsoup.SerializationException;
  4 | import org.jsoup.helper.Validate;
  5 | 
  6 | import java.io.IOException;
  7 | import java.util.Arrays;
  8 | import java.util.Map;
  9 | 
 10 | /**
 11 |  A single key + value attribute. Keys are trimmed and normalised to lower-case.
 12 | 
 13 |  @author Jonathan Hedley, jonathan@hedley.net */
 14 | public class Attribute implements Map.Entry<String, String>, Cloneable  {
 15 |     private static final String[] booleanAttributes = {
 16 |             "allowfullscreen", "async", "autofocus", "checked", "compact", "declare", "default", "defer", "disabled",
 17 |             "formnovalidate", "hidden", "inert", "ismap", "itemscope", "multiple", "muted", "nohref", "noresize",
 18 |             "noshade", "novalidate", "nowrap", "open", "readonly", "required", "reversed", "seamless", "selected",
 19 |             "sortable", "truespeed", "typemustmatch"
 20 |     };
 21 | 
 22 |     private String key;
 23 |     private String value;
 24 | 
 25 |     /**
 26 |      * Create a new attribute from unencoded (raw) key and value.
 27 |      * @param key attribute key; case is preserved.
 28 |      * @param value attribute value
 29 |      * @see #createFromEncoded
 30 |      */
 31 |     public Attribute(String key, String value) {
 32 |         Validate.notNull(key);
 33 |         Validate.notNull(value);
 34 |         this.key = key.trim();
 35 |         Validate.notEmpty(key); // trimming could potentially make empty, so validate here
 36 |         this.value = value;
 37 |     }
 38 | 
 39 |     /**
 40 |      Get the attribute key.
 41 |      @return the attribute key
 42 |      */
 43 |     public String getKey() {
 44 |         return key;
 45 |     }
 46 | 
 47 |     /**
 48 |      Set the attribute key; case is preserved.
 49 |      @param key the new key; must not be null
 50 |      */
 51 |     public void setKey(String key) {
 52 |         Validate.notEmpty(key);
 53 |         this.key = key.trim();
 54 |     }
 55 | 
 56 |     /**
 57 |      Get the attribute value.
 58 |      @return the attribute value
 59 |      */
 60 |     public String getValue() {
 61 |         return value;
 62 |     }
 63 | 
 64 |     /**
 65 |      Set the attribute value.
 66 |      @param value the new attribute value; must not be null
 67 |      */
 68 |     public String setValue(String value) {
 69 |         Validate.notNull(value);
 70 |         String old = this.value;
 71 |         this.value = value;
 72 |         return old;
 73 |     }
 74 | 
 75 |     /**
 76 |      Get the HTML representation of this attribute; e.g. {@code href="index.html"}.
 77 |      @return HTML
 78 |      */
 79 |     public String html() {
 80 |         StringBuilder accum = new StringBuilder();
 81 |         
 82 |         try {
 83 |         	html(accum, (new Document("")).outputSettings());
 84 |         } catch(IOException exception) {
 85 |         	throw new SerializationException(exception);
 86 |         }
 87 |         return accum.toString();
 88 |     }
 89 |     
 90 |     protected void html(Appendable accum, Document.OutputSettings out) throws IOException {
 91 |         accum.append(key);
 92 |         if (!shouldCollapseAttribute(out)) {
 93 |             accum.append("=\"");
 94 |             Entities.escape(accum, value, out, true, false, false);
 95 |             accum.append('"');
 96 |         }
 97 |     }
 98 | 
 99 |     /**
100 |      Get the string representation of this attribute, implemented as {@link #html()}.
101 |      @return string
102 |      */
103 |     @Override
104 |     public String toString() {
105 |         return html();
106 |     }
107 | 
108 |     /**
109 |      * Create a new Attribute from an unencoded key and a HTML attribute encoded value.
110 |      * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars.
111 |      * @param encodedValue HTML attribute encoded value
112 |      * @return attribute
113 |      */
114 |     public static Attribute createFromEncoded(String unencodedKey, String encodedValue) {
115 |         String value = Entities.unescape(encodedValue, true);
116 |         return new Attribute(unencodedKey, value);
117 |     }
118 | 
119 |     protected boolean isDataAttribute() {
120 |         return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length();
121 |     }
122 | 
123 |     /**
124 |      * Collapsible if it's a boolean attribute and value is empty or same as name
125 |      * 
126 |      * @param out output settings
127 |      * @return  Returns whether collapsible or not
128 |      */
129 |     protected final boolean shouldCollapseAttribute(Document.OutputSettings out) {
130 |         return ("".equals(value) || value.equalsIgnoreCase(key))
131 |                 && out.syntax() == Document.OutputSettings.Syntax.html
132 |                 && isBooleanAttribute();
133 |     }
134 | 
135 |     protected boolean isBooleanAttribute() {
136 |         return Arrays.binarySearch(booleanAttributes, key) >= 0;
137 |     }
138 | 
139 |     @Override
140 |     public boolean equals(Object o) {
141 |         if (this == o) return true;
142 |         if (!(o instanceof Attribute)) return false;
143 | 
144 |         Attribute attribute = (Attribute) o;
145 | 
146 |         if (key != null ? !key.equals(attribute.key) : attribute.key != null) return false;
147 |         return !(value != null ? !value.equals(attribute.value) : attribute.value != null);
148 |     }
149 | 
150 |     @Override
151 |     public int hashCode() {
152 |         int result = key != null ? key.hashCode() : 0;
153 |         result = 31 * result + (value != null ? value.hashCode() : 0);
154 |         return result;
155 |     }
156 | 
157 |     @Override
158 |     public Attribute clone() {
159 |         try {
160 |             return (Attribute) super.clone(); // only fields are immutable strings key and value, so no more deep copy required
161 |         } catch (CloneNotSupportedException e) {
162 |             throw new RuntimeException(e);
163 |         }
164 |     }
165 | }
166 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/BooleanAttribute.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | /**
 4 |  * A boolean attribute that is written out without any value.
 5 |  */
 6 | public class BooleanAttribute extends Attribute {
 7 |     /**
 8 |      * Create a new boolean attribute from unencoded (raw) key.
 9 |      * @param key attribute key
10 |      */
11 |     public BooleanAttribute(String key) {
12 |         super(key, "");
13 |     }
14 | 
15 |     @Override
16 |     protected boolean isBooleanAttribute() {
17 |         return true;
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/Comment.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | /**
 6 |  A comment node.
 7 | 
 8 |  @author Jonathan Hedley, jonathan@hedley.net */
 9 | public class Comment extends Node {
10 |     private static final String COMMENT_KEY = "comment";
11 | 
12 |     /**
13 |      Create a new comment node.
14 |      @param data The contents of the comment
15 |      @param baseUri base URI
16 |      */
17 |     public Comment(String data, String baseUri) {
18 |         super(baseUri);
19 |         attributes.put(COMMENT_KEY, data);
20 |     }
21 | 
22 |     public String nodeName() {
23 |         return "#comment";
24 |     }
25 | 
26 |     /**
27 |      Get the contents of the comment.
28 |      @return comment content
29 |      */
30 |     public String getData() {
31 |         return attributes.get(COMMENT_KEY);
32 |     }
33 | 
34 | 	void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
35 |         if (out.prettyPrint())
36 |             indent(accum, depth, out);
37 |         accum
38 |                 .append("<!--")
39 |                 .append(getData())
40 |                 .append("-->");
41 |     }
42 | 
43 | 	void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {}
44 | 
45 |     @Override
46 |     public String toString() {
47 |         return outerHtml();
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/DataNode.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | /**
 6 |  A data node, for contents of style, script tags etc, where contents should not show in text().
 7 | 
 8 |  @author Jonathan Hedley, jonathan@hedley.net */
 9 | public class DataNode extends Node{
10 |     private static final String DATA_KEY = "data";
11 | 
12 |     /**
13 |      Create a new DataNode.
14 |      @param data data contents
15 |      @param baseUri base URI
16 |      */
17 |     public DataNode(String data, String baseUri) {
18 |         super(baseUri);
19 |         attributes.put(DATA_KEY, data);
20 |     }
21 | 
22 |     public String nodeName() {
23 |         return "#data";
24 |     }
25 | 
26 |     /**
27 |      Get the data contents of this node. Will be unescaped and with original new lines, space etc.
28 |      @return data
29 |      */
30 |     public String getWholeData() {
31 |         return attributes.get(DATA_KEY);
32 |     }
33 | 
34 |     /**
35 |      * Set the data contents of this node.
36 |      * @param data unencoded data
37 |      * @return this node, for chaining
38 |      */
39 |     public DataNode setWholeData(String data) {
40 |         attributes.put(DATA_KEY, data);
41 |         return this;
42 |     }
43 | 
44 | 	void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
45 |         accum.append(getWholeData()); // data is not escaped in return from data nodes, so " in script, style is plain
46 |     }
47 | 
48 | 	void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {}
49 | 
50 |     @Override
51 |     public String toString() {
52 |         return outerHtml();
53 |     }
54 | 
55 |     /**
56 |      Create a new DataNode from HTML encoded data.
57 |      @param encodedData encoded data
58 |      @param baseUri bass URI
59 |      @return new DataNode
60 |      */
61 |     public static DataNode createFromEncoded(String encodedData, String baseUri) {
62 |         String data = Entities.unescape(encodedData);
63 |         return new DataNode(data, baseUri);
64 |     }
65 | }
66 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/DocumentType.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.jsoup.helper.StringUtil;
 6 | import org.jsoup.nodes.Document.OutputSettings.*;
 7 | 
 8 | /**
 9 |  * A {@code <!DOCTYPE>} node.
10 |  */
11 | public class DocumentType extends Node {
12 |     public static final String PUBLIC_KEY = "PUBLIC";
13 |     public static final String SYSTEM_KEY = "SYSTEM";
14 |     private static final String NAME = "name";
15 |     private static final String PUB_SYS_KEY = "pubSysKey"; // PUBLIC or SYSTEM
16 |     private static final String PUBLIC_ID = "publicId";
17 |     private static final String SYSTEM_ID = "systemId";
18 |     // todo: quirk mode from publicId and systemId
19 | 
20 |     /**
21 |      * Create a new doctype element.
22 |      * @param name the doctype's name
23 |      * @param publicId the doctype's public ID
24 |      * @param systemId the doctype's system ID
25 |      * @param baseUri the doctype's base URI
26 |      */
27 |     public DocumentType(String name, String publicId, String systemId, String baseUri) {
28 |         super(baseUri);
29 | 
30 |         attr(NAME, name);
31 |         attr(PUBLIC_ID, publicId);
32 |         if (has(PUBLIC_ID)) {
33 |             attr(PUB_SYS_KEY, PUBLIC_KEY);
34 |         }
35 |         attr(SYSTEM_ID, systemId);
36 |     }
37 | 
38 |     /**
39 |      * Create a new doctype element.
40 |      * @param name the doctype's name
41 |      * @param publicId the doctype's public ID
42 |      * @param systemId the doctype's system ID
43 |      * @param baseUri the doctype's base URI
44 |      */
45 |     public DocumentType(String name, String pubSysKey, String publicId, String systemId, String baseUri) {
46 |         super(baseUri);
47 | 
48 |         attr(NAME, name);
49 |         if (pubSysKey != null) {
50 |             attr(PUB_SYS_KEY, pubSysKey);
51 |         }
52 |         attr(PUBLIC_ID, publicId);
53 |         attr(SYSTEM_ID, systemId);
54 |     }
55 | 
56 |     @Override
57 |     public String nodeName() {
58 |         return "#doctype";
59 |     }
60 | 
61 |     @Override
62 |     void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
63 |         if (out.syntax() == Syntax.html && !has(PUBLIC_ID) && !has(SYSTEM_ID)) {
64 |             // looks like a html5 doctype, go lowercase for aesthetics
65 |             accum.append("<!doctype");
66 |         } else {
67 |             accum.append("<!DOCTYPE");
68 |         }
69 |         if (has(NAME))
70 |             accum.append(" ").append(attr(NAME));
71 |         if (has(PUB_SYS_KEY))
72 |             accum.append(" ").append(attr(PUB_SYS_KEY));
73 |         if (has(PUBLIC_ID))
74 |             accum.append(" \"").append(attr(PUBLIC_ID)).append('"');
75 |         if (has(SYSTEM_ID))
76 |             accum.append(" \"").append(attr(SYSTEM_ID)).append('"');
77 |         accum.append('>');
78 |     }
79 | 
80 |     @Override
81 |     void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {
82 |     }
83 | 
84 |     private boolean has(final String attribute) {
85 |         return !StringUtil.isBlank(attr(attribute));
86 |     }
87 | }
88 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/FormElement.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.nodes;
  2 | 
  3 | import org.jsoup.Connection;
  4 | import org.jsoup.Jsoup;
  5 | import org.jsoup.helper.HttpConnection;
  6 | import org.jsoup.helper.Validate;
  7 | import org.jsoup.parser.Tag;
  8 | import org.jsoup.select.Elements;
  9 | 
 10 | import java.util.ArrayList;
 11 | import java.util.List;
 12 | 
 13 | /**
 14 |  * A HTML Form Element provides ready access to the form fields/controls that are associated with it. It also allows a
 15 |  * form to easily be submitted.
 16 |  */
 17 | public class FormElement extends Element {
 18 |     private final Elements elements = new Elements();
 19 | 
 20 |     /**
 21 |      * Create a new, standalone form element.
 22 |      *
 23 |      * @param tag        tag of this element
 24 |      * @param baseUri    the base URI
 25 |      * @param attributes initial attributes
 26 |      */
 27 |     public FormElement(Tag tag, String baseUri, Attributes attributes) {
 28 |         super(tag, baseUri, attributes);
 29 |     }
 30 | 
 31 |     /**
 32 |      * Get the list of form control elements associated with this form.
 33 |      * @return form controls associated with this element.
 34 |      */
 35 |     public Elements elements() {
 36 |         return elements;
 37 |     }
 38 | 
 39 |     /**
 40 |      * Add a form control element to this form.
 41 |      * @param element form control to add
 42 |      * @return this form element, for chaining
 43 |      */
 44 |     public FormElement addElement(Element element) {
 45 |         elements.add(element);
 46 |         return this;
 47 |     }
 48 | 
 49 |     /**
 50 |      * Prepare to submit this form. A Connection object is created with the request set up from the form values. You
 51 |      * can then set up other options (like user-agent, timeout, cookies), then execute it.
 52 |      * @return a connection prepared from the values of this form.
 53 |      * @throws IllegalArgumentException if the form's absolute action URL cannot be determined. Make sure you pass the
 54 |      * document's base URI when parsing.
 55 |      */
 56 |     public Connection submit() {
 57 |         String action = hasAttr("action") ? absUrl("action") : baseUri();
 58 |         Validate.notEmpty(action, "Could not determine a form action URL for submit. Ensure you set a base URI when parsing.");
 59 |         Connection.Method method = attr("method").toUpperCase().equals("POST") ?
 60 |                 Connection.Method.POST : Connection.Method.GET;
 61 | 
 62 |         return Jsoup.connect(action)
 63 |                 .data(formData())
 64 |                 .method(method);
 65 |     }
 66 | 
 67 |     /**
 68 |      * Get the data that this form submits. The returned list is a copy of the data, and changes to the contents of the
 69 |      * list will not be reflected in the DOM.
 70 |      * @return a list of key vals
 71 |      */
 72 |     public List<Connection.KeyVal> formData() {
 73 |         ArrayList<Connection.KeyVal> data = new ArrayList<Connection.KeyVal>();
 74 | 
 75 |         // iterate the form control elements and accumulate their values
 76 |         for (Element el: elements) {
 77 |             if (!el.tag().isFormSubmittable()) continue; // contents are form listable, superset of submitable
 78 |             if (el.hasAttr("disabled")) continue; // skip disabled form inputs
 79 |             String name = el.attr("name");
 80 |             if (name.length() == 0) continue;
 81 |             String type = el.attr("type");
 82 | 
 83 |             if ("select".equals(el.tagName())) {
 84 |                 Elements options = el.select("option[selected]");
 85 |                 boolean set = false;
 86 |                 for (Element option: options) {
 87 |                     data.add(HttpConnection.KeyVal.create(name, option.val()));
 88 |                     set = true;
 89 |                 }
 90 |                 if (!set) {
 91 |                     Element option = el.select("option").first();
 92 |                     if (option != null)
 93 |                         data.add(HttpConnection.KeyVal.create(name, option.val()));
 94 |                 }
 95 |             } else if ("checkbox".equalsIgnoreCase(type) || "radio".equalsIgnoreCase(type)) {
 96 |                 // only add checkbox or radio if they have the checked attribute
 97 |                 if (el.hasAttr("checked")) {
 98 |                     final String val = el.val().length() >  0 ? el.val() : "on";
 99 |                     data.add(HttpConnection.KeyVal.create(name, val));
100 |                 }
101 |             } else {
102 |                 data.add(HttpConnection.KeyVal.create(name, el.val()));
103 |             }
104 |         }
105 |         return data;
106 |     }
107 | }
108 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/TextNode.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.nodes;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | import org.jsoup.helper.StringUtil;
  6 | import org.jsoup.helper.Validate;
  7 | 
  8 | /**
  9 |  A text node.
 10 | 
 11 |  @author Jonathan Hedley, jonathan@hedley.net */
 12 | public class TextNode extends Node {
 13 |     /*
 14 |     TextNode is a node, and so by default comes with attributes and children. The attributes are seldom used, but use
 15 |     memory, and the child nodes are never used. So we don't have them, and override accessors to attributes to create
 16 |     them as needed on the fly.
 17 |      */
 18 |     private static final String TEXT_KEY = "text";
 19 |     String text;
 20 | 
 21 |     /**
 22 |      Create a new TextNode representing the supplied (unencoded) text).
 23 | 
 24 |      @param text raw text
 25 |      @param baseUri base uri
 26 |      @see #createFromEncoded(String, String)
 27 |      */
 28 |     public TextNode(String text, String baseUri) {
 29 |         this.baseUri = baseUri;
 30 |         this.text = text;
 31 |     }
 32 | 
 33 | 	public String nodeName() {
 34 |         return "#text";
 35 |     }
 36 |     
 37 |     /**
 38 |      * Get the text content of this text node.
 39 |      * @return Unencoded, normalised text.
 40 |      * @see TextNode#getWholeText()
 41 |      */
 42 |     public String text() {
 43 |         return normaliseWhitespace(getWholeText());
 44 |     }
 45 |     
 46 |     /**
 47 |      * Set the text content of this text node.
 48 |      * @param text unencoded text
 49 |      * @return this, for chaining
 50 |      */
 51 |     public TextNode text(String text) {
 52 |         this.text = text;
 53 |         if (attributes != null)
 54 |             attributes.put(TEXT_KEY, text);
 55 |         return this;
 56 |     }
 57 | 
 58 |     /**
 59 |      Get the (unencoded) text of this text node, including any newlines and spaces present in the original.
 60 |      @return text
 61 |      */
 62 |     public String getWholeText() {
 63 |         return attributes == null ? text : attributes.get(TEXT_KEY);
 64 |     }
 65 | 
 66 |     /**
 67 |      Test if this text node is blank -- that is, empty or only whitespace (including newlines).
 68 |      @return true if this document is empty or only whitespace, false if it contains any text content.
 69 |      */
 70 |     public boolean isBlank() {
 71 |         return StringUtil.isBlank(getWholeText());
 72 |     }
 73 | 
 74 |     /**
 75 |      * Split this text node into two nodes at the specified string offset. After splitting, this node will contain the
 76 |      * original text up to the offset, and will have a new text node sibling containing the text after the offset.
 77 |      * @param offset string offset point to split node at.
 78 |      * @return the newly created text node containing the text after the offset.
 79 |      */
 80 |     public TextNode splitText(int offset) {
 81 |         Validate.isTrue(offset >= 0, "Split offset must be not be negative");
 82 |         Validate.isTrue(offset < text.length(), "Split offset must not be greater than current text length");
 83 | 
 84 |         String head = getWholeText().substring(0, offset);
 85 |         String tail = getWholeText().substring(offset);
 86 |         text(head);
 87 |         TextNode tailNode = new TextNode(tail, this.baseUri());
 88 |         if (parent() != null)
 89 |             parent().addChildren(siblingIndex()+1, tailNode);
 90 | 
 91 |         return tailNode;
 92 |     }
 93 | 
 94 | 	void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
 95 |         if (out.prettyPrint() && ((siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) || (out.outline() && siblingNodes().size()>0 && !isBlank()) ))
 96 |             indent(accum, depth, out);
 97 | 
 98 |         boolean normaliseWhite = out.prettyPrint() && parent() instanceof Element
 99 |                 && !Element.preserveWhitespace(parent());
100 |         Entities.escape(accum, getWholeText(), out, false, normaliseWhite, false);
101 |     }
102 | 
103 | 	void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {}
104 | 
105 |     @Override
106 |     public String toString() {
107 |         return outerHtml();
108 |     }
109 | 
110 |     /**
111 |      * Create a new TextNode from HTML encoded (aka escaped) data.
112 |      * @param encodedText Text containing encoded HTML (e.g. &amp;lt;)
113 |      * @param baseUri Base uri
114 |      * @return TextNode containing unencoded data (e.g. &lt;)
115 |      */
116 |     public static TextNode createFromEncoded(String encodedText, String baseUri) {
117 |         String text = Entities.unescape(encodedText);
118 |         return new TextNode(text, baseUri);
119 |     }
120 | 
121 |     static String normaliseWhitespace(String text) {
122 |         text = StringUtil.normaliseWhitespace(text);
123 |         return text;
124 |     }
125 | 
126 |     static String stripLeadingWhitespace(String text) {
127 |         return text.replaceFirst("^\\s+", "");
128 |     }
129 | 
130 |     static boolean lastCharIsWhitespace(StringBuilder sb) {
131 |         return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' ';
132 |     }
133 | 
134 |     // attribute fiddling. create on first access.
135 |     private void ensureAttributes() {
136 |         if (attributes == null) {
137 |             attributes = new Attributes();
138 |             attributes.put(TEXT_KEY, text);
139 |         }
140 |     }
141 | 
142 |     @Override
143 |     public String attr(String attributeKey) {
144 |         ensureAttributes();
145 |         return super.attr(attributeKey);
146 |     }
147 | 
148 |     @Override
149 |     public Attributes attributes() {
150 |         ensureAttributes();
151 |         return super.attributes();
152 |     }
153 | 
154 |     @Override
155 |     public Node attr(String attributeKey, String attributeValue) {
156 |         ensureAttributes();
157 |         return super.attr(attributeKey, attributeValue);
158 |     }
159 | 
160 |     @Override
161 |     public boolean hasAttr(String attributeKey) {
162 |         ensureAttributes();
163 |         return super.hasAttr(attributeKey);
164 |     }
165 | 
166 |     @Override
167 |     public Node removeAttr(String attributeKey) {
168 |         ensureAttributes();
169 |         return super.removeAttr(attributeKey);
170 |     }
171 | 
172 |     @Override
173 |     public String absUrl(String attributeKey) {
174 |         ensureAttributes();
175 |         return super.absUrl(attributeKey);
176 |     }
177 | }
178 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/XmlDeclaration.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import org.jsoup.helper.Validate;
 4 | 
 5 | import java.io.IOException;
 6 | 
 7 | /**
 8 |  An XML Declaration.
 9 | 
10 |  @author Jonathan Hedley, jonathan@hedley.net */
11 | public class XmlDeclaration extends Node {
12 |     private final String name;
13 |     private final boolean isProcessingInstruction; // <! if true, <? if false, declaration (and last data char should be ?)
14 | 
15 |     /**
16 |      Create a new XML declaration
17 |      @param name of declaration
18 |      @param baseUri base uri
19 |      @param isProcessingInstruction is processing instruction
20 |      */
21 |     public XmlDeclaration(String name, String baseUri, boolean isProcessingInstruction) {
22 |         super(baseUri);
23 |         Validate.notNull(name);
24 |         this.name = name;
25 |         this.isProcessingInstruction = isProcessingInstruction;
26 |     }
27 | 
28 |     public String nodeName() {
29 |         return "#declaration";
30 |     }
31 | 
32 | 
33 |     /**
34 |      * Get the name of this declaration.
35 |      * @return name of this declaration.
36 |      */
37 |     public String name() {
38 |         return name;
39 |     }
40 | 
41 |     /**
42 |      Get the unencoded XML declaration.
43 |      @return XML declaration
44 |      */
45 |     public String getWholeDeclaration() {
46 |         return attributes.html().trim(); // attr html starts with a " "
47 |     }
48 | 
49 | 	void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
50 |         accum
51 |             .append("<")
52 |             .append(isProcessingInstruction ? "!" : "?")
53 |             .append(name);
54 |         attributes.html(accum, out);
55 |         accum
56 |             .append(isProcessingInstruction ? "!" : "?")
57 |             .append(">");
58 |     }
59 | 
60 | 	void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {}
61 | 
62 |     @Override
63 |     public String toString() {
64 |         return outerHtml();
65 |     }
66 | }
67 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/entities-base.properties:
--------------------------------------------------------------------------------
  1 | AElig=5i;1c
  2 | AMP=12;2
  3 | Aacute=5d;17
  4 | Acirc=5e;18
  5 | Agrave=5c;16
  6 | Aring=5h;1b
  7 | Atilde=5f;19
  8 | Auml=5g;1a
  9 | COPY=4p;h
 10 | Ccedil=5j;1d
 11 | ETH=5s;1m
 12 | Eacute=5l;1f
 13 | Ecirc=5m;1g
 14 | Egrave=5k;1e
 15 | Euml=5n;1h
 16 | GT=1q;6
 17 | Iacute=5p;1j
 18 | Icirc=5q;1k
 19 | Igrave=5o;1i
 20 | Iuml=5r;1l
 21 | LT=1o;4
 22 | Ntilde=5t;1n
 23 | Oacute=5v;1p
 24 | Ocirc=5w;1q
 25 | Ograve=5u;1o
 26 | Oslash=60;1u
 27 | Otilde=5x;1r
 28 | Ouml=5y;1s
 29 | QUOT=y;0
 30 | REG=4u;n
 31 | THORN=66;20
 32 | Uacute=62;1w
 33 | Ucirc=63;1x
 34 | Ugrave=61;1v
 35 | Uuml=64;1y
 36 | Yacute=65;1z
 37 | aacute=69;23
 38 | acirc=6a;24
 39 | acute=50;u
 40 | aelig=6e;28
 41 | agrave=68;22
 42 | amp=12;3
 43 | aring=6d;27
 44 | atilde=6b;25
 45 | auml=6c;26
 46 | brvbar=4m;e
 47 | ccedil=6f;29
 48 | cedil=54;y
 49 | cent=4i;a
 50 | copy=4p;i
 51 | curren=4k;c
 52 | deg=4w;q
 53 | divide=6v;2p
 54 | eacute=6h;2b
 55 | ecirc=6i;2c
 56 | egrave=6g;2a
 57 | eth=6o;2i
 58 | euml=6j;2d
 59 | frac12=59;13
 60 | frac14=58;12
 61 | frac34=5a;14
 62 | gt=1q;7
 63 | iacute=6l;2f
 64 | icirc=6m;2g
 65 | iexcl=4h;9
 66 | igrave=6k;2e
 67 | iquest=5b;15
 68 | iuml=6n;2h
 69 | laquo=4r;k
 70 | lt=1o;5
 71 | macr=4v;p
 72 | micro=51;v
 73 | middot=53;x
 74 | nbsp=4g;8
 75 | not=4s;l
 76 | ntilde=6p;2j
 77 | oacute=6r;2l
 78 | ocirc=6s;2m
 79 | ograve=6q;2k
 80 | ordf=4q;j
 81 | ordm=56;10
 82 | oslash=6w;2q
 83 | otilde=6t;2n
 84 | ouml=6u;2o
 85 | para=52;w
 86 | plusmn=4x;r
 87 | pound=4j;b
 88 | quot=y;1
 89 | raquo=57;11
 90 | reg=4u;o
 91 | sect=4n;f
 92 | shy=4t;m
 93 | sup1=55;z
 94 | sup2=4y;s
 95 | sup3=4z;t
 96 | szlig=67;21
 97 | thorn=72;2w
 98 | times=5z;1t
 99 | uacute=6y;2s
100 | ucirc=6z;2t
101 | ugrave=6x;2r
102 | uml=4o;g
103 | uuml=70;2u
104 | yacute=71;2v
105 | yen=4l;d
106 | yuml=73;2x
107 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/entities-xhtml.properties:
--------------------------------------------------------------------------------
1 | amp=12;1
2 | gt=1q;3
3 | lt=1o;2
4 | quot=y;0
5 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/nodes/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  HTML document structure nodes.
3 |  */
4 | package org.jsoup.nodes;


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  Contains the main {@link org.jsoup.Jsoup} class, which provides convenient static access to the jsoup functionality. 
3 |  */
4 | package org.jsoup;


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/ParseError.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | /**
 4 |  * A Parse Error records an error in the input HTML that occurs in either the tokenisation or the tree building phase.
 5 |  */
 6 | public class ParseError {
 7 |     private int pos;
 8 |     private String errorMsg;
 9 | 
10 |     ParseError(int pos, String errorMsg) {
11 |         this.pos = pos;
12 |         this.errorMsg = errorMsg;
13 |     }
14 | 
15 |     ParseError(int pos, String errorFormat, Object... args) {
16 |         this.errorMsg = String.format(errorFormat, args);
17 |         this.pos = pos;
18 |     }
19 | 
20 |     /**
21 |      * Retrieve the error message.
22 |      * @return the error message.
23 |      */
24 |     public String getErrorMessage() {
25 |         return errorMsg;
26 |     }
27 | 
28 |     /**
29 |      * Retrieves the offset of the error.
30 |      * @return error offset within input
31 |      */
32 |     public int getPosition() {
33 |         return pos;
34 |     }
35 | 
36 |     @Override
37 |     public String toString() {
38 |         return pos + ": " + errorMsg;
39 |     }
40 | }
41 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/ParseErrorList.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | import java.util.ArrayList;
 4 | 
 5 | /**
 6 |  * A container for ParseErrors.
 7 |  * 
 8 |  * @author Jonathan Hedley
 9 |  */
10 | public class ParseErrorList extends ArrayList<ParseError>{
11 |     private static final int INITIAL_CAPACITY = 16;
12 |     private final int maxSize;
13 |     
14 |     ParseErrorList(int initialCapacity, int maxSize) {
15 |         super(initialCapacity);
16 |         this.maxSize = maxSize;
17 |     }
18 |     
19 |     boolean canAddError() {
20 |         return size() < maxSize;
21 |     }
22 | 
23 |     int getMaxSize() {
24 |         return maxSize;
25 |     }
26 | 
27 |     public static ParseErrorList noTracking() {
28 |         return new ParseErrorList(0, 0);
29 |     }
30 |     
31 |     public static ParseErrorList tracking(int maxSize) {
32 |         return new ParseErrorList(INITIAL_CAPACITY, maxSize);
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/ParseSettings.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | import org.jsoup.nodes.Attribute;
 4 | import org.jsoup.nodes.Attributes;
 5 | 
 6 | /**
 7 |  * Controls parser settings, to optionally preserve tag and/or attribute name case.
 8 |  */
 9 | public class ParseSettings {
10 |     /**
11 |      * HTML default settings: both tag and attribute names are lower-cased during parsing.
12 |      */
13 |     public static final ParseSettings htmlDefault;
14 |     /**
15 |      * Preserve both tag and attribute case.
16 |      */
17 |     public static final ParseSettings preserveCase;
18 | 
19 |     static {
20 |         htmlDefault = new ParseSettings(false, false);
21 |         preserveCase = new ParseSettings(true, true);
22 |     }
23 | 
24 |     private final boolean preserveTagCase;
25 |     private final boolean preserveAttributeCase;
26 | 
27 |     /**
28 |      * Define parse settings.
29 |      * @param tag preserve tag case?
30 |      * @param attribute preserve attribute name case?
31 |      */
32 |     public ParseSettings(boolean tag, boolean attribute) {
33 |         preserveTagCase = tag;
34 |         preserveAttributeCase = attribute;
35 |     }
36 | 
37 |     String normalizeTag(String name) {
38 |         name = name.trim();
39 |         if (!preserveTagCase)
40 |             name = name.toLowerCase();
41 |         return name;
42 |     }
43 | 
44 |     String normalizeAttribute(String name) {
45 |         name = name.trim();
46 |         if (!preserveAttributeCase)
47 |             name = name.toLowerCase();
48 |         return name;
49 |     }
50 | 
51 |     Attributes normalizeAttributes(Attributes attributes) {
52 |         if (!preserveAttributeCase) {
53 |             for (Attribute attr : attributes) {
54 |                 attr.setKey(attr.getKey().toLowerCase());
55 |             }
56 |         }
57 |         return attributes;
58 |     }
59 | }
60 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/Parser.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.parser;
  2 | 
  3 | import org.jsoup.nodes.Document;
  4 | import org.jsoup.nodes.Element;
  5 | import org.jsoup.nodes.Node;
  6 | 
  7 | import java.util.List;
  8 | 
  9 | /**
 10 |  * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the  more convenient parse methods
 11 |  * in {@link org.jsoup.Jsoup}.
 12 |  */
 13 | public class Parser {
 14 |     private static final int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled.
 15 |     
 16 |     private TreeBuilder treeBuilder;
 17 |     private int maxErrors = DEFAULT_MAX_ERRORS;
 18 |     private ParseErrorList errors;
 19 |     private ParseSettings settings;
 20 | 
 21 |     /**
 22 |      * Create a new Parser, using the specified TreeBuilder
 23 |      * @param treeBuilder TreeBuilder to use to parse input into Documents.
 24 |      */
 25 |     public Parser(TreeBuilder treeBuilder) {
 26 |         this.treeBuilder = treeBuilder;
 27 |         settings = treeBuilder.defaultSettings();
 28 |     }
 29 |     
 30 |     public Document parseInput(String html, String baseUri) {
 31 |         errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
 32 |         return treeBuilder.parse(html, baseUri, errors, settings);
 33 |     }
 34 | 
 35 |     // gets & sets
 36 |     /**
 37 |      * Get the TreeBuilder currently in use.
 38 |      * @return current TreeBuilder.
 39 |      */
 40 |     public TreeBuilder getTreeBuilder() {
 41 |         return treeBuilder;
 42 |     }
 43 | 
 44 |     /**
 45 |      * Update the TreeBuilder used when parsing content.
 46 |      * @param treeBuilder current TreeBuilder
 47 |      * @return this, for chaining
 48 |      */
 49 |     public Parser setTreeBuilder(TreeBuilder treeBuilder) {
 50 |         this.treeBuilder = treeBuilder;
 51 |         return this;
 52 |     }
 53 | 
 54 |     /**
 55 |      * Check if parse error tracking is enabled.
 56 |      * @return current track error state.
 57 |      */
 58 |     public boolean isTrackErrors() {
 59 |         return maxErrors > 0;
 60 |     }
 61 | 
 62 |     /**
 63 |      * Enable or disable parse error tracking for the next parse.
 64 |      * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
 65 |      * @return this, for chaining
 66 |      */
 67 |     public Parser setTrackErrors(int maxErrors) {
 68 |         this.maxErrors = maxErrors;
 69 |         return this;
 70 |     }
 71 | 
 72 |     /**
 73 |      * Retrieve the parse errors, if any, from the last parse. The list is replaced each time parseInput is called.
 74 |      * @return list of parse errors, up to the size of the maximum errors tracked.
 75 |      */
 76 |     public List<ParseError> getErrors() {
 77 |         return errors;
 78 |     }
 79 | 
 80 |     public Parser settings(ParseSettings settings) { // replaces the settings that parseInput passes to the builder
 81 |         this.settings = settings;
 82 |         return this; // for chaining
 83 |     }
 84 | 
 85 |     public ParseSettings settings() { // current settings: the builder's defaults unless replaced via settings(ParseSettings)
 86 |         return settings;
 87 |     }
 88 | 
 89 |     // static parse functions below
 90 |     /**
 91 |      * Parse HTML into a Document, using a new HTML tree builder with its default settings and no error tracking.
 92 |      *
 93 |      * @param html HTML to parse
 94 |      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
 95 |      *
 96 |      * @return parsed Document
 97 |      */
 98 |     public static Document parse(String html, String baseUri) {
 99 |         final TreeBuilder htmlBuilder = new HtmlTreeBuilder();
100 |         return htmlBuilder.parse(html, baseUri, ParseErrorList.noTracking(), htmlBuilder.defaultSettings());
101 |     }
102 | 
103 |     /**
104 |      * Parse a fragment of HTML into a list of nodes, without error tracking. The context element, if supplied, supplies parsing context.
105 |      *
106 |      * @param fragmentHtml the fragment of HTML to parse
107 |      * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
108 |      * provides stack context (for implicit element creation).
109 |      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
110 |      *
111 |      * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
112 |      */
113 |     public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
114 |         final HtmlTreeBuilder htmlBuilder = new HtmlTreeBuilder();
115 |         return htmlBuilder.parseFragment(fragmentHtml, context, baseUri, ParseErrorList.noTracking(), htmlBuilder.defaultSettings());
116 |     }
117 | 
118 |     /**
119 |      * Parse a fragment of HTML into a list of nodes, reporting errors to the supplied list. The context element, if supplied, supplies parsing context.
120 |      *
121 |      * @param fragmentHtml the fragment of HTML to parse
122 |      * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
123 |      * provides stack context (for implicit element creation).
124 |      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
125 |      * @param errorList list to add errors to
126 |      *
127 |      * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
128 |      */
129 |     public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) {
130 |         final HtmlTreeBuilder htmlBuilder = new HtmlTreeBuilder();
131 |         return htmlBuilder.parseFragment(fragmentHtml, context, baseUri, errorList, htmlBuilder.defaultSettings());
132 |     }
133 | 
134 |     /**
135 |      * Parse a fragment of XML into a list of nodes, using a new XML tree builder with its default settings and no error tracking.
136 |      *
137 |      * @param fragmentXml the fragment of XML to parse
138 |      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
139 |      * @return list of nodes parsed from the input XML.
140 |      */
141 |     public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
142 |         final XmlTreeBuilder xmlBuilder = new XmlTreeBuilder();
143 |         return xmlBuilder.parseFragment(fragmentXml, baseUri, ParseErrorList.noTracking(), xmlBuilder.defaultSettings());
144 |     }
145 | 
146 |     /**
147 |      * Parse a fragment of HTML into the {@code body} of a new shell Document.
148 |      *
149 |      * @param bodyHtml fragment of HTML
150 |      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
151 |      *
152 |      * @return Document, with empty head, and HTML parsed into body
153 |      */
154 |     public static Document parseBodyFragment(String bodyHtml, String baseUri) {
155 |         Document shell = Document.createShell(baseUri);
156 |         Element body = shell.body();
157 |         List<Node> parsed = parseFragment(bodyHtml, body, baseUri);
158 |         Node[] snapshot = parsed.toArray(new Node[parsed.size()]); // the node list gets modified when re-parented
159 |         int last = snapshot.length - 1;
160 |         // detach from the end of the snapshot down to index 1; index 0 is not removed here
161 |         while (last > 0)
162 |             snapshot[last--].remove();
163 |         for (int i = 0; i < snapshot.length; i++)
164 |             body.appendChild(snapshot[i]);
165 |         return shell;
166 |     }
167 | 
168 |     /**
169 |      * Utility method to unescape HTML entities from a string
170 |      * @param string HTML escaped string
171 |      * @param inAttribute if the string is to be unescaped in strict mode (as attributes are)
172 |      * @return an unescaped string
173 |      */
174 |     public static String unescapeEntities(String string, boolean inAttribute) {
175 |         CharacterReader input = new CharacterReader(string);
176 |         return new Tokeniser(input, ParseErrorList.noTracking()).unescapeEntities(inAttribute);
177 |     }
178 | 
179 |     /**
180 |      * @param bodyHtml HTML to parse
181 |      * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
182 |      * @return parsed Document
183 |      * @deprecated This only delegates to {@link #parse(String, String)}. Use {@link #parseBodyFragment} or {@link #parseFragment} instead.
184 |      */
185 |     @Deprecated
186 |     public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) {
187 |         return parse(bodyHtml, baseUri);
188 |     }
189 |     
190 |     // builders
191 | 
192 |     /**
193 |      * Create a new HTML parser, backed by a new {@code HtmlTreeBuilder}. This parser treats input as HTML5, and enforces
194 |      * the creation of a normalised document, based on a knowledge of the semantics of the incoming tags.
195 |      * @return a new HTML parser.
196 |      */
197 |     public static Parser htmlParser() {
198 |         return new Parser(new HtmlTreeBuilder());
199 |     }
200 | 
201 |     /**
202 |      * Create a new XML parser, backed by a new {@code XmlTreeBuilder}. This parser assumes no knowledge of the incoming
203 |      * tags and does not treat it as HTML, rather creates a simple tree directly from the input.
204 |      * @return a new simple XML parser.
205 |      */
206 |     public static Parser xmlParser() {
207 |         return new Parser(new XmlTreeBuilder());
208 |     }
209 | }
210 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/TreeBuilder.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | import org.jsoup.helper.Validate;
 4 | import org.jsoup.nodes.Attributes;
 5 | import org.jsoup.nodes.Document;
 6 | import org.jsoup.nodes.Element;
 7 | 
 8 | import java.util.ArrayList;
 9 | 
10 | /**
11 |  * @author Jonathan Hedley
12 |  */
13 | abstract class TreeBuilder {
14 |     CharacterReader reader;
15 |     Tokeniser tokeniser;
16 |     protected Document doc; // current doc we are building into
17 |     protected ArrayList<Element> stack; // the stack of open elements
18 |     protected String baseUri; // current base uri, for creating new elements
19 |     protected Token currentToken; // currentToken is used only for error tracking.
20 |     protected ParseErrorList errors; // null when not tracking errors
21 |     protected ParseSettings settings;
22 | 
23 |     private Token.StartTag start = new Token.StartTag(); // start tag to process
24 |     private Token.EndTag end  = new Token.EndTag();
25 | 
26 |     abstract ParseSettings defaultSettings();
27 | 
28 |     protected void initialiseParse(String input, String baseUri, ParseErrorList errors, ParseSettings settings) {
29 |         Validate.notNull(input, "String input must not be null");
30 |         Validate.notNull(baseUri, "BaseURI must not be null");
31 | 
32 |         doc = new Document(baseUri);
33 |         this.settings = settings;
34 |         reader = new CharacterReader(input);
35 |         this.errors = errors;
36 |         tokeniser = new Tokeniser(reader, errors);
37 |         stack = new ArrayList<Element>(32);
38 |         this.baseUri = baseUri;
39 |     }
40 | 
41 |     Document parse(String input, String baseUri, ParseErrorList errors, ParseSettings settings) {
42 |         initialiseParse(input, baseUri, errors, settings);
43 |         runParser();
44 |         return doc;
45 |     }
46 | 
47 |     protected void runParser() {
48 |         while (true) {
49 |             Token token = tokeniser.read();
50 |             process(token);
51 |             token.reset();
52 | 
53 |             if (token.type == Token.TokenType.EOF)
54 |                 break;
55 |         }
56 |     }
57 | 
58 |     protected abstract boolean process(Token token);
59 | 
60 |     protected boolean processStartTag(String name) {
61 |         if (currentToken == start) { // don't recycle an in-use token
62 |             return process(new Token.StartTag().name(name));
63 |         }
64 |         return process(start.reset().name(name));
65 |     }
66 | 
67 |     public boolean processStartTag(String name, Attributes attrs) {
68 |         if (currentToken == start) { // don't recycle an in-use token
69 |             return process(new Token.StartTag().nameAttr(name, attrs));
70 |         }
71 |         start.reset();
72 |         start.nameAttr(name, attrs);
73 |         return process(start);
74 |     }
75 | 
76 |     protected boolean processEndTag(String name) {
77 |         if (currentToken == end) { // don't recycle an in-use token
78 |             return process(new Token.EndTag().name(name));
79 |         }
80 |         return process(end.reset().name(name));
81 |     }
82 | 
83 | 
84 |     protected Element currentElement() {
85 |         int size = stack.size();
86 |         return size > 0 ? stack.get(size-1) : null;
87 |     }
88 | }
89 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/XmlTreeBuilder.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.parser;
  2 | 
  3 | import org.jsoup.Jsoup;
  4 | import org.jsoup.helper.Validate;
  5 | import org.jsoup.nodes.*;
  6 | 
  7 | import java.util.List;
  8 | 
  9 | /**
 10 |  * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the
 11 |  * document.
 12 |  * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
 13 |  *
 14 |  * @author Jonathan Hedley
 15 |  */
 16 | public class XmlTreeBuilder extends TreeBuilder {
 17 |     /** Default settings for this builder: {@code ParseSettings.preserveCase}. */
 18 |     ParseSettings defaultSettings() {
 19 |         return ParseSettings.preserveCase;
 20 |     }
 21 | 
 22 |     /** Parse {@code input} with no error tracking and case-preserving settings. */
 23 |     Document parse(String input, String baseUri) {
 24 |         return parse(input, baseUri, ParseErrorList.noTracking(), ParseSettings.preserveCase);
 25 |     }
 26 | 
 27 |     @Override
 28 |     protected void initialiseParse(String input, String baseUri, ParseErrorList errors, ParseSettings settings) {
 29 |         super.initialiseParse(input, baseUri, errors, settings);
 30 |         stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack)
 31 |         doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
 32 |     }
 33 | 
 34 |     /**
 35 |      * Insert a node for the token under the current element, or pop the stack for an end tag. EOF is a no-op.
 36 |      * @param token token to handle
 37 |      * @return always true
 38 |      */
 39 |     @Override
 40 |     protected boolean process(Token token) {
 41 |         // start tag, end tag, doctype, comment, character, eof
 42 |         switch (token.type) {
 43 |             case StartTag:
 44 |                 insert(token.asStartTag());
 45 |                 break;
 46 |             case EndTag:
 47 |                 popStackToClose(token.asEndTag());
 48 |                 break;
 49 |             case Comment:
 50 |                 insert(token.asComment());
 51 |                 break;
 52 |             case Character:
 53 |                 insert(token.asCharacter());
 54 |                 break;
 55 |             case Doctype:
 56 |                 insert(token.asDoctype());
 57 |                 break;
 58 |             case EOF: // could put some normalisation here if desired
 59 |                 break;
 60 |             default:
 61 |                 Validate.fail("Unexpected token type: " + token.type);
 62 |         }
 63 |         return true;
 64 |     }
 65 | 
 66 |     /** Append the node to the element currently on top of the stack. */
 67 |     private void insertNode(Node node) {
 68 |         currentElement().appendChild(node);
 69 |     }
 70 | 
 71 |     /**
 72 |      * Create an element for the start tag, append it, and push it onto the stack unless the tag is self closing.
 73 |      * @param startTag start tag token
 74 |      * @return the new element
 75 |      */
 76 |     Element insert(Token.StartTag startTag) {
 77 |         Tag tag = Tag.valueOf(startTag.name(), settings);
 78 |         // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html.
 79 |         Element el = new Element(tag, baseUri, settings.normalizeAttributes(startTag.attributes));
 80 |         insertNode(el);
 81 |         if (startTag.isSelfClosing()) {
 82 |             tokeniser.acknowledgeSelfClosingFlag();
 83 |             if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output. see above.
 84 |                 tag.setSelfClosing();
 85 |         } else {
 86 |             stack.add(el);
 87 |         }
 88 |         return el;
 89 |     }
 90 | 
 91 |     /**
 92 |      * Insert a comment. Bogus comments whose data starts with {@code !} or {@code ?} are re-parsed as a tag so they
 93 |      * can be inserted as an {@link XmlDeclaration}; if that re-parse yields no element, the plain comment is kept.
 94 |      * @param commentToken comment token
 95 |      */
 96 |     void insert(Token.Comment commentToken) {
 97 |         Comment comment = new Comment(commentToken.getData(), baseUri);
 98 |         Node insert = comment;
 99 |         if (commentToken.bogus) { // xml declarations are emitted as bogus comments (which is right for html, but not xml)
100 |             // so we do a bit of a hack and parse the data as an element to pull the attributes out
101 |             String data = comment.getData();
102 |             if (data.length() > 1 && (data.startsWith("!") || data.startsWith("?"))) {
103 |                 Document doc = Jsoup.parse("<" + data.substring(1, data.length() -1) + ">", baseUri, Parser.xmlParser());
104 |                 // the re-parse may produce no element at all (e.g. nothing usable between the markers); child(0) would throw
105 |                 if (!doc.children().isEmpty()) {
106 |                     Element el = doc.child(0);
107 |                     insert = new XmlDeclaration(settings.normalizeTag(el.tagName()), comment.baseUri(), data.startsWith("!"));
108 |                     insert.attributes().addAll(el.attributes());
109 |                 } // else: could not be read as a declaration, so it stays a comment
110 |             }
111 |         }
112 |         insertNode(insert);
113 |     }
114 | 
115 |     void insert(Token.Character characterToken) {
116 |         Node node = new TextNode(characterToken.getData(), baseUri);
117 |         insertNode(node);
118 |     }
119 | 
120 |     void insert(Token.Doctype d) {
121 |         DocumentType doctypeNode = new DocumentType(settings.normalizeTag(d.getName()), d.getPubSysKey(), d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri);
122 |         insertNode(doctypeNode);
123 |     }
124 | 
125 |     /**
126 |      * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
127 |      * found, skips.
128 |      *
129 |      * @param endTag the end tag whose name is looked up on the stack, searching from the top down
130 |      */
131 |     private void popStackToClose(Token.EndTag endTag) {
132 |         String elName = endTag.name();
133 |         Element firstFound = null;
134 | 
135 |         for (int pos = stack.size() -1; pos >= 0; pos--) {
136 |             Element next = stack.get(pos);
137 |             if (next.nodeName().equals(elName)) {
138 |                 firstFound = next;
139 |                 break;
140 |             }
141 |         }
142 |         if (firstFound == null)
143 |             return; // not found, skip
144 | 
145 |         for (int pos = stack.size() -1; pos >= 0; pos--) {
146 |             Element next = stack.get(pos);
147 |             stack.remove(pos);
148 |             if (next == firstFound)
149 |                 break;
150 |         }
151 |     }
152 | 
153 |     /**
154 |      * Parse an XML fragment and return the child nodes of the document that was built from it.
155 |      */
156 |     List<Node> parseFragment(String inputFragment, String baseUri, ParseErrorList errors, ParseSettings settings) {
157 |         initialiseParse(inputFragment, baseUri, errors, settings);
158 |         runParser();
159 |         return doc.childNodes();
160 |     }
161 | }
162 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/parser/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  Contains the HTML parser, tag specifications, and HTML tokeniser.
3 |  */
4 | package org.jsoup.parser;
5 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/safety/Cleaner.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.safety;
  2 | 
  3 | import org.jsoup.helper.Validate;
  4 | import org.jsoup.nodes.Attribute;
  5 | import org.jsoup.nodes.Attributes;
  6 | import org.jsoup.nodes.DataNode;
  7 | import org.jsoup.nodes.Document;
  8 | import org.jsoup.nodes.Element;
  9 | import org.jsoup.nodes.Node;
 10 | import org.jsoup.nodes.TextNode;
 11 | import org.jsoup.parser.ParseErrorList;
 12 | import org.jsoup.parser.Parser;
 13 | import org.jsoup.parser.Tag;
 14 | import org.jsoup.select.NodeTraversor;
 15 | import org.jsoup.select.NodeVisitor;
 16 | 
 17 | import java.util.List;
 18 | 
 19 | 
 20 | /**
 21 |  The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
 22 |  that you are expecting; no junk, and no cross-site scripting attacks!
 23 |  <p>
 24 |  The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain
 25 |  HTML that is allowed by the whitelist.
 26 |  </p>
 27 |  <p>
 28 |  It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
 29 |  canned white-lists only allow body contained tags.
 30 |  </p>
 31 |  <p>
 32 |  Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
 33 |  </p>
 34 |  */
 35 | public class Cleaner {
 36 |     private Whitelist whitelist;
 37 | 
 38 |     /**
 39 |      Create a new cleaner, that sanitizes documents using the supplied whitelist.
 40 |      @param whitelist white-list to clean with
 41 |      */
 42 |     public Cleaner(Whitelist whitelist) {
 43 |         Validate.notNull(whitelist);
 44 |         this.whitelist = whitelist;
 45 |     }
 46 | 
 47 |     /**
 48 |      Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist.
 49 |      The original document is not modified. Only elements from the dirty document's <code>body</code> are used.
 50 |      @param dirtyDocument Untrusted base document to clean.
 51 |      @return cleaned document.
 52 |      */
 53 |     public Document clean(Document dirtyDocument) {
 54 |         Validate.notNull(dirtyDocument);
 55 | 
 56 |         Document clean = Document.createShell(dirtyDocument.baseUri());
 57 |         if (dirtyDocument.body() != null) // frameset documents won't have a body. the clean doc will have empty body.
 58 |             copySafeNodes(dirtyDocument.body(), clean.body());
 59 | 
 60 |         return clean;
 61 |     }
 62 | 
 63 |     /**
 64 |      Determines if the input document <b>body</b> is valid, against the whitelist. It is considered valid if all the tags and attributes
 65 |      in the input HTML are allowed by the whitelist, and that there is no content in the <code>head</code>.
 66 |      <p>
 67 |      This method can be used as a validator for user input. An invalid document will still be cleaned successfully
 68 |      using the {@link #clean(Document)} method. If using as a validator, it is recommended to still clean the document
 69 |      to ensure enforced attributes are set correctly, and that the output is tidied.
 70 |      </p>
 71 |      @param dirtyDocument document to test
 72 |      @return true if no tags or attributes need to be removed; false if they do
 73 |      */
 74 |     public boolean isValid(Document dirtyDocument) {
 75 |         Validate.notNull(dirtyDocument);
 76 | 
 77 |         Document clean = Document.createShell(dirtyDocument.baseUri());
 78 |         int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
 79 |         return numDiscarded == 0
 80 |             && dirtyDocument.head().childNodes().size() == 0; // because we only look at the body, but we start from a shell, make sure there's nothing in the head
 81 |     }
 82 | 
 83 |     public boolean isValidBodyHtml(String bodyHtml) { // true only if nothing is discarded and the parse recorded no error
 84 |         Document clean = Document.createShell("");
 85 |         Document dirty = Document.createShell("");
 86 |         ParseErrorList errorList = ParseErrorList.tracking(1); // a single recorded error is enough to fail the check below
 87 |         List<Node> nodes = Parser.parseFragment(bodyHtml, dirty.body(), "", errorList);
 88 |         dirty.body().insertChildren(0, nodes);
 89 |         int numDiscarded = copySafeNodes(dirty.body(), clean.body());
 90 |         return numDiscarded == 0 && errorList.size() == 0;
 91 |     }
 92 | 
 93 |     /**
 94 |      Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
 95 |      */
 96 |     private final class CleaningVisitor implements NodeVisitor {
 97 |         private int numDiscarded = 0; // count of elements, attributes and other nodes that were not copied
 98 |         private final Element root;
 99 |         private Element destination; // current element to append nodes to
100 | 
101 |         private CleaningVisitor(Element root, Element destination) {
102 |             this.root = root;
103 |             this.destination = destination;
104 |         }
105 | 
106 |         public void head(Node source, int depth) {
107 |             if (source instanceof Element) {
108 |                 Element sourceEl = (Element) source;
109 | 
110 |                 if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
111 |                     ElementMeta meta = createSafeElement(sourceEl);
112 |                     Element destChild = meta.el;
113 |                     destination.appendChild(destChild);
114 | 
115 |                     numDiscarded += meta.numAttribsDiscarded;
116 |                     destination = destChild; // descend: children of this source element are appended to its copy
117 |                 } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
118 |                     numDiscarded++;
119 |                 }
120 |             } else if (source instanceof TextNode) {
121 |                 TextNode sourceText = (TextNode) source;
122 |                 TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());
123 |                 destination.appendChild(destText);
124 |             } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) {
125 |               DataNode sourceData = (DataNode) source;
126 |               DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri());
127 |               destination.appendChild(destData);
128 |             } else { // else, we don't care about comments, xml proc instructions, etc
129 |                 numDiscarded++;
130 |             }
131 |         }
132 | 
133 |         public void tail(Node source, int depth) {
134 |             if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {
135 |                 destination = destination.parent(); // would have descended, so pop destination stack
136 |             }
137 |         }
138 |     }
139 | 
140 |     private int copySafeNodes(Element source, Element dest) { // returns the number of discarded items
141 |         CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
142 |         NodeTraversor traversor = new NodeTraversor(cleaningVisitor);
143 |         traversor.traverse(source);
144 |         return cleaningVisitor.numDiscarded;
145 |     }
146 | 
147 |     private ElementMeta createSafeElement(Element sourceEl) { // new element with the same tag and only whitelisted attributes
148 |         String sourceTag = sourceEl.tagName();
149 |         Attributes destAttrs = new Attributes();
150 |         Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);
151 |         int numDiscarded = 0;
152 | 
153 |         Attributes sourceAttrs = sourceEl.attributes();
154 |         for (Attribute sourceAttr : sourceAttrs) {
155 |             if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
156 |                 destAttrs.put(sourceAttr);
157 |             else
158 |                 numDiscarded++;
159 |         }
160 |         Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
161 |         destAttrs.addAll(enforcedAttrs); // added after the copied attributes
162 | 
163 |         return new ElementMeta(dest, numDiscarded);
164 |     }
165 | 
166 |     private static class ElementMeta { // pairs a cleaned element with the number of attributes dropped from it
167 |         Element el;
168 |         int numAttribsDiscarded;
169 | 
170 |         ElementMeta(Element el, int numAttribsDiscarded) {
171 |             this.el = el;
172 |             this.numAttribsDiscarded = numAttribsDiscarded;
173 |         }
174 |     }
175 | 
176 | }
177 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/safety/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  Contains the jsoup HTML cleaner, and whitelist definitions.
3 |  */
4 | package org.jsoup.safety;
5 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/Collector.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.select;
 2 | 
 3 | import org.jsoup.nodes.Element;
 4 | import org.jsoup.nodes.Node;
 5 | 
 6 | /**
 7 |  * Collects a list of elements that match the supplied criteria.
 8 |  *
 9 |  * @author Jonathan Hedley
10 |  */
11 | public class Collector {
12 | 
13 |     private Collector() {
14 |     }
15 | 
16 |     /**
17 |      Walk root and every descendant of root, gathering each element that the evaluator accepts.
18 |      @param eval Evaluator to test elements against
19 |      @param root root of tree to descend
20 |      @return list of matches; empty if none
21 |      */
22 |     public static Elements collect (Evaluator eval, Element root) {
23 |         Elements matched = new Elements();
24 |         NodeVisitor accumulator = new Accumulator(root, matched, eval);
25 |         new NodeTraversor(accumulator).traverse(root);
26 |         return matched;
27 |     }
28 | 
29 |     /** Visitor that adds each matching element to the result list when the element is first visited. */
30 |     private static class Accumulator implements NodeVisitor {
31 |         private final Element root;
32 |         private final Elements elements;
33 |         private final Evaluator eval;
34 | 
35 |         Accumulator(Element root, Elements elements, Evaluator eval) {
36 |             this.root = root;
37 |             this.elements = elements;
38 |             this.eval = eval;
39 |         }
40 | 
41 |         public void head(Node node, int depth) {
42 |             if (!(node instanceof Element))
43 |                 return; // only elements are tested against the evaluator
44 |             Element candidate = (Element) node;
45 |             if (eval.matches(root, candidate))
46 |                 elements.add(candidate);
47 |         }
48 | 
49 |         public void tail(Node node, int depth) {
50 |             // nothing to do when leaving a node
51 |         }
52 |     }
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/CombiningEvaluator.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.select;
  2 | 
  3 | import org.jsoup.helper.StringUtil;
  4 | import org.jsoup.nodes.Element;
  5 | 
  6 | import java.util.ArrayList;
  7 | import java.util.Arrays;
  8 | import java.util.Collection;
  9 | 
 10 | /**
 11 |  * Base combining (and, or) evaluator.
 12 |  */
 13 | abstract class CombiningEvaluator extends Evaluator {
 14 |     final ArrayList<Evaluator> evaluators;
 15 |     int num = 0;
 16 | 
 17 |     CombiningEvaluator() {
 18 |         super();
 19 |         evaluators = new ArrayList<Evaluator>();
 20 |     }
 21 | 
 22 |     CombiningEvaluator(Collection<Evaluator> evaluators) {
 23 |         this();
 24 |         this.evaluators.addAll(evaluators);
 25 |         updateNumEvaluators();
 26 |     }
 27 | 
 28 |     Evaluator rightMostEvaluator() {
 29 |         return num > 0 ? evaluators.get(num - 1) : null;
 30 |     }
 31 |     
 32 |     void replaceRightMostEvaluator(Evaluator replacement) {
 33 |         evaluators.set(num - 1, replacement);
 34 |     }
 35 | 
 36 |     void updateNumEvaluators() {
 37 |         // used so we don't need to bash on size() for every match test
 38 |         num = evaluators.size();
 39 |     }
 40 | 
 41 |     static final class And extends CombiningEvaluator {
 42 |         And(Collection<Evaluator> evaluators) {
 43 |             super(evaluators);
 44 |         }
 45 | 
 46 |         And(Evaluator... evaluators) {
 47 |             this(Arrays.asList(evaluators));
 48 |         }
 49 | 
 50 |         @Override
 51 |         public boolean matches(Element root, Element node) {
 52 |             for (int i = 0; i < num; i++) {
 53 |                 Evaluator s = evaluators.get(i);
 54 |                 if (!s.matches(root, node))
 55 |                     return false;
 56 |             }
 57 |             return true;
 58 |         }
 59 | 
 60 |         @Override
 61 |         public String toString() {
 62 |             return StringUtil.join(evaluators, " ");
 63 |         }
 64 |     }
 65 | 
 66 |     static final class Or extends CombiningEvaluator {
 67 |         /**
 68 |          * Create a new Or evaluator. The initial evaluators are ANDed together and used as the first clause of the OR.
 69 |          * @param evaluators initial OR clause (these are wrapped into an AND evaluator).
 70 |          */
 71 |         Or(Collection<Evaluator> evaluators) {
 72 |             super();
 73 |             if (num > 1)
 74 |                 this.evaluators.add(new And(evaluators));
 75 |             else // 0 or 1
 76 |                 this.evaluators.addAll(evaluators);
 77 |             updateNumEvaluators();
 78 |         }
 79 | 
 80 |         Or(Evaluator... evaluators) { this(Arrays.asList(evaluators)); }
 81 | 
 82 |         Or() {
 83 |             super();
 84 |         }
 85 | 
 86 |         public void add(Evaluator e) {
 87 |             evaluators.add(e);
 88 |             updateNumEvaluators();
 89 |         }
 90 | 
 91 |         @Override
 92 |         public boolean matches(Element root, Element node) {
 93 |             for (int i = 0; i < num; i++) {
 94 |                 Evaluator s = evaluators.get(i);
 95 |                 if (s.matches(root, node))
 96 |                     return true;
 97 |             }
 98 |             return false;
 99 |         }
100 | 
101 |         @Override
102 |         public String toString() {
103 |             return String.format(":or%s", evaluators);
104 |         }
105 |     }
106 | }
107 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/NodeTraversor.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.select;
 2 | 
 3 | import org.jsoup.nodes.Node;
 4 | 
 5 | /**
 6 |  * Depth-first node traversor. Use to iterate through all nodes under and including the specified root node.
 7 |  * <p>
 8 |  * This implementation does not use recursion, so a deep DOM does not risk blowing the stack.
 9 |  * </p>
10 |  */
11 | public class NodeTraversor {
12 |     private NodeVisitor visitor;
13 | 
14 |     /**
15 |      * Create a new traversor.
16 |      * @param visitor a class implementing the {@link NodeVisitor} interface, to be called when visiting each node.
17 |      */
18 |     public NodeTraversor(NodeVisitor visitor) {
19 |         this.visitor = visitor;
20 |     }
21 | 
22 |     /**
23 |      * Walk the root and all of its descendants depth-first, calling head when a node is reached and tail when it is left.
24 |      * @param root the root node point to traverse.
25 |      */
26 |     public void traverse(Node root) {
27 |         Node current = root;
28 |         int level = 0;
29 | 
30 |         while (current != null) {
31 |             visitor.head(current, level);
32 |             if (current.childNodeSize() > 0) { // go down to the first child
33 |                 current = current.childNode(0);
34 |                 level++;
35 |                 continue;
36 |             }
37 |             // no children: leave every node that has no next sibling, climbing back up towards the root
38 |             while (current.nextSibling() == null && level > 0) {
39 |                 visitor.tail(current, level);
40 |                 current = current.parentNode();
41 |                 level--;
42 |             }
43 |             visitor.tail(current, level);
44 |             if (current == root)
45 |                 break;
46 |             current = current.nextSibling();
47 |         }
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/NodeVisitor.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.select;
 2 | 
 3 | import org.jsoup.nodes.Node;
 4 | 
 5 | /**
 6 |  * Node visitor interface. Provide an implementing class to {@link NodeTraversor} to iterate through nodes.
 7 |  * <p>
 8 |  * This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first
 9 |  * seen, and the tail method when all of the node's children have been visited. As an example, head can be used to
10 |  * create a start tag for a node, and tail to create the end tag. For a node without children, tail is the next callback after its head.
11 |  * </p>
12 |  */
13 | public interface NodeVisitor {
14 |     /**
15 |      * Callback for when a node is first visited, before any of its children.
16 |      *
17 |      * @param node the node being visited.
18 |      * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
19 |      * of that will have depth 1.
20 |      */
21 |     void head(Node node, int depth);
22 | 
23 |     /**
24 |      * Callback for when a node is last visited, after all of its descendants have been visited.
25 |      *
26 |      * @param node the node being visited.
27 |      * @param depth the depth of the node, relative to the root node; the same value that was passed to {@code head} for
28 |      * this node. E.g., the root node has depth 0, and a child node of that will have depth 1.
29 |      */
30 |     void tail(Node node, int depth);
31 | }
32 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/StructuralEvaluator.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.select;
  2 | 
  3 | import org.jsoup.nodes.Element;
  4 | 
  5 | /**
  6 |  * Base structural evaluator. Subclasses wrap another {@link Evaluator} and decide a match by applying it to the
  7 |  * candidate element or to elements structurally related to it (descendants, ancestors, preceding siblings).
  8 |  */
  9 | abstract class StructuralEvaluator extends Evaluator {
 10 |     /** The wrapped evaluator; every subclass assigns it in its constructor. */
 11 |     Evaluator evaluator;
 12 | 
 13 |     /** Matches when the candidate element is the very same object as the supplied root. */
 14 |     static class Root extends Evaluator {
 15 |         @Override
 16 |         public boolean matches(Element root, Element element) {
 17 |             return root == element;
 18 |         }
 19 |     }
 20 | 
 21 |     /** Matches an element when some other element returned by its {@code getAllElements()} satisfies the wrapped evaluator. */
 22 |     static class Has extends StructuralEvaluator {
 23 |         public Has(Evaluator evaluator) {
 24 |             this.evaluator = evaluator;
 25 |         }
 26 | 
 27 |         @Override
 28 |         public boolean matches(Element root, Element element) {
 29 |             for (Element e : element.getAllElements()) {
 30 |                 // the identity check keeps the element itself from counting as its own match
 31 |                 if (e != element && evaluator.matches(root, e))
 32 |                     return true;
 33 |             }
 34 |             return false;
 35 |         }
 36 | 
 37 |         @Override
 38 |         public String toString() {
 39 |             return String.format(":has(%s)", evaluator);
 40 |         }
 41 |     }
 42 | 
 43 |     /** Inverts the wrapped evaluator: matches exactly when the wrapped evaluator does not. */
 44 |     static class Not extends StructuralEvaluator {
 45 |         public Not(Evaluator evaluator) {
 46 |             this.evaluator = evaluator;
 47 |         }
 48 | 
 49 |         @Override
 50 |         public boolean matches(Element root, Element node) {
 51 |             return !evaluator.matches(root, node);
 52 |         }
 53 | 
 54 |         @Override
 55 |         public String toString() {
 56 |             return String.format(":not%s", evaluator);
 57 |         }
 58 |     }
 59 | 
 60 |     /**
 61 |      * Matches an element when any of its ancestors, up to and including the root, satisfies the wrapped evaluator.
 62 |      * The root itself never matches.
 63 |      */
 64 |     static class Parent extends StructuralEvaluator {
 65 |         public Parent(Evaluator evaluator) {
 66 |             this.evaluator = evaluator;
 67 |         }
 68 | 
 69 |         @Override
 70 |         public boolean matches(Element root, Element element) {
 71 |             if (root == element)
 72 |                 return false;
 73 | 
 74 |             Element parent = element.parent();
 75 |             // Walk upwards until root has been tested. The null guard ends the walk when the ancestor chain
 76 |             // runs out before root is reached (element detached, or not below root), so null is never
 77 |             // passed to the evaluator or dereferenced; such an element simply does not match.
 78 |             while (parent != null) {
 79 |                 if (evaluator.matches(root, parent))
 80 |                     return true;
 81 |                 if (parent == root)
 82 |                     break;
 83 |                 parent = parent.parent();
 84 |             }
 85 |             return false;
 86 |         }
 87 | 
 88 |         @Override
 89 |         public String toString() {
 90 |             return String.format(":parent%s", evaluator);
 91 |         }
 92 |     }
 93 | 
 94 |     /** Matches an element (other than the root) whose direct parent exists and satisfies the wrapped evaluator. */
 95 |     static class ImmediateParent extends StructuralEvaluator {
 96 |         public ImmediateParent(Evaluator evaluator) {
 97 |             this.evaluator = evaluator;
 98 |         }
 99 | 
100 |         @Override
101 |         public boolean matches(Element root, Element element) {
102 |             if (root == element)
103 |                 return false;
104 | 
105 |             Element parent = element.parent();
106 |             return parent != null && evaluator.matches(root, parent);
107 |         }
108 | 
109 |         @Override
110 |         public String toString() {
111 |             return String.format(":ImmediateParent%s", evaluator);
112 |         }
113 |     }
114 | 
115 |     /** Matches an element (other than the root) when any earlier element sibling satisfies the wrapped evaluator. */
116 |     static class PreviousSibling extends StructuralEvaluator {
117 |         public PreviousSibling(Evaluator evaluator) {
118 |             this.evaluator = evaluator;
119 |         }
120 | 
121 |         @Override
122 |         public boolean matches(Element root, Element element) {
123 |             if (root == element)
124 |                 return false;
125 | 
126 |             Element prev = element.previousElementSibling();
127 | 
128 |             while (prev != null) {
129 |                 if (evaluator.matches(root, prev))
130 |                     return true;
131 | 
132 |                 prev = prev.previousElementSibling();
133 |             }
134 |             return false;
135 |         }
136 | 
137 |         @Override
138 |         public String toString() {
139 |             return String.format(":prev*%s", evaluator);
140 |         }
141 |     }
142 | 
143 |     /** Matches an element (other than the root) whose directly preceding element sibling satisfies the wrapped evaluator. */
144 |     static class ImmediatePreviousSibling extends StructuralEvaluator {
145 |         public ImmediatePreviousSibling(Evaluator evaluator) {
146 |             this.evaluator = evaluator;
147 |         }
148 | 
149 |         @Override
150 |         public boolean matches(Element root, Element element) {
151 |             if (root == element)
152 |                 return false;
153 | 
154 |             Element prev = element.previousElementSibling();
155 |             return prev != null && evaluator.matches(root, prev);
156 |         }
157 | 
158 |         @Override
159 |         public String toString() {
160 |             return String.format(":prev%s", evaluator);
161 |         }
162 |     }
163 | }
164 | 


--------------------------------------------------------------------------------
/src/main/java/org/jsoup/select/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  Packages to support the CSS-style element selector.
3 |  */
4 | package org.jsoup.select;


--------------------------------------------------------------------------------
/src/main/javadoc/overview.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |     <title>jsoup Javadoc overview</title>
 5 | </head>
 6 | <body>
 7 | <h1>jsoup: Java HTML parser that makes sense of real-world HTML soup.</h1>
 8 | 
 9 | <p><b>jsoup</b> is a Java library for working with real-world HTML. It provides a very convenient API
10 | for extracting and manipulating data, using the best of DOM, CSS, and jQuery-like methods.</p>
11 | 
12 | <p>jsoup implements the <a href="http://whatwg.org/html">WHATWG HTML</a> specification, and parses HTML to the same DOM
13 | as modern browsers do.</p>
14 | 
15 | <ul>
16 | <li>parse HTML from a URL, file, or string
17 | <li>find and extract data, using DOM traversal or CSS selectors
18 | <li>manipulate the HTML elements, attributes, and text
19 | <li>clean user-submitted content against a safe white-list, to prevent XSS
20 | <li>output tidy HTML
21 | </ul>
22 | 
23 | <p>jsoup is designed to deal with all varieties of HTML found in the wild; from pristine and validating,
24 | to invalid tag-soup; jsoup will create a sensible parse tree.</p>
25 | 
26 | <p>See <a href="https://jsoup.org/"><b>jsoup.org</b></a> for downloads, documentation, and examples...</p>
27 | 
28 | @author <a href="http://jonathanhedley.com/">Jonathan Hedley</a>
29 | 
30 | </body>
31 | </html>
32 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/TextUtil.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup;
 2 | 
 3 | /**
 4 |  Text utils to ease testing
 5 | 
 6 |  @author Jonathan Hedley, jonathan@hedley.net */
 7 | public class TextUtil {
 8 |     /** A line break as rendered by the {@code %n} format specifier, i.e. the platform's line separator. */
 9 |     public static final String LE = String.format("%n");
10 | 
11 |     /**
12 |      Removes every line break from the text, together with any whitespace that directly follows it.
13 |      Both Unix (LF) and Windows (CRLF) line endings are recognised.
14 |      @param text the text to flatten; must not be null
15 |      @return the text with line breaks, and the indentation after them, removed
16 |      */
17 |     public static String stripNewlines(String text) {
18 |         // The optional \r makes a Windows line ending disappear as a unit instead of leaving a stray CR behind.
19 |         return text.replaceAll("\\r?\\n\\s*", "");
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/helper/DataUtilTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.helper;
  2 | 
  3 | import java.io.File;
  4 | import java.io.IOException;
  5 | import java.io.UnsupportedEncodingException;
  6 | 
  7 | import org.jsoup.Jsoup;
  8 | import org.jsoup.nodes.Document;
  9 | import org.jsoup.parser.Parser;
 10 | import org.junit.Test;
 11 | 
 12 | import java.nio.ByteBuffer;
 13 | import java.nio.charset.Charset;
 14 | 
 15 | import static org.jsoup.integration.ParseTest.getFile;
 16 | import static org.junit.Assert.*;
 17 | 
 18 | public class DataUtilTest { // unit tests for DataUtil: charset detection from Content-Type values, byte order mark handling, MIME boundaries
 19 |     @Test
 20 |     public void testCharset() {
 21 |         assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html;charset=utf-8 ")); // trailing space is not part of the result
 22 |         assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset=UTF-8"));
 23 |         assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1"));
 24 |         assertEquals(null, DataUtil.getCharsetFromContentType("text/html"));
 25 |         assertEquals(null, DataUtil.getCharsetFromContentType(null));
 26 |         assertEquals(null, DataUtil.getCharsetFromContentType("text/html;charset=Unknown"));
 27 |     }
 28 | 
 29 |     @Test public void testQuotedCharset() { // double- and single-quoted charset values are returned without the quotes
 30 |         assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html; charset=\"utf-8\""));
 31 |         assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html;charset=\"UTF-8\""));
 32 |         assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=\"ISO-8859-1\""));
 33 |         assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset=\"Unsupported\""));
 34 |         assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset='UTF-8'"));
 35 |     }
 36 | 
 37 |     @Test public void discardsSpuriousByteOrderMark() {
 38 |         String html = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>"; // \uFEFF is the byte order mark
 39 |         ByteBuffer buffer = Charset.forName("UTF-8").encode(html);
 40 |         Document doc = DataUtil.parseByteData(buffer, "UTF-8", "http://foo.com/", Parser.htmlParser());
 41 |         assertEquals("One", doc.head().text());
 42 |     }
 43 | 
 44 |     @Test public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() {
 45 |         String html = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
 46 |         ByteBuffer buffer = Charset.forName("UTF-8").encode(html);
 47 |         Document doc = DataUtil.parseByteData(buffer, null, "http://foo.com/", Parser.htmlParser()); // null charset: left to detection
 48 |         assertEquals("One", doc.head().text());
 49 |         assertEquals("UTF-8", doc.outputSettings().charset().displayName());
 50 |     }
 51 | 
 52 |     @Test
 53 |     public void shouldNotThrowExceptionOnEmptyCharset() {
 54 |         assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset="));
 55 |         assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset=;"));
 56 |     }
 57 | 
 58 |     @Test
 59 |     public void shouldSelectFirstCharsetOnWeirdMultileCharsetsInMetaTags() {
 60 |         assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1, charset=1251"));
 61 |     }
 62 | 
 63 |     @Test
 64 |     public void shouldCorrectCharsetForDuplicateCharsetString() {
 65 |         assertEquals("iso-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=charset=iso-8859-1"));
 66 |     }
 67 | 
 68 |     @Test
 69 |     public void shouldReturnNullForIllegalCharsetNames() {
 70 |         assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset=$HJKDF§$/("));
 71 |     }
 72 | 
 73 |     @Test
 74 |     public void generatesMimeBoundaries() {
 75 |         String m1 = DataUtil.mimeBoundary();
 76 |         String m2 = DataUtil.mimeBoundary();
 77 | 
 78 |         assertEquals(DataUtil.boundaryLength, m1.length());
 79 |         assertEquals(DataUtil.boundaryLength, m2.length());
 80 |         assertFalse(m1.equals(m2)); // compare contents: a reference check would pass even for two boundaries with identical text
 81 |     }
 82 | 
 83 |     @Test
 84 |     public void wrongMetaCharsetFallback() {
 85 |         try {
 86 |             final byte[] input = "<html><head><meta charset=iso-8></head><body></body></html>".getBytes("UTF-8"); // "iso-8" is presumably not a supported charset name
 87 |             final ByteBuffer inBuffer = ByteBuffer.wrap(input);
 88 | 
 89 |             Document doc = DataUtil.parseByteData(inBuffer, null, "http://example.com", Parser.htmlParser());
 90 | 
 91 |             final String expected = "<html>\n" +
 92 |                                     " <head>\n" +
 93 |                                     "  <meta charset=\"iso-8\">\n" +
 94 |                                     " </head>\n" +
 95 |                                     " <body></body>\n" +
 96 |                                     "</html>";
 97 | 
 98 |             assertEquals(expected, doc.toString()); // parsing is expected to succeed and keep the meta tag as written
 99 |         } catch( UnsupportedEncodingException ex ) {
100 |             fail(ex.getMessage());
101 |         }
102 |     }
103 | 
104 |     @Test
105 |     public void supportsBOMinFiles() throws IOException {
106 |         // test files from http://www.i18nl10n.com/korean/utftest/
107 |         File in = getFile("/bomtests/bom_utf16be.html");
108 |         Document doc = Jsoup.parse(in, null, "http://example.com"); // null charset in every case below
109 |         assertTrue(doc.title().contains("UTF-16BE"));
110 |         assertTrue(doc.text().contains("가각갂갃간갅"));
111 | 
112 |         in = getFile("/bomtests/bom_utf16le.html");
113 |         doc = Jsoup.parse(in, null, "http://example.com");
114 |         assertTrue(doc.title().contains("UTF-16LE"));
115 |         assertTrue(doc.text().contains("가각갂갃간갅"));
116 | 
117 |         in = getFile("/bomtests/bom_utf32be.html");
118 |         doc = Jsoup.parse(in, null, "http://example.com");
119 |         assertTrue(doc.title().contains("UTF-32BE"));
120 |         assertTrue(doc.text().contains("가각갂갃간갅"));
121 | 
122 |         in = getFile("/bomtests/bom_utf32le.html");
123 |         doc = Jsoup.parse(in, null, "http://example.com");
124 |         assertTrue(doc.title().contains("UTF-32LE"));
125 |         assertTrue(doc.text().contains("가각갂갃간갅"));
126 |     }
127 | }
128 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/helper/HttpConnectionTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.helper;
  2 | 
  3 | import static org.junit.Assert.*;
  4 | 
  5 | import org.jsoup.integration.ParseTest;
  6 | import org.junit.Test;
  7 | import org.jsoup.Connection;
  8 | 
  9 | import java.io.IOException;
 10 | import java.util.*;
 11 | import java.net.URL;
 12 | import java.net.MalformedURLException;
 13 | 
 14 | public class HttpConnectionTest { // unit tests for HttpConnection request/response state: headers, cookies, form data and settings
 15 |     /* most actual network http connection tests are in integration */
 16 | 
 17 |     @Test(expected=IllegalArgumentException.class) public void throwsExceptionOnParseWithoutExecute() throws IOException {
 18 |         Connection con = HttpConnection.connect("http://example.com");
 19 |         con.response().parse(); // nothing was executed, so reading the response is expected to throw
 20 |     }
 21 | 
 22 |     @Test(expected=IllegalArgumentException.class) public void throwsExceptionOnBodyWithoutExecute() throws IOException {
 23 |         Connection con = HttpConnection.connect("http://example.com");
 24 |         con.response().body();
 25 |     }
 26 | 
 27 |     @Test(expected=IllegalArgumentException.class) public void throwsExceptionOnBodyAsBytesWithoutExecute() throws IOException {
 28 |         Connection con = HttpConnection.connect("http://example.com");
 29 |         con.response().bodyAsBytes();
 30 |     }
 31 | 
 32 |     @Test public void caseInsensitiveHeaders() {
 33 |         Connection.Response res = new HttpConnection.Response();
 34 |         Map<String, String> headers = res.headers();
 35 |         headers.put("Accept-Encoding", "gzip");
 36 |         headers.put("content-type", "text/html");
 37 |         headers.put("refErrer", "http://example.com"); // deliberately odd casing; read back below as "Referrer"
 38 | 
 39 |         assertTrue(res.hasHeader("Accept-Encoding"));
 40 |         assertTrue(res.hasHeader("accept-encoding"));
 41 |         assertTrue(res.hasHeader("accept-Encoding"));
 42 | 
 43 |         assertEquals("gzip", res.header("accept-Encoding"));
 44 |         assertEquals("text/html", res.header("Content-Type"));
 45 |         assertEquals("http://example.com", res.header("Referrer"));
 46 | 
 47 |         res.removeHeader("Content-Type"); // removal must also ignore case: the header was stored as "content-type"
 48 |         assertFalse(res.hasHeader("content-type"));
 49 | 
 50 |         res.header("accept-encoding", "deflate"); // overwrites the value stored under "Accept-Encoding"
 51 |         assertEquals("deflate", res.header("Accept-Encoding"));
 52 |         assertEquals("deflate", res.header("accept-Encoding"));
 53 |     }
 54 | 
 55 |     @Test public void headers() {
 56 |         Connection con = HttpConnection.connect("http://example.com");
 57 |         Map<String, String> headers = new HashMap<String, String>();
 58 |         headers.put("content-type", "text/html");
 59 |         headers.put("Connection", "keep-alive");
 60 |         headers.put("Host", "http://example.com");
 61 |         con.headers(headers); // bulk setter: each entry should be readable from the request afterwards
 62 |         assertEquals("text/html", con.request().header("content-type"));
 63 |         assertEquals("keep-alive", con.request().header("Connection"));
 64 |         assertEquals("http://example.com", con.request().header("Host"));
 65 |     }
 66 | 
 67 |     @Test public void sameHeadersCombineWithComma() {
 68 |         Map<String, List<String>> headers = new HashMap<String, List<String>>();
 69 |         List<String> values = new ArrayList<String>();
 70 |         values.add("no-cache");
 71 |         values.add("no-store");
 72 |         headers.put("Cache-Control", values);
 73 |         HttpConnection.Response res = new HttpConnection.Response();
 74 |         res.processResponseHeaders(headers);
 75 |         assertEquals("no-cache, no-store", res.header("Cache-Control")); // multiple values of one header are joined with ", "
 76 |     }
 77 | 
 78 |     @Test public void ignoresEmptySetCookies() {
 79 |         // prep http response header map
 80 |         Map<String, List<String>> headers = new HashMap<String, List<String>>();
 81 |         headers.put("Set-Cookie", Collections.<String>emptyList());
 82 |         HttpConnection.Response res = new HttpConnection.Response();
 83 |         res.processResponseHeaders(headers);
 84 |         assertEquals(0, res.cookies().size());
 85 |     }
 86 | 
 87 |     @Test public void ignoresEmptyCookieNameAndVals() {
 88 |         // prep http response header map
 89 |         Map<String, List<String>> headers = new HashMap<String, List<String>>();
 90 |         List<String> cookieStrings = new ArrayList<String>();
 91 |         cookieStrings.add(null);
 92 |         cookieStrings.add("");
 93 |         cookieStrings.add("one");
 94 |         cookieStrings.add("two=");
 95 |         cookieStrings.add("three=;");
 96 |         cookieStrings.add("four=data; Domain=.example.com; Path=/");
 97 | 
 98 |         headers.put("Set-Cookie", cookieStrings);
 99 |         HttpConnection.Response res = new HttpConnection.Response();
100 |         res.processResponseHeaders(headers);
101 |         assertEquals(4, res.cookies().size()); // the null and "" entries are dropped; one, two, three and four remain
102 |         assertEquals("", res.cookie("one"));
103 |         assertEquals("", res.cookie("two"));
104 |         assertEquals("", res.cookie("three"));
105 |         assertEquals("data", res.cookie("four")); // attributes after the first ';' are not part of the value
106 |     }
107 | 
108 |     @Test public void connectWithUrl() throws MalformedURLException {
109 |         Connection con = HttpConnection.connect(new URL("http://example.com"));
110 |         assertEquals("http://example.com", con.request().url().toExternalForm());
111 |     }
112 | 
113 |     @Test(expected=IllegalArgumentException.class) public void throwsOnMalformedUrl() {
114 |         Connection con = HttpConnection.connect("bzzt"); // result unused: connect() itself is expected to throw
115 |     }
116 | 
117 |     @Test public void userAgent() {
118 |         Connection con = HttpConnection.connect("http://example.com/");
119 |         assertEquals(HttpConnection.DEFAULT_UA, con.request().header("User-Agent")); // default is present before any explicit setting
120 |         con.userAgent("Mozilla");
121 |         assertEquals("Mozilla", con.request().header("User-Agent"));
122 |     }
123 | 
124 |     @Test public void timeout() {
125 |         Connection con = HttpConnection.connect("http://example.com/");
126 |         assertEquals(30 * 1000, con.request().timeout()); // default value; presumably milliseconds (30 seconds) - confirm against Connection
127 |         con.timeout(1000);
128 |         assertEquals(1000, con.request().timeout());
129 |     }
130 | 
131 |     @Test public void referrer() {
132 |         Connection con = HttpConnection.connect("http://example.com/");
133 |         con.referrer("http://foo.com");
134 |         assertEquals("http://foo.com", con.request().header("Referer")); // read back under the HTTP header spelling "Referer"
135 |     }
136 | 
137 |     @Test public void method() {
138 |         Connection con = HttpConnection.connect("http://example.com/");
139 |         assertEquals(Connection.Method.GET, con.request().method()); // GET is the default method
140 |         con.method(Connection.Method.POST);
141 |         assertEquals(Connection.Method.POST, con.request().method());
142 |     }
143 | 
144 |     @Test(expected=IllegalArgumentException.class) public void throwsOnOddData() {
145 |         Connection con = HttpConnection.connect("http://example.com/");
146 |         con.data("Name", "val", "what"); // three arguments cannot form key/value pairs
147 |     }
148 | 
149 |     @Test public void data() {
150 |         Connection con = HttpConnection.connect("http://example.com/");
151 |         con.data("Name", "Val", "Foo", "bar"); // varargs read as key, value, key, value
152 |         Collection<Connection.KeyVal> values = con.request().data();
153 |         Object[] data =  values.toArray(); // the index checks below rely on insertion order being kept
154 |         Connection.KeyVal one = (Connection.KeyVal) data[0];
155 |         Connection.KeyVal two = (Connection.KeyVal) data[1];
156 |         assertEquals("Name", one.key());
157 |         assertEquals("Val", one.value());
158 |         assertEquals("Foo", two.key());
159 |         assertEquals("bar", two.value());
160 |     }
161 | 
162 |     @Test public void cookie() {
163 |         Connection con = HttpConnection.connect("http://example.com/");
164 |         con.cookie("Name", "Val");
165 |         assertEquals("Val", con.request().cookie("Name"));
166 |     }
167 | 
168 |     @Test public void inputStream() {
169 |         Connection.KeyVal kv = HttpConnection.KeyVal.create("file", "thumb.jpg", ParseTest.inputStreamFrom("Check")); // three-argument form carries a stream
170 |         assertEquals("file", kv.key());
171 |         assertEquals("thumb.jpg", kv.value());
172 |         assertTrue(kv.hasInputStream());
173 | 
174 |         kv = HttpConnection.KeyVal.create("one", "two"); // two-argument form: plain key/value, no stream
175 |         assertEquals("one", kv.key());
176 |         assertEquals("two", kv.value());
177 |         assertFalse(kv.hasInputStream());
178 |     }
179 | 
180 |     @Test public void requestBody() {
181 |         Connection con = HttpConnection.connect("http://example.com/");
182 |         con.requestBody("foo");
183 |         assertEquals("foo", con.request().requestBody());
184 |     }
185 | }
186 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/helper/StringUtilTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.helper;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.junit.Test;
 5 | 
 6 | import java.util.Arrays;
 7 | 
 8 | import static org.jsoup.helper.StringUtil.*;
 9 | import static org.junit.Assert.assertEquals;
10 | import static org.junit.Assert.assertFalse;
11 | import static org.junit.Assert.assertTrue;
12 | 
13 | public class StringUtilTest { // unit tests for the StringUtil helpers: joining, padding, blank/numeric/whitespace checks, whitespace normalisation, URL resolution
14 | 
15 |     @Test public void join() {
16 |         assertEquals("", StringUtil.join(Arrays.asList(""), " ")); // must be qualified: inside this class the simple name join denotes this test method, shadowing the static import
17 |         assertEquals("one", StringUtil.join(Arrays.asList("one"), " "));
18 |         assertEquals("one two three", StringUtil.join(Arrays.asList("one", "two", "three"), " "));
19 |     }
20 | 
21 |     @Test public void padding() {
22 |         assertEquals("", StringUtil.padding(0));
23 |         assertEquals(" ", StringUtil.padding(1));
24 |         assertEquals("  ", StringUtil.padding(2));
25 |         assertEquals("               ", StringUtil.padding(15)); // 15 spaces
26 |     }
27 | 
28 |     @Test public void isBlank() {
29 |         assertTrue(StringUtil.isBlank(null)); // null counts as blank
30 |         assertTrue(StringUtil.isBlank(""));
31 |         assertTrue(StringUtil.isBlank("      "));
32 |         assertTrue(StringUtil.isBlank("   \r\n  "));
33 | 
34 |         assertFalse(StringUtil.isBlank("hello"));
35 |         assertFalse(StringUtil.isBlank("   hello   "));
36 |     }
37 | 
38 |     @Test public void isNumeric() {
39 |         assertFalse(StringUtil.isNumeric(null));
40 |         assertFalse(StringUtil.isNumeric(" "));
41 |         assertFalse(StringUtil.isNumeric("123 546"));
42 |         assertFalse(StringUtil.isNumeric("hello"));
43 |         assertFalse(StringUtil.isNumeric("123.334")); // a decimal point is not accepted: digits only
44 | 
45 |         assertTrue(StringUtil.isNumeric("1"));
46 |         assertTrue(StringUtil.isNumeric("1234"));
47 |     }
48 | 
49 |     @Test public void isWhitespace() {
50 |         assertTrue(StringUtil.isWhitespace('\t'));
51 |         assertTrue(StringUtil.isWhitespace('\n'));
52 |         assertTrue(StringUtil.isWhitespace('\r'));
53 |         assertTrue(StringUtil.isWhitespace('\f'));
54 |         assertTrue(StringUtil.isWhitespace(' '));
55 | 
56 |         assertFalse(StringUtil.isWhitespace('\u00a0')); // no-break space and the other non-ASCII space characters below are not counted
57 |         assertFalse(StringUtil.isWhitespace('\u2000'));
58 |         assertFalse(StringUtil.isWhitespace('\u3000'));
59 |     }
60 | 
61 |     @Test public void normaliseWhiteSpace() {
62 |         assertEquals(" ", normaliseWhitespace("    \r \n \r\n")); // each whitespace run collapses to one space; the ends are not trimmed
63 |         assertEquals(" hello there ", normaliseWhitespace("   hello   \r \n  there    \n"));
64 |         assertEquals("hello", normaliseWhitespace("hello"));
65 |         assertEquals("hello there", normaliseWhitespace("hello\nthere"));
66 |     }
67 | 
68 |     @Test public void normaliseWhiteSpaceHandlesHighSurrogates() {
69 |         String test71540chars = "\ud869\udeb2\u304b\u309a  1"; // begins with a surrogate pair; two spaces precede the "1"
70 |         String test71540charsExpectedSingleWhitespace = "\ud869\udeb2\u304b\u309a 1";
71 | 
72 |         assertEquals(test71540charsExpectedSingleWhitespace, normaliseWhitespace(test71540chars));
73 |         String extractedText = Jsoup.parse(test71540chars).text(); // the same normalisation is expected through the parser's text()
74 |         assertEquals(test71540charsExpectedSingleWhitespace, extractedText);
75 |     }
76 | 
77 |     @Test public void resolvesRelativeUrls() { // resolve(base, relative) returns the absolute URL as a string
78 |         assertEquals("http://example.com/one/two?three", resolve("http://example.com", "./one/two?three"));
79 |         assertEquals("http://example.com/one/two?three", resolve("http://example.com?one", "./one/two?three"));
80 |         assertEquals("http://example.com/one/two?three#four", resolve("http://example.com", "./one/two?three#four"));
81 |         assertEquals("https://example.com/one", resolve("http://example.com/", "https://example.com/one"));
82 |         assertEquals("http://example.com/one/two.html", resolve("http://example.com/two/", "../one/two.html"));
83 |         assertEquals("https://example2.com/one", resolve("https://example.com/", "//example2.com/one")); // protocol-relative: takes the base's scheme
84 |         assertEquals("https://example.com:8080/one", resolve("https://example.com:8080", "./one"));
85 |         assertEquals("https://example2.com/one", resolve("http://example.com/", "https://example2.com/one"));
86 |         assertEquals("https://example.com/one", resolve("wrong", "https://example.com/one")); // unusable base, but the second argument is already absolute
87 |         assertEquals("https://example.com/one", resolve("https://example.com/one", ""));
88 |         assertEquals("", resolve("wrong", "also wrong")); // neither argument is usable: empty result
89 |         assertEquals("ftp://example.com/one", resolve("ftp://example.com/two/", "../one"));
90 |         assertEquals("ftp://example.com/one/two.c", resolve("ftp://example.com/one/", "./two.c"));
91 |         assertEquals("ftp://example.com/one/two.c", resolve("ftp://example.com/one/", "two.c"));
92 |     }
93 | }
94 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/helper/W3CDomTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.helper;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.jsoup.integration.ParseTest;
 5 | import org.jsoup.nodes.Element;
 6 | import org.junit.Test;
 7 | import org.w3c.dom.Document;
 8 | import org.w3c.dom.Node;
 9 | 
10 | import java.io.File;
11 | import java.io.IOException;
12 | 
13 | import static org.jsoup.TextUtil.LE;
14 | import static org.junit.Assert.assertEquals;
15 | import static org.junit.Assert.assertTrue;
16 | 
17 | public class W3CDomTest {
18 |     @Test
19 |     public void simpleConversion() { // converts a parsed jsoup document to a W3C DOM and checks its serialised form
20 |         String html = "<html><head><title>W3c</title></head><body><p class='one' id=12>Text</p><!-- comment --><invalid>What<script>alert('!')"; // <invalid> and <script> are left unclosed on purpose
21 |         org.jsoup.nodes.Document doc = Jsoup.parse(html);
22 | 
23 |         W3CDom w3c = new W3CDom();
24 |         Document wDoc = w3c.fromJsoup(doc);
25 |         String out = w3c.asString(wDoc);
26 |         assertEquals(
27 |                 "<html>" + LE +
28 |                         "<head>" + LE +
29 |                         "<META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">" + LE + // not in the input: expected to appear in the serialised output
30 |                         "<title>W3c</title>" + LE +
31 |                         "</head>" + LE +
32 |                         "<body>" + LE +
33 |                         "<p class=\"one\" id=\"12\">Text</p>" + LE +
34 |                         "<!-- comment -->" + LE +
35 |                         "<invalid>What<script>alert('!')</script>" + LE + // the unclosed tags come out closed
36 |                         "</invalid>" + LE +
37 |                         "</body>" + LE +
38 |                         "</html>" + LE
39 |                 , out);
40 |     }
41 | 
42 |     @Test
43 |     public void convertsGoogle() throws IOException { // converts a larger HTML test fixture and checks the root element and output
44 |         File in = ParseTest.getFile("/htmltests/google-ipod.html");
45 |         org.jsoup.nodes.Document doc = Jsoup.parse(in, "UTF8");
46 | 
47 |         W3CDom w3c = new W3CDom();
48 |         Document wDoc = w3c.fromJsoup(doc);
49 |         Node htmlEl = wDoc.getChildNodes().item(0);
50 |         assertEquals(null, htmlEl.getNamespaceURI()); // no namespace URI is expected on the root element of this fixture
51 |         assertEquals("html", htmlEl.getLocalName());
52 |         assertEquals("html", htmlEl.getNodeName());
53 | 
54 |         String out = w3c.asString(wDoc);
55 |         assertTrue(out.contains("ipod")); // loose check that page content survives serialisation
56 |     }
57 | 
58 |     @Test
59 |     public void namespacePreservation() throws IOException {
60 |         File in = ParseTest.getFile("/htmltests/namespaces.xhtml");
61 |         org.jsoup.nodes.Document jsoupDoc;
62 |         jsoupDoc = Jsoup.parse(in, "UTF-8");
63 | 
64 |         Document doc;
65 |         org.jsoup.helper.W3CDom jDom = new org.jsoup.helper.W3CDom();
66 |         doc = jDom.fromJsoup(jsoupDoc);
67 | 
68 |         Node htmlEl = doc.getChildNodes().item(0);
69 |         assertEquals("http://www.w3.org/1999/xhtml", htmlEl.getNamespaceURI());
70 |         assertEquals("html", htmlEl.getLocalName());
71 |         assertEquals("html", htmlEl.getNodeName());
72 | 
73 |         Node epubTitle = htmlEl.getChildNodes().item(2).getChildNodes().item(3);
74 |         assertEquals("http://www.idpf.org/2007/ops", epubTitle.getNamespaceURI());
75 |         assertEquals("title", epubTitle.getLocalName());
76 |         assertEquals("epub:title", epubTitle.getNodeName());
77 | 
78 |         Node xSection = epubTitle.getNextSibling().getNextSibling();
79 |         assertEquals("urn:test", xSection.getNamespaceURI());
80 |         assertEquals("section", xSection.getLocalName());
81 |         assertEquals("x:section", xSection.getNodeName());
82 |     }
83 | 
84 |     @Test
85 |     public void handlesInvalidAttributeNames() {
86 |         String html = "<html><head></head><body style=\"color: red\" \" name\"></body></html>";
87 |         org.jsoup.nodes.Document jsoupDoc;
88 |         jsoupDoc = Jsoup.parse(html);
89 |         Element body = jsoupDoc.select("body").first();
90 |         assertTrue(body.hasAttr("\"")); // actually an attribute with key '"'. Correct per HTML5 spec, but w3c xml dom doesn't dig it
91 |         assertTrue(body.hasAttr("name\""));
92 | 
93 |         Document w3Doc = new W3CDom().fromJsoup(jsoupDoc);
94 |     }
95 | }
96 | 
97 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/integration/Benchmark.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.integration;
 2 | 
 3 | import java.util.Date;
 4 | 
 5 | /**
 6 |  Does an A/B test on two methods, and prints out how long each took.
 7 | 
 8 |  @author Jonathan Hedley, jonathan@hedley.net */
 9 | public class Benchmark {
10 |     public static void run(Runnable a, Runnable b, int count) {
11 |         long aMillis;
12 |         long bMillis;
13 | 
14 |         print("Running test A (x%d)", count);
15 |         aMillis = time(a, count);
16 |         print("Running test B");
17 |         bMillis = time(b, count);
18 | 
19 |         print("\nResults:");
20 |         print("A: %.2fs", aMillis / 1000f);
21 |         print("B: %.2fs", bMillis / 1000f);
22 |         print("\nB ran in %.2f %% time of A\n", (bMillis *1f / aMillis * 1f) * 100f);
23 |     }
24 | 
25 |     private static long time(Runnable test, int count) {
26 |         Date start = new Date();
27 |         for (int i = 0; i < count; i++) {
28 |             test.run();
29 |         }
30 |         Date end = new Date();
31 |         return end.getTime() - start.getTime();
32 |     }
33 | 
34 |     private static void print(String msgFormat, Object... msgParams) {
35 |         System.out.println(String.format(msgFormat, msgParams));
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/integration/ParseTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.integration;
  2 | 
  3 | import org.jsoup.Jsoup;
  4 | import org.jsoup.nodes.Document;
  5 | import org.jsoup.nodes.Element;
  6 | import org.jsoup.select.Elements;
  7 | import org.junit.Test;
  8 | 
  9 | import java.io.*;
 10 | import java.net.URISyntaxException;
 11 | 
 12 | import static org.junit.Assert.*;
 13 | 
 14 | /**
 15 |  * Integration test: parses from real-world example HTML.
 16 |  *
 17 |  * @author Jonathan Hedley, jonathan@hedley.net
 18 |  */
 19 | public class ParseTest {
 20 | 
 21 |     @Test
 22 |     public void testSmhBizArticle() throws IOException {
 23 |         File in = getFile("/htmltests/smh-biz-article-1.html");
 24 |         Document doc = Jsoup.parse(in, "UTF-8",
 25 |                 "http://www.smh.com.au/business/the-boards-next-fear-the-female-quota-20100106-lteq.html");
 26 |         assertEquals("The board’s next fear: the female quota",
 27 |                 doc.title()); // note that the apos in the source is a literal ’ (8217), not escaped or '
 28 |         assertEquals("en", doc.select("html").attr("xml:lang"));
 29 | 
 30 |         Elements articleBody = doc.select(".articleBody > *");
 31 |         assertEquals(17, articleBody.size());
 32 |         // todo: more tests!
 33 | 
 34 |     }
 35 | 
 36 |     @Test
 37 |     public void testNewsHomepage() throws IOException {
 38 |         File in = getFile("/htmltests/news-com-au-home.html");
 39 |         Document doc = Jsoup.parse(in, "UTF-8", "http://www.news.com.au/");
 40 |         assertEquals("News.com.au | News from Australia and around the world online | NewsComAu", doc.title());
 41 |         assertEquals("Brace yourself for Metro meltdown", doc.select(".id1225817868581 h4").text().trim());
 42 | 
 43 |         Element a = doc.select("a[href=/entertainment/horoscopes]").first();
 44 |         assertEquals("/entertainment/horoscopes", a.attr("href"));
 45 |         assertEquals("http://www.news.com.au/entertainment/horoscopes", a.attr("abs:href"));
 46 | 
 47 |         Element hs = doc.select("a[href*=naughty-corners-are-a-bad-idea]").first();
 48 |         assertEquals(
 49 |                 "http://www.heraldsun.com.au/news/naughty-corners-are-a-bad-idea-for-kids/story-e6frf7jo-1225817899003",
 50 |                 hs.attr("href"));
 51 |         assertEquals(hs.attr("href"), hs.attr("abs:href"));
 52 |     }
 53 | 
 54 |     @Test
 55 |     public void testGoogleSearchIpod() throws IOException {
 56 |         File in = getFile("/htmltests/google-ipod.html");
 57 |         Document doc = Jsoup.parse(in, "UTF-8", "http://www.google.com/search?hl=en&q=ipod&aq=f&oq=&aqi=g10");
 58 |         assertEquals("ipod - Google Search", doc.title());
 59 |         Elements results = doc.select("h3.r > a");
 60 |         assertEquals(12, results.size());
 61 |         assertEquals(
 62 |                 "http://news.google.com/news?hl=en&q=ipod&um=1&ie=UTF-8&ei=uYlKS4SbBoGg6gPf-5XXCw&sa=X&oi=news_group&ct=title&resnum=1&ved=0CCIQsQQwAA",
 63 |                 results.get(0).attr("href"));
 64 |         assertEquals("http://www.apple.com/itunes/",
 65 |                 results.get(1).attr("href"));
 66 |     }
 67 | 
 68 |     @Test
 69 |     public void testBinary() throws IOException {
 70 |         File in = getFile("/htmltests/thumb.jpg");
 71 |         Document doc = Jsoup.parse(in, "UTF-8");
 72 |         // nothing useful, but did not blow up
 73 |         assertTrue(doc.text().contains("gd-jpeg"));
 74 |     }
 75 | 
 76 |     @Test
 77 |     public void testYahooJp() throws IOException {
 78 |         File in = getFile("/htmltests/yahoo-jp.html");
 79 |         Document doc = Jsoup.parse(in, "UTF-8", "http://www.yahoo.co.jp/index.html"); // http charset is utf-8.
 80 |         assertEquals("Yahoo! JAPAN", doc.title());
 81 |         Element a = doc.select("a[href=t/2322m2]").first();
 82 |         assertEquals("http://www.yahoo.co.jp/_ylh=X3oDMTB0NWxnaGxsBF9TAzIwNzcyOTYyNjUEdGlkAzEyBHRtcGwDZ2Ex/t/2322m2",
 83 |                 a.attr("abs:href")); // session put into <base>
 84 |         assertEquals("全国、人気の駅ランキング", a.text());
 85 |     }
 86 | 
 87 |     @Test
 88 |     public void testBaidu() throws IOException {
 89 |         // tests <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
 90 |         File in = getFile("/htmltests/baidu-cn-home.html");
 91 |         Document doc = Jsoup.parse(in, null,
 92 |                 "http://www.baidu.com/"); // http charset is gb2312, but NOT specifying it, to test http-equiv parse
 93 |         Element submit = doc.select("#su").first();
 94 |         assertEquals("百度一下", submit.attr("value"));
 95 | 
 96 |         // test from attribute match
 97 |         submit = doc.select("input[value=百度一下]").first();
 98 |         assertEquals("su", submit.id());
 99 |         Element newsLink = doc.select("a:contains(新)").first();
100 |         assertEquals("http://news.baidu.com", newsLink.absUrl("href"));
101 | 
102 |         // check auto-detect from meta
103 |         assertEquals("GB2312", doc.outputSettings().charset().displayName());
104 |         assertEquals("<title>百度一下，你就知道      </title>", doc.select("title").outerHtml());
105 | 
106 |         doc.outputSettings().charset("ascii");
107 |         assertEquals("<title>&#x767e;&#x5ea6;&#x4e00;&#x4e0b;&#xff0c;&#x4f60;&#x5c31;&#x77e5;&#x9053;      </title>",
108 |                 doc.select("title").outerHtml());
109 |     }
110 | 
111 |     @Test
112 |     public void testBaiduVariant() throws IOException {
113 |         // tests <meta charset> when preceded by another <meta>
114 |         File in = getFile("/htmltests/baidu-variant.html");
115 |         Document doc = Jsoup.parse(in, null,
116 |                 "http://www.baidu.com/"); // http charset is gb2312, but NOT specifying it, to test http-equiv parse
117 |         // check auto-detect from meta
118 |         assertEquals("GB2312", doc.outputSettings().charset().displayName());
119 |         assertEquals("<title>百度一下，你就知道</title>", doc.select("title").outerHtml());
120 |     }
121 | 
122 |     @Test
123 |     public void testHtml5Charset() throws IOException {
124 |         // test that <meta charset="gb2312"> works
125 |         File in = getFile("/htmltests/meta-charset-1.html");
126 |         Document doc = Jsoup.parse(in, null, "http://example.com/"); //gb2312, has html5 <meta charset>
127 |         assertEquals("新", doc.text());
128 |         assertEquals("GB2312", doc.outputSettings().charset().displayName());
129 | 
130 |         // double check, no charset, falls back to utf8 which is incorrect
131 |         in = getFile("/htmltests/meta-charset-2.html"); //
132 |         doc = Jsoup.parse(in, null, "http://example.com"); // gb2312, no charset
133 |         assertEquals("UTF-8", doc.outputSettings().charset().displayName());
134 |         assertFalse("新".equals(doc.text()));
135 | 
136 |         // confirm fallback to utf8
137 |         in = getFile("/htmltests/meta-charset-3.html");
138 |         doc = Jsoup.parse(in, null, "http://example.com/"); // utf8, no charset
139 |         assertEquals("UTF-8", doc.outputSettings().charset().displayName());
140 |         assertEquals("新", doc.text());
141 |     }
142 | 
143 |     @Test
144 |     public void testBrokenHtml5CharsetWithASingleDoubleQuote() throws IOException {
145 |         InputStream in = inputStreamFrom("<html>\n" +
146 |                 "<head><meta charset=UTF-8\"></head>\n" +
147 |                 "<body></body>\n" +
148 |                 "</html>");
149 |         Document doc = Jsoup.parse(in, null, "http://example.com/");
150 |         assertEquals("UTF-8", doc.outputSettings().charset().displayName());
151 |     }
152 | 
153 |     @Test
154 |     public void testNytArticle() throws IOException {
155 |         // has tags like <nyt_text>
156 |         File in = getFile("/htmltests/nyt-article-1.html");
157 |         Document doc = Jsoup.parse(in, null, "http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp");
158 | 
159 |         Element headline = doc.select("nyt_headline[version=1.0]").first();
160 |         assertEquals("As BP Lays Out Future, It Will Not Include Hayward", headline.text());
161 |     }
162 | 
163 |     @Test
164 |     public void testYahooArticle() throws IOException {
165 |         File in = getFile("/htmltests/yahoo-article-1.html");
166 |         Document doc = Jsoup.parse(in, "UTF-8", "http://news.yahoo.com/s/nm/20100831/bs_nm/us_gm_china");
167 |         Element p = doc.select("p:contains(Volt will be sold in the United States)").first();
168 |         assertEquals("In July, GM said its electric Chevrolet Volt will be sold in the United States at $41,000 -- $8,000 more than its nearest competitor, the Nissan Leaf.", p.text());
169 |     }
170 | 
171 |     public static File getFile(String resourceName) {
172 |         try {
173 |             File file = new File(ParseTest.class.getResource(resourceName).toURI());
174 |             return file;
175 |         } catch (URISyntaxException e) {
176 |             throw new IllegalStateException(e);
177 |         }
178 |     }
179 | 
180 |     public static InputStream inputStreamFrom(String s) {
181 |         try {
182 |             return new ByteArrayInputStream(s.getBytes("UTF-8"));
183 |         } catch (UnsupportedEncodingException e) {
184 |             throw new RuntimeException(e);
185 |         }
186 |     }
187 | 
188 | }
189 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/AttributeTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import org.junit.Test;
 4 | 
 5 | import static org.junit.Assert.assertEquals;
 6 | 
 7 | public class AttributeTest {
 8 |     @Test public void html() {
 9 |         Attribute attr = new Attribute("key", "value &");
10 |         assertEquals("key=\"value &amp;\"", attr.html());
11 |         assertEquals(attr.html(), attr.toString());
12 |     }
13 | 
14 |     @Test public void testWithSupplementaryCharacterInAttributeKeyAndValue() {
15 |         String s = new String(Character.toChars(135361));
16 |         Attribute attr = new Attribute(s, "A" + s + "B");
17 |         assertEquals(s + "=\"A" + s + "B\"", attr.html());
18 |         assertEquals(attr.html(), attr.toString());
19 |     }
20 | }
21 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/AttributesTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.nodes;
  2 | 
  3 | import org.junit.Test;
  4 | 
  5 | import java.util.Iterator;
  6 | 
  7 | import static org.junit.Assert.assertEquals;
  8 | import static org.junit.Assert.assertFalse;
  9 | import static org.junit.Assert.assertTrue;
 10 | 
 11 | /**
 12 |  * Tests for Attributes.
 13 |  *
 14 |  * @author Jonathan Hedley
 15 |  */
 16 | public class AttributesTest {
 17 | 
 18 |     @Test
 19 |     public void html() {
 20 |         Attributes a = new Attributes();
 21 |         a.put("Tot", "a&p");
 22 |         a.put("Hello", "There");
 23 |         a.put("data-name", "Jsoup");
 24 | 
 25 |         assertEquals(3, a.size());
 26 |         assertTrue(a.hasKey("Tot"));
 27 |         assertTrue(a.hasKey("Hello"));
 28 |         assertTrue(a.hasKey("data-name"));
 29 |         assertFalse(a.hasKey("tot"));
 30 |         assertTrue(a.hasKeyIgnoreCase("tot"));
 31 |         assertEquals("There", a.getIgnoreCase("hEllo"));
 32 | 
 33 |         assertEquals(1, a.dataset().size());
 34 |         assertEquals("Jsoup", a.dataset().get("name"));
 35 |         assertEquals("", a.get("tot"));
 36 |         assertEquals("a&p", a.get("Tot"));
 37 |         assertEquals("a&p", a.getIgnoreCase("tot"));
 38 | 
 39 |         assertEquals(" Tot=\"a&amp;p\" Hello=\"There\" data-name=\"Jsoup\"", a.html());
 40 |         assertEquals(a.html(), a.toString());
 41 |     }
 42 | 
 43 |     @Test
 44 |     public void testIteratorRemovable() {
 45 |         Attributes a = new Attributes();
 46 |         a.put("Tot", "a&p");
 47 |         a.put("Hello", "There");
 48 |         a.put("data-name", "Jsoup");
 49 | 
 50 |         Iterator<Attribute> iterator = a.iterator();
 51 |         iterator.next();
 52 |         iterator.remove();
 53 |         assertEquals(2, a.size());
 54 |     }
 55 | 
 56 |     @Test
 57 |     public void testIterator() {
 58 |         Attributes a = new Attributes();
 59 |         String[][] datas = {{"Tot", "raul"},
 60 |             {"Hello", "pismuth"},
 61 |             {"data-name", "Jsoup"}};
 62 |         for (String[] atts : datas) {
 63 |             a.put(atts[0], atts[1]);
 64 |         }
 65 | 
 66 |         Iterator<Attribute> iterator = a.iterator();
 67 |         assertTrue(iterator.hasNext());
 68 |         int i = 0;
 69 |         for (Attribute attribute : a) {
 70 |             assertEquals(datas[i][0], attribute.getKey());
 71 |             assertEquals(datas[i][1], attribute.getValue());
 72 |             i++;
 73 |         }
 74 |         assertEquals(datas.length, i);
 75 |     }
 76 | 
 77 |     @Test
 78 |     public void testIteratorEmpty() {
 79 |         Attributes a = new Attributes();
 80 | 
 81 |         Iterator<Attribute> iterator = a.iterator();
 82 |         assertFalse(iterator.hasNext());
 83 |     }
 84 | 
 85 |     @Test
 86 |     public void removeCaseSensitive() {
 87 |         Attributes a = new Attributes();
 88 |         a.put("Tot", "a&p");
 89 |         a.put("tot", "one");
 90 |         a.put("Hello", "There");
 91 |         a.put("hello", "There");
 92 |         a.put("data-name", "Jsoup");
 93 | 
 94 |         assertEquals(5, a.size());
 95 |         a.remove("Tot");
 96 |         a.remove("Hello");
 97 |         assertEquals(3, a.size());
 98 |         assertTrue(a.hasKey("tot"));
 99 |         assertFalse(a.hasKey("Tot"));
100 |     }
101 | 
102 | }
103 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/BuildEntities.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.nodes;
  2 | 
  3 | import com.google.gson.Gson;
  4 | import com.google.gson.reflect.TypeToken;
  5 | import org.jsoup.Connection;
  6 | import org.jsoup.Jsoup;
  7 | import org.jsoup.integration.UrlConnectTest;
  8 | import org.jsoup.nodes.Entities;
  9 | 
 10 | import java.io.File;
 11 | import java.io.FileWriter;
 12 | import java.io.IOException;
 13 | import java.util.ArrayList;
 14 | import java.util.Collections;
 15 | import java.util.Comparator;
 16 | import java.util.Map;
 17 | 
 18 | /**
 19 |  * Fetches HTML entity names from w3.org json, and outputs data files for optimized used in Entities.
 20 |  * I refuse to believe that entity names like "NotNestedLessLess" are valuable or useful for HTML authors. Implemented
 21 |  * only to be complete.
 22 |  */
 23 | class BuildEntities {
 24 |     private static final String projectDir = "/Users/jhy/projects/jsoup";
 25 | 
 26 |     public static void main(String[] args) throws IOException {
 27 |         String url = "https://www.w3.org/TR/2012/WD-html5-20121025/entities.json";
 28 |         Connection.Response res = Jsoup.connect(url)
 29 |             .ignoreContentType(true)
 30 |             .userAgent(UrlConnectTest.browserUa)
 31 |             .execute();
 32 | 
 33 |         Gson gson = new Gson();
 34 |         Map<String, CharacterRef> input = gson.fromJson(res.body(),
 35 |             new TypeToken<Map<String, CharacterRef>>() {
 36 |             }.getType());
 37 | 
 38 | 
 39 |         // build name sorted base and full character lists:
 40 |         ArrayList<CharacterRef> base = new ArrayList<CharacterRef>();
 41 |         ArrayList<CharacterRef> full = new ArrayList<CharacterRef>();
 42 | 
 43 |         for (Map.Entry<String, CharacterRef> entry : input.entrySet()) {
 44 |             String name = entry.getKey().substring(1); // name is like &acute or &acute; , trim &
 45 |             CharacterRef ref = entry.getValue();
 46 |             if (name.endsWith(";")) {
 47 |                 name = name.substring(0, name.length() - 1);
 48 |                 full.add(ref);
 49 |             } else {
 50 |                 base.add(ref);
 51 |             }
 52 |             ref.name = name;
 53 |         }
 54 |         Collections.sort(base, byName);
 55 |         Collections.sort(full, byName);
 56 | 
 57 |         // now determine code point order
 58 |         ArrayList<CharacterRef> baseByCode = new ArrayList<CharacterRef>(base);
 59 |         ArrayList<CharacterRef> fullByCode = new ArrayList<CharacterRef>(full);
 60 |         Collections.sort(baseByCode, byCode);
 61 |         Collections.sort(fullByCode, byCode);
 62 | 
 63 |         // and update their codepoint index. Don't
 64 |         ArrayList<CharacterRef>[] codelists = new ArrayList[]{baseByCode, fullByCode};
 65 |         for (ArrayList<CharacterRef> codelist : codelists) {
 66 |             for (int i = 0; i < codelist.size(); i++) {
 67 |                 codelist.get(i).codeIndex = i;
 68 |             }
 69 |         }
 70 | 
 71 |         // now write them
 72 |         persist("entities-full.properties", full);
 73 |         persist("entities-base.properties", base);
 74 | 
 75 |         System.out.println("Full size: " + full.size() + ", base size: " + base.size());
 76 |     }
 77 | 
 78 |     private static void persist(String name, ArrayList<CharacterRef> refs) throws IOException {
 79 |         String base = projectDir + "/src/main/java/org/jsoup/nodes";
 80 |         File file = new File(base, name);
 81 |         FileWriter writer = new FileWriter(file, false);
 82 |         for (CharacterRef ref : refs) {
 83 |             writer.append(ref.toString()).append("\n");
 84 |         }
 85 |         writer.close();
 86 |     }
 87 | 
 88 | 
 89 |     private static class CharacterRef {
 90 |         int[] codepoints;
 91 |         String name;
 92 |         int codeIndex;
 93 | 
 94 |         @Override
 95 |         public String toString() {
 96 |             return name
 97 |                 + "="
 98 |                 + d(codepoints[0])
 99 |                 + (codepoints.length > 1 ? "," + d(codepoints[1]) : "")
100 |                 + ";" + d(codeIndex);
101 |         }
102 |     }
103 | 
104 |     private static String d(int d) {
105 |         return Integer.toString(d, Entities.codepointRadix);
106 |     }
107 | 
108 |     private static class ByName implements Comparator<CharacterRef> {
109 |         public int compare(CharacterRef o1, CharacterRef o2) {
110 |             return o1.name.compareTo(o2.name);
111 |         }
112 |     }
113 | 
114 |     private static class ByCode implements Comparator<CharacterRef> {
115 |         public int compare(CharacterRef o1, CharacterRef o2) {
116 |             int[] c1 = o1.codepoints;
117 |             int[] c2 = o2.codepoints;
118 |             int first = c1[0] - c2[0];
119 |             if (first != 0)
120 |                 return first;
121 |             if (c1.length == 1 && c2.length == 1) { // for the same code, use the shorter name
122 |                 int len = o2.name.length() - o1.name.length();
123 |                 if (len != 0)
124 |                     return len;
125 |                 return o1.name.compareTo(o2.name);
126 |             }
127 |             if (c1.length == 2 && c2.length == 2)
128 |                 return c1[1] - c2[1];
129 |             else
130 |                 return c2.length - c1.length; // pushes multi down the list so hits on singles first (don't support multi lookup by codepoint yet)
131 |         }
132 |     }
133 | 
134 |     private static ByName byName = new ByName();
135 |     private static ByCode byCode = new ByCode();
136 | }
137 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/DocumentTypeTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.jsoup.parser.Parser;
 5 | import org.junit.Test;
 6 | 
 7 | import static org.junit.Assert.*;
 8 | 
 9 | /**
10 |  * Tests for the DocumentType node
11 |  *
12 |  * @author Jonathan Hedley, http://jonathanhedley.com/
13 |  */
14 | public class DocumentTypeTest {
15 |     @Test
16 |     public void constructorValidationOkWithBlankName() {
17 |         DocumentType fail = new DocumentType("","", "", "");
18 |     }
19 | 
20 |     @Test(expected = IllegalArgumentException.class)
21 |     public void constructorValidationThrowsExceptionOnNulls() {
22 |         DocumentType fail = new DocumentType("html", null, null, "");
23 |     }
24 | 
25 |     @Test
26 |     public void constructorValidationOkWithBlankPublicAndSystemIds() {
27 |         DocumentType fail = new DocumentType("html","", "","");
28 |     }
29 | 
30 |     @Test public void outerHtmlGeneration() {
31 |         DocumentType html5 = new DocumentType("html", "", "", "");
32 |         assertEquals("<!doctype html>", html5.outerHtml());
33 | 
34 |         DocumentType publicDocType = new DocumentType("html", "-//IETF//DTD HTML//", "", "");
35 |         assertEquals("<!DOCTYPE html PUBLIC \"-//IETF//DTD HTML//\">", publicDocType.outerHtml());
36 | 
37 |         DocumentType systemDocType = new DocumentType("html", "", "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd", "");
38 |         assertEquals("<!DOCTYPE html \"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd\">", systemDocType.outerHtml());
39 | 
40 |         DocumentType combo = new DocumentType("notHtml", "--public", "--system", "");
41 |         assertEquals("<!DOCTYPE notHtml PUBLIC \"--public\" \"--system\">", combo.outerHtml());
42 |     }
43 | 
44 |     @Test public void testRoundTrip() {
45 |         String base = "<!DOCTYPE html>";
46 |         assertEquals("<!doctype html>", htmlOutput(base));
47 |         assertEquals(base, xmlOutput(base));
48 | 
49 |         String publicDoc = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">";
50 |         assertEquals(publicDoc, htmlOutput(publicDoc));
51 |         assertEquals(publicDoc, xmlOutput(publicDoc));
52 | 
53 |         String systemDoc = "<!DOCTYPE html SYSTEM \"exampledtdfile.dtd\">";
54 |         assertEquals(systemDoc, htmlOutput(systemDoc));
55 |         assertEquals(systemDoc, xmlOutput(systemDoc));
56 | 
57 |         String legacyDoc = "<!DOCTYPE html SYSTEM \"about:legacy-compat\">";
58 |         assertEquals(legacyDoc, htmlOutput(legacyDoc));
59 |         assertEquals(legacyDoc, xmlOutput(legacyDoc));
60 |     }
61 | 
62 |     private String htmlOutput(String in) {
63 |         DocumentType type = (DocumentType) Jsoup.parse(in).childNode(0);
64 |         return type.outerHtml();
65 |     }
66 | 
67 |     private String xmlOutput(String in) {
68 |         return Jsoup.parse(in, "", Parser.xmlParser()).childNode(0).outerHtml();
69 |     }
70 | }
71 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/EntitiesTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.nodes;
  2 | 
  3 | import org.jsoup.Jsoup;
  4 | import org.junit.Test;
  5 | 
  6 | import static org.jsoup.nodes.Document.OutputSettings;
  7 | import static org.jsoup.nodes.Entities.EscapeMode.*;
  8 | import static org.junit.Assert.*;
  9 | 
 10 | public class EntitiesTest {
 11 |     @Test public void escape() {
 12 |         String text = "Hello &<> Å å π 新 there ¾ © »";
 13 |         String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base));
 14 |         String escapedAsciiFull = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(extended));
 15 |         String escapedAsciiXhtml = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(xhtml));
 16 |         String escapedUtfFull = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(extended));
 17 |         String escapedUtfMin = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(xhtml));
 18 | 
 19 |         assertEquals("Hello &amp;&lt;&gt; &Aring; &aring; &#x3c0; &#x65b0; there &frac34; &copy; &raquo;", escapedAscii);
 20 |         assertEquals("Hello &amp;&lt;&gt; &angst; &aring; &pi; &#x65b0; there &frac34; &copy; &raquo;", escapedAsciiFull);
 21 |         assertEquals("Hello &amp;&lt;&gt; &#xc5; &#xe5; &#x3c0; &#x65b0; there &#xbe; &#xa9; &#xbb;", escapedAsciiXhtml);
 22 |         assertEquals("Hello &amp;&lt;&gt; Å å π 新 there ¾ © »", escapedUtfFull);
 23 |         assertEquals("Hello &amp;&lt;&gt; Å å π 新 there ¾ © »", escapedUtfMin);
 24 |         // odd that it's defined as aring in base but angst in full
 25 | 
 26 |         // round trip
 27 |         assertEquals(text, Entities.unescape(escapedAscii));
 28 |         assertEquals(text, Entities.unescape(escapedAsciiFull));
 29 |         assertEquals(text, Entities.unescape(escapedAsciiXhtml));
 30 |         assertEquals(text, Entities.unescape(escapedUtfFull));
 31 |         assertEquals(text, Entities.unescape(escapedUtfMin));
 32 |     }
 33 | 
 34 |     @Test public void escapedSupplemtary() {
 35 |         String text = "\uD835\uDD59";
 36 |         String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base));
 37 |         assertEquals("&#x1d559;", escapedAscii);
 38 |         String escapedAsciiFull = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(extended));
 39 |         assertEquals("&hopf;", escapedAsciiFull);
 40 |         String escapedUtf= Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(extended));
 41 |         assertEquals(text, escapedUtf);
 42 |     }
 43 | 
 44 |     @Test public void unescapeMultiChars() {
 45 |         String text = "&NestedGreaterGreater; &nGg; &nGt; &nGtv; &Gt; &gg;"; // gg is not combo, but 8811 could conflict with NestedGreaterGreater or others
 46 |         String un = "≫ ⋙̸ ≫⃒ ≫̸ ≫ ≫";
 47 |         assertEquals(un, Entities.unescape(text));
 48 |         String escaped = Entities.escape(un, new OutputSettings().charset("ascii").escapeMode(extended));
 49 |         assertEquals("&Gt; &Gg;&#x338; &Gt;&#x20d2; &Gt;&#x338; &Gt; &Gt;", escaped);
 50 |         assertEquals(un, Entities.unescape(escaped));
 51 |     }
 52 | 
 53 |     @Test public void xhtml() {
 54 |         String text = "&amp; &gt; &lt; &quot;";
 55 |         assertEquals(38, xhtml.codepointForName("amp"));
 56 |         assertEquals(62, xhtml.codepointForName("gt"));
 57 |         assertEquals(60, xhtml.codepointForName("lt"));
 58 |         assertEquals(34, xhtml.codepointForName("quot"));
 59 | 
 60 |         assertEquals("amp", xhtml.nameForCodepoint(38));
 61 |         assertEquals("gt", xhtml.nameForCodepoint(62));
 62 |         assertEquals("lt", xhtml.nameForCodepoint(60));
 63 |         assertEquals("quot", xhtml.nameForCodepoint(34));
 64 |     }
 65 | 
 66 |     @Test public void getByName() {
 67 |         assertEquals("≫⃒", Entities.getByName("nGt"));
 68 |         assertEquals("fj", Entities.getByName("fjlig"));
 69 |         assertEquals("≫", Entities.getByName("gg"));
 70 |         assertEquals("©", Entities.getByName("copy"));
 71 |     }
 72 | 
 73 |     @Test public void escapeSupplementaryCharacter() {
 74 |         String supplementary = new String(Character.toChars(135361)); // 135361 == 0x210C1, i.e. beyond U+FFFF
 75 |         String asAscii = Entities.escape(supplementary, new OutputSettings().charset("ascii").escapeMode(base));
 76 |         assertEquals("&#x210c1;", asAscii);
 77 |         String asUtf8 = Entities.escape(supplementary, new OutputSettings().charset("UTF-8").escapeMode(base));
 78 |         assertEquals(supplementary, asUtf8);
 79 |     }
 80 | 
 81 |     @Test public void notMissingMultis() {
 82 |         String encoded = "&nparsl;";
 83 |         String expected = "\u2AFD\u20E5"; // a single named reference that decodes to two chars
 84 |         assertEquals(expected, Entities.unescape(encoded));
 85 |     }
 86 | 
 87 |     @Test public void notMissingSupplementals() {
 88 |         String encoded = "&npolint; &qfr;";
 89 |         String expected = "⨔ \uD835\uDD2E"; // 𝔮
 90 |         assertEquals(expected, Entities.unescape(encoded));
 91 |     }
 92 | 
 93 |     @Test public void unescape() {
 94 |         String input = "Hello &AElig; &amp;&LT&gt; &reg &angst; &angst &#960; &#960 &#x65B0; there &! &frac34; &copy; &COPY;";
 95 |         assertEquals("Hello Æ &<> ® Å &angst π π 新 there &! ¾ © ©", Entities.unescape(input));
 96 |         String unrecognised = "&0987654321; &unknown"; // references that are not known must come back unchanged
 97 |         assertEquals(unrecognised, Entities.unescape(unrecognised));
 98 |     }
 99 | 
100 |     @Test public void strictUnescape() { // strict mode (used for attributes) only decodes references that end in ';'
101 |         String input = "Hello &amp= &amp;";
102 |         assertEquals("Hello &amp= &", Entities.unescape(input, true));
103 |         assertEquals("Hello &= &", Entities.unescape(input));
104 |         assertEquals("Hello &= &", Entities.unescape(input, false));
105 |     }
106 | 
107 |     
108 |     @Test public void caseSensitive() {
109 |         String plain = "Ü ü & &";
110 |         String asciiEscaped = Entities.escape(plain, new OutputSettings().charset("ascii").escapeMode(extended));
111 |         assertEquals("&Uuml; &uuml; &amp; &amp;", asciiEscaped);
112 |         // upper- and lower-case names must decode to their own characters
113 |         String mixedCase = "&Uuml; &uuml; &amp; &AMP";
114 |         assertEquals("Ü ü & &", Entities.unescape(mixedCase));
115 |     }
116 |     
117 |     @Test public void quoteReplacements() {
118 |         String numericRefs = "&#92; &#36;";
119 |         String literal = "\\ $";
120 |         // numeric references 92 and 36 must decode to a backslash and a dollar sign
121 |         assertEquals(literal, Entities.unescape(numericRefs));
122 |     }
123 | 
124 |     @Test public void letterDigitEntities() {
125 |         String markup = "<p>&sup1;&sup2;&sup3;&frac14;&frac12;&frac34;</p>";
126 |         Document parsed = Jsoup.parse(markup);
127 |         parsed.outputSettings().charset("ascii"); // ascii output: html() must use the named references
128 |         Element para = parsed.select("p").first();
129 |         assertEquals("&sup1;&sup2;&sup3;&frac14;&frac12;&frac34;", para.html());
130 |         assertEquals("¹²³¼½¾", para.text());
131 |         parsed.outputSettings().charset("UTF-8"); // UTF-8 output: html() must emit the characters themselves
132 |         assertEquals("¹²³¼½¾", para.html());
133 |     }
134 | 
135 |     @Test public void noSpuriousDecodes() {
136 |         String url = "http://www.foo.com?a=1&num_rooms=1&children=0&int=VA&b=2"; // query string whose '&' separators are not entity references
137 |         assertEquals(url, Entities.unescape(url));
138 |     }
139 | 
140 |     @Test public void escapesGtInXmlAttributesButNotInHtml() {
141 |         // https://github.com/jhy/jsoup/issues/528 - < is OK in HTML attribute values, but not in XML
142 |         // NOTE(review): despite "Gt" in the method name, the character under test is '<':
143 |         // the xhtml expectation below has &lt; while '>' stays unescaped in both modes.
144 |         String docHtml = "<a title='<p>One</p>'>One</a>";
145 |         Document doc = Jsoup.parse(docHtml);
146 |         Element element = doc.select("a").first();
147 | 
148 |         doc.outputSettings().escapeMode(base); // base mode: attribute value is written back untouched
149 |         assertEquals("<a title=\"<p>One</p>\">One</a>", element.outerHtml());
150 | 
151 |         doc.outputSettings().escapeMode(xhtml); // xhtml mode: '<' inside the attribute value becomes &lt;
152 |         assertEquals("<a title=\"&lt;p>One&lt;/p>\">One</a>", element.outerHtml());
153 |     }
154 | }
155 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/FormElementTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.nodes;
  2 | 
  3 | import org.jsoup.Connection;
  4 | import org.jsoup.Jsoup;
  5 | import org.junit.Test;
  6 | 
  7 | import java.util.List;
  8 | 
  9 | import static org.junit.Assert.*;
 10 | 
 11 | /**
 12 |  * Tests for FormElement
 13 |  *
 14 |  * @author Jonathan Hedley
 15 |  */
 16 | public class FormElementTest {
 17 |     @Test public void hasAssociatedControls() {
 18 |         // control types under test: "button", "fieldset", "input", "keygen", "object", "output", "select", "textarea"
 19 |         String markup = "<form id=1><button id=1><fieldset id=2 /><input id=3><keygen id=4><object id=5><output id=6>" +
 20 |                 "<select id=7><option></select><textarea id=8><p id=9>";
 21 |         Document parsed = Jsoup.parse(markup);
 22 |         // the parsed form is expected to report eight associated controls
 23 |         FormElement formEl = (FormElement) parsed.select("form").first();
 24 |         assertEquals(8, formEl.elements().size());
 25 |     }
 26 | 
 27 |     @Test public void createsFormData() {
 28 |         String markup = "<form><input name='one' value='two'><select name='three'><option value='not'>" +
 29 |                 "<option value='four' selected><option value='five' selected><textarea name=six>seven</textarea>" +
 30 |                 "<input name='seven' type='radio' value='on' checked><input name='seven' type='radio' value='off'>" +
 31 |                 "<input name='eight' type='checkbox' checked><input name='nine' type='checkbox' value='unset'>" +
 32 |                 "<input name='ten' value='text' disabled>" +
 33 |                 "</form>";
 34 |         Document parsed = Jsoup.parse(markup);
 35 |         FormElement formEl = (FormElement) parsed.select("form").first();
 36 |         List<Connection.KeyVal> pairs = formEl.formData();
 37 |         // six key/value pairs are expected, in the order checked below
 38 |         assertEquals(6, pairs.size());
 39 |         assertEquals("one=two", pairs.get(0).toString());
 40 |         assertEquals("three=four", pairs.get(1).toString());
 41 |         assertEquals("three=five", pairs.get(2).toString());
 42 |         assertEquals("six=seven", pairs.get(3).toString());
 43 |         assertEquals("seven=on", pairs.get(4).toString()); // the checked radio button
 44 |         assertEquals("eight=on", pairs.get(5).toString()); // checkbox without a value attribute
 45 |         // 'nine' is absent: its checkbox is not checked
 46 |         // 'ten' is absent: the input is disabled
 47 |     }
 48 | 
 49 |     @Test public void createsSubmitableConnection() {
 50 |         String markup = "<form action='/search'><input name='q'></form>";
 51 |         Document parsed = Jsoup.parse(markup, "http://example.com/");
 52 |         parsed.select("[name=q]").attr("value", "jsoup");
 53 | 
 54 |         FormElement formEl = ((FormElement) parsed.select("form").first());
 55 |         Connection getCon = formEl.submit();
 56 |         // no method attribute yet: GET to the action resolved against the base URI
 57 |         assertEquals(Connection.Method.GET, getCon.request().method());
 58 |         assertEquals("http://example.com/search", getCon.request().url().toExternalForm());
 59 |         List<Connection.KeyVal> submitted = (List<Connection.KeyVal>) getCon.request().data();
 60 |         assertEquals("q=jsoup", submitted.get(0).toString());
 61 | 
 62 |         parsed.select("form").attr("method", "post"); // submitting again must pick up the changed method
 63 |         Connection postCon = formEl.submit();
 64 |         assertEquals(Connection.Method.POST, postCon.request().method());
 65 |     }
 66 | 
 67 |     @Test public void actionWithNoValue() {
 68 |         String markup = "<form><input name='q'></form>";
 69 |         Document parsed = Jsoup.parse(markup, "http://example.com/");
 70 |         FormElement formEl = ((FormElement) parsed.select("form").first());
 71 |         Connection submitCon = formEl.submit();
 72 |         // with no action attribute, the request URL is expected to be the document's base URI
 73 |         assertEquals("http://example.com/", submitCon.request().url().toExternalForm());
 74 |     }
 75 | 
 76 |     @Test public void actionWithNoBaseUri() {
 77 |         // With neither an action attribute nor a base URI, submit() must fail with a descriptive message.
 78 |         String html = "<form><input name='q'></form>";
 79 |         Document doc = Jsoup.parse(html);
 80 |         FormElement form = ((FormElement) doc.select("form").first());
 81 | 
 82 |         boolean threw = false;
 83 |         try {
 84 |             form.submit(); // return value deliberately ignored: only the exception is of interest
 85 |         } catch (IllegalArgumentException e) {
 86 |             threw = true;
 87 |             assertEquals("Could not determine a form action URL for submit. Ensure you set a base URI when parsing.",
 88 |                     e.getMessage());
 89 |         }
 90 |         assertTrue("submit() should throw when no form action URL can be determined", threw);
 91 |     }
 92 | 
 93 |     @Test public void formsAddedAfterParseAreFormElements() {
 94 |         Document shell = Jsoup.parse("<body />");
 95 |         shell.body().html("<form action='http://example.com/search'><input name='q' value='search'>");
 96 |         Element inserted = shell.select("form").first();
 97 |         assertTrue(inserted instanceof FormElement);
 98 |         // the input set together with the form must be registered as one of its controls
 99 |         FormElement asForm = (FormElement) inserted;
100 |         assertEquals(1, asForm.elements().size());
101 |     }
102 | 
103 |     @Test public void controlsAddedAfterParseAreLinkedWithForms() {
104 |         Document shell = Jsoup.parse("<body />");
105 |         shell.body().html("<form />");
106 | 
107 |         Element inserted = shell.select("form").first();
108 |         inserted.append("<input name=foo value=bar>"); // control appended to a form that already exists
109 | 
110 |         assertTrue(inserted instanceof FormElement);
111 |         FormElement asForm = (FormElement) inserted;
112 |         assertEquals(1, asForm.elements().size());
113 | 
114 |         List<Connection.KeyVal> pairs = asForm.formData();
115 |         assertEquals("foo=bar", pairs.get(0).toString());
116 |     }
117 | 
118 |     @Test public void usesOnForCheckboxValueIfNoValueSet() {
119 |         Document parsed = Jsoup.parse("<form><input type=checkbox checked name=foo></form>");
120 |         FormElement formEl = (FormElement) parsed.select("form").first();
121 |         Connection.KeyVal onlyPair = formEl.formData().get(0); // the checked checkbox carries no value attribute
122 |         assertEquals("on", onlyPair.value());
123 |         assertEquals("foo", onlyPair.key());
124 |     }
125 | 
126 |     @Test public void adoptedFormsRetainInputs() {
127 |         // regression test for https://github.com/jhy/jsoup/issues/249 (form placed directly inside a table)
128 |         String page = "<html>\n" +
129 |                 "<body>  \n" +
130 |                 "  <table>\n" +
131 |                 "      <form action=\"/hello.php\" method=\"post\">\n" +
132 |                 "      <tr><td>User:</td><td> <input type=\"text\" name=\"user\" /></td></tr>\n" +
133 |                 "      <tr><td>Password:</td><td> <input type=\"password\" name=\"pass\" /></td></tr>\n" +
134 |                 "      <tr><td><input type=\"submit\" name=\"login\" value=\"login\" /></td></tr>\n" +
135 |                 "   </form>\n" +
136 |                 "  </table>\n" +
137 |                 "</body>\n" +
138 |                 "</html>";
139 |         Document parsed = Jsoup.parse(page);
140 |         FormElement formEl = (FormElement) parsed.select("form").first();
141 |         List<Connection.KeyVal> fields = formEl.formData();
142 |         assertEquals(3, fields.size());
143 |         assertEquals("user", fields.get(0).key());
144 |         assertEquals("pass", fields.get(1).key());
145 |         assertEquals("login", fields.get(2).key());
146 |     }
147 | }
148 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/nodes/TextNodeTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.nodes;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.jsoup.TextUtil;
 5 | import org.junit.Test;
 6 | 
 7 | import static org.junit.Assert.*;
 8 | 
 9 | /**
10 |  Test TextNodes
11 | 
12 |  @author Jonathan Hedley, jonathan@hedley.net */
13 | public class TextNodeTest {
14 |     @Test public void testBlank() { // isBlank() is true for empty or whitespace-only text, false once any other character is present
15 |         TextNode empty = new TextNode("", "");
16 |         TextNode spaces = new TextNode("     ", "");
17 |         TextNode spacesAndNewlines = new TextNode("  \n\n   ", "");
18 |         TextNode word = new TextNode("Hello", "");
19 |         TextNode paddedWord = new TextNode("  \nHello ", "");
20 | 
21 |         assertTrue(empty.isBlank());
22 |         assertTrue(spaces.isBlank());
23 |         assertTrue(spacesAndNewlines.isBlank());
24 |         assertFalse(word.isBlank());
25 |         assertFalse(paddedWord.isBlank());
26 |     }
27 |     
28 |     @Test public void testTextBean() {
29 |         Document doc = Jsoup.parse("<p>One <span>two &amp;</span> three &amp;</p>");
30 |         Element para = doc.select("p").first();
31 | 
32 |         Element span = doc.select("span").first();
33 |         assertEquals("two &", span.text());
34 |         TextNode spanText = (TextNode) span.childNode(0);
35 |         assertEquals("two &", spanText.text());
36 |         
37 |         TextNode tail = (TextNode) para.childNode(2); // the text that follows the span
38 |         assertEquals(" three &", tail.text());
39 |         
40 |         tail.text(" POW!"); // replacing the text through the setter shows up in the parent's html
41 |         assertEquals("One <span>two &amp;</span> POW!", TextUtil.stripNewlines(para.html()));
42 | 
43 |         tail.attr("text", "kablam &"); // setting the "text" attribute changes the node's text as well
44 |         assertEquals("kablam &", tail.text());
45 |         assertEquals("One <span>two &amp;</span>kablam &amp;", TextUtil.stripNewlines(para.html()));
46 |     }
47 | 
48 |     @Test public void testSplitText() {
49 |         Document doc = Jsoup.parse("<div>Hello there</div>");
50 |         Element div = doc.select("div").first();
51 |         TextNode head = (TextNode) div.childNode(0);
52 |         TextNode tail = head.splitText(6); // split after "Hello "
53 |         assertEquals("Hello ", head.getWholeText());
54 |         assertEquals("there", tail.getWholeText());
55 |         tail.text("there!");
56 |         assertEquals("Hello there!", div.text());
57 |         assertSame(head.parent(), tail.parent()); // identity check; assertSame reports both objects if it fails
58 |     }
59 | 
60 |     @Test public void testSplitAnEmbolden() {
61 |         Document doc = Jsoup.parse("<div>Hello there</div>");
62 |         Element div = doc.select("div").first();
63 |         TextNode head = (TextNode) div.childNode(0);
64 |         TextNode tail = head.splitText(6);
65 |         tail.wrap("<b></b>");
66 |         // newlines are stripped before comparing; per the original note the raw html() has "\n<b>there", which is not ideal and still to be corrected
67 |         assertEquals("Hello <b>there</b>", TextUtil.stripNewlines(div.html()));
68 |     }
69 | 
70 |     @Test public void testWithSupplementaryCharacter(){
71 |         String supplementary = new String(Character.toChars(135361)); // 135361 == 0x210C1, i.e. beyond U+FFFF
72 |         TextNode parsedText = Jsoup.parse(supplementary).body().textNodes().get(0);
73 |         assertEquals(supplementary, parsedText.outerHtml().trim());
74 |     }
75 | }
76 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/parser/AttributeParseTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.parser;
  2 | 
  3 | import java.util.List;
  4 | 
  5 | import org.jsoup.Jsoup;
  6 | import org.jsoup.nodes.Attribute;
  7 | import org.jsoup.nodes.Attributes;
  8 | import org.jsoup.nodes.BooleanAttribute;
  9 | import org.jsoup.nodes.Document;
 10 | import org.jsoup.nodes.Element;
 11 | import org.jsoup.select.Elements;
 12 | import org.junit.Test;
 13 | 
 14 | import static org.junit.Assert.*;
 15 | 
 16 | /**
 17 |  Test suite for attribute parser.
 18 | 
 19 |  @author Jonathan Hedley, jonathan@hedley.net */
 20 | public class AttributeParseTest {
 21 | 
 22 |     @Test public void parsesRoughAttributeString() {
 23 |         String html = "<a id=\"123\" class=\"baz = 'bar'\" style = 'border: 2px'qux zim foo = 12 mux=18 />";
 24 |         // should be: <id=123>, <class=baz = 'bar'>, <style=border: 2px>, <qux=>, <zim=>, <foo=12>, <mux=18>
 25 | 
 26 |         Element el = Jsoup.parse(html).getElementsByTag("a").get(0);
 27 |         Attributes attr = el.attributes();
 28 |         assertEquals(7, attr.size());
 29 |         assertEquals("123", attr.get("id"));
 30 |         assertEquals("baz = 'bar'", attr.get("class"));
 31 |         assertEquals("border: 2px", attr.get("style"));
 32 |         assertEquals("", attr.get("qux")); // attribute given without a value
 33 |         assertEquals("", attr.get("zim")); // attribute given without a value
 34 |         assertEquals("12", attr.get("foo")); // spaces around '=' are tolerated
 35 |         assertEquals("18", attr.get("mux"));
 36 |     }
 37 | 
 38 |     @Test public void handlesNewLinesAndReturns() {
 39 |         String markup = "<a\r\nfoo='bar\r\nqux'\r\nbar\r\n=\r\ntwo>One</a>";
 40 |         Element anchor = Jsoup.parse(markup).select("a").first();
 41 |         assertEquals(2, anchor.attributes().size());
 42 |         assertEquals("bar\r\nqux", anchor.attr("foo")); // currently preserves newlines in quoted attributes. todo confirm if should.
 43 |         assertEquals("two", anchor.attr("bar")); // line breaks around '=' do not break the name/value pairing
 44 |     }
 45 | 
 46 |     @Test public void parsesEmptyString() {
 47 |         String html = "<a />";
 48 |         Element el = Jsoup.parse(html).getElementsByTag("a").get(0);
 49 |         Attributes attr = el.attributes();
 50 |         assertEquals(0, attr.size());
 51 |     }
 52 | 
 53 |     @Test public void canStartWithEq() {
 54 |         String markup = "<a =empty />";
 55 |         Element anchor = Jsoup.parse(markup).getElementsByTag("a").get(0);
 56 |         Attributes parsedAttrs = anchor.attributes();
 57 |         assertEquals(1, parsedAttrs.size());
 58 |         assertTrue(parsedAttrs.hasKey("=empty")); // the leading '=' is kept as part of the attribute name
 59 |         assertEquals("", parsedAttrs.get("=empty"));
 60 |     }
 61 | 
 62 |     @Test public void strictAttributeUnescapes() {
 63 |         String markup = "<a id=1 href='?foo=bar&mid&lt=true'>One</a> <a id=2 href='?foo=bar&lt;qux&lg=1'>Two</a>";
 64 |         Elements anchors = Jsoup.parse(markup).select("a");
 65 |         assertEquals("?foo=bar&mid&lt=true", anchors.first().attr("href")); // "&mid" and "&lt" without ';' stay literal
 66 |         assertEquals("?foo=bar<qux&lg=1", anchors.last().attr("href")); // "&lt;" with ';' is decoded
 67 |     }
 68 | 
 69 |     @Test public void moreAttributeUnescapes() {
 70 |         String markup = "<a href='&wr_id=123&mid-size=true&ok=&wr'>Check</a>";
 71 |         Elements anchors = Jsoup.parse(markup).select("a");
 72 |         assertEquals("&wr_id=123&mid-size=true&ok=&wr", anchors.first().attr("href")); // the href must come back unchanged
 73 |     }
 74 |     
 75 |     @Test public void parsesBooleanAttributes() {
 76 |         String markup = "<a normal=\"123\" boolean empty=\"\"></a>";
 77 |         Element anchor = Jsoup.parse(markup).select("a").first();
 78 | 
 79 |         assertEquals("123", anchor.attr("normal"));
 80 |         assertEquals("", anchor.attr("boolean"));
 81 |         assertEquals("", anchor.attr("empty"));
 82 | 
 83 |         List<Attribute> parsedAttrs = anchor.attributes().asList();
 84 |         assertEquals("There should be 3 attribute present", 3, parsedAttrs.size());
 85 | 
 86 |         // Relies on the list order following the order in the parsed html
 87 |         assertFalse("'normal' attribute should not be boolean", parsedAttrs.get(0) instanceof BooleanAttribute);
 88 |         assertTrue("'boolean' attribute should be boolean", parsedAttrs.get(1) instanceof BooleanAttribute);
 89 |         assertFalse("'empty' attribute should not be boolean", parsedAttrs.get(2) instanceof BooleanAttribute);
 90 | 
 91 |         assertEquals(markup, anchor.outerHtml()); // the element must serialise back to the input markup
 92 |     }
 93 |     
 94 |     @Test public void dropsSlashFromAttributeName() {
 95 |         String markup = "<img /onerror='doMyJob'/>";
 96 |         Document htmlDoc = Jsoup.parse(markup);
 97 |         assertTrue("SelfClosingStartTag ignores last character", !htmlDoc.select("img[onerror]").isEmpty());
 98 |         assertEquals("<img onerror=\"doMyJob\">", htmlDoc.body().html());
 99 |         // same input through the XML parser: the attribute name must also lose its leading slash
100 |         Document xmlDoc = Jsoup.parse(markup, "", Parser.xmlParser());
101 |         assertEquals("<img onerror=\"doMyJob\" />", xmlDoc.html());
102 |     }
103 | }
104 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/parser/ParserSettingsTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | import org.junit.Test;
 4 | import static org.junit.Assert.assertEquals;
 5 | 
 6 | public class ParserSettingsTest {
 7 |     @Test
 8 |     public void caseSupport() {
 9 |         ParseSettings keepBoth = new ParseSettings(true, true);
10 |         ParseSettings keepNeither = new ParseSettings(false, false);
11 |         ParseSettings keepTagsOnly = new ParseSettings(true, false);
12 |         ParseSettings keepAttrsOnly = new ParseSettings(false, true);
13 |         // first flag governs tag names, second flag governs attribute names
14 |         assertEquals("FOO", keepBoth.normalizeTag("FOO"));
15 |         assertEquals("FOO", keepBoth.normalizeAttribute("FOO"));
16 |         // both flags off: everything is lower-cased
17 |         assertEquals("foo", keepNeither.normalizeTag("FOO"));
18 |         assertEquals("foo", keepNeither.normalizeAttribute("FOO"));
19 |         // tags keep their case, attributes are lower-cased
20 |         assertEquals("FOO", keepTagsOnly.normalizeTag("FOO"));
21 |         assertEquals("foo", keepTagsOnly.normalizeAttribute("FOO"));
22 |         // attributes keep their case, tags are lower-cased
23 |         assertEquals("foo", keepAttrsOnly.normalizeTag("FOO"));
24 |         assertEquals("FOO", keepAttrsOnly.normalizeAttribute("FOO"));
25 | 
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/parser/TagTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | import org.junit.Test;
 4 | import static org.junit.Assert.*;
 5 | 
 6 | /**
 7 |  Tag tests.
 8 |  @author Jonathan Hedley, jonathan@hedley.net */
 9 | public class TagTest {
10 | 
11 |     @Test public void isCaseSensitive() { // without ParseSettings, names that differ only in case give unequal tags
12 |         Tag upper = Tag.valueOf("P");
13 |         Tag lower = Tag.valueOf("p");
14 |         assertFalse(upper.equals(lower));
15 |     }
16 | 
17 |     @Test public void canBeInsensitive() { // with ParseSettings.htmlDefault, "P" and "p" give equal tags
18 |         Tag upper = Tag.valueOf("P", ParseSettings.htmlDefault);
19 |         Tag lower = Tag.valueOf("p", ParseSettings.htmlDefault);
20 |         assertEquals(upper, lower);
21 |     }
22 | 
23 |     @Test public void trims() { // whitespace around the requested name does not produce a different tag
24 |         Tag bare = Tag.valueOf("p");
25 |         Tag padded = Tag.valueOf(" p ");
26 |         assertEquals(bare, padded);
27 |     }
28 | 
29 |     @Test public void equality() { // two lookups of "p" are equal and are the very same instance
30 |         Tag first = Tag.valueOf("p");
31 |         Tag second = Tag.valueOf("p");
32 |         assertEquals(first, second);
33 |         assertSame(first, second); // assertSame reports both objects on failure, unlike assertTrue(a == b)
34 |     }
35 | 
36 |     @Test public void divSemantics() {
37 |         Tag divTag = Tag.valueOf("div");
38 |         // div: block-level and formatted as a block
39 |         assertTrue(divTag.isBlock());
40 |         assertTrue(divTag.formatAsBlock());
41 |     }
42 | 
43 |     @Test public void pSemantics() {
44 |         Tag paraTag = Tag.valueOf("p");
45 |         // p: block-level, yet not formatted as a block
46 |         assertTrue(paraTag.isBlock());
47 |         assertFalse(paraTag.formatAsBlock());
48 |     }
49 | 
50 |     @Test public void imgSemantics() { // img: inline, self-closing, not a block
51 |         Tag imgTag = Tag.valueOf("img");
52 |         assertTrue(imgTag.isInline());
53 |         assertTrue(imgTag.isSelfClosing());
54 |         assertFalse(imgTag.isBlock());
55 |     }
56 | 
57 |     @Test public void defaultSemantics() {
58 |         Tag unknown = Tag.valueOf("FOO"); // not defined
59 |         Tag unknownAgain = Tag.valueOf("FOO");
60 |         // an undefined tag: lookups are equal, it is inline and is formatted as a block
61 |         assertEquals(unknown, unknownAgain);
62 |         assertTrue(unknown.isInline());
63 |         assertTrue(unknown.formatAsBlock());
64 |     }
65 | 
66 |     @Test(expected = IllegalArgumentException.class) public void valueOfChecksNotNull() {
67 |         Tag.valueOf(null); // a null name must be rejected
68 |     }
69 | 
70 |     @Test(expected = IllegalArgumentException.class) public void valueOfChecksNotEmpty() {
71 |         Tag.valueOf(" "); // a whitespace-only name must be rejected
72 |     }
73 | }
74 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/parser/TokenQueueTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.parser;
 2 | 
 3 | import org.junit.Test;
 4 | import static org.junit.Assert.*;
 5 | 
 6 | /**
 7 |  * Token queue tests.
 8 |  */
 9 | public class TokenQueueTest {
10 |     @Test public void chompBalanced() {
11 |         TokenQueue queue = new TokenQueue(":contains(one (two) three) four");
12 |         String before = queue.consumeTo("(");
13 |         String inside = queue.chompBalanced('(', ')');
14 |         String after = queue.remainder();
15 |         // the nested "(two)" pair must stay inside the balanced chunk
16 |         assertEquals(":contains", before);
17 |         assertEquals("one (two) three", inside);
18 |         assertEquals(" four", after);
19 |     }
20 |     
21 |     @Test public void chompEscapedBalanced() {
22 |         TokenQueue queue = new TokenQueue(":contains(one (two) \\( \\) \\) three) four");
23 |         String before = queue.consumeTo("(");
24 |         String inside = queue.chompBalanced('(', ')');
25 |         String after = queue.remainder();
26 |         // backslash-escaped parentheses do not take part in the balancing and are returned still escaped
27 |         assertEquals(":contains", before);
28 |         assertEquals("one (two) \\( \\) \\) three", inside);
29 |         assertEquals("one (two) ( ) ) three", TokenQueue.unescape(inside));
30 |         assertEquals(" four", after);
31 |     }
32 | 
33 |     @Test public void chompBalancedMatchesAsMuchAsPossible() {
34 |         TokenQueue queue = new TokenQueue("unbalanced(something(or another)) else");
35 |         queue.consumeTo("(");
36 |         String inside = queue.chompBalanced('(', ')');
37 |         assertEquals("something(or another)", inside);
38 |     }
39 |     
40 |     @Test public void unescape() { // escaped parentheses and an escaped backslash each lose their escaping backslash
41 |         assertEquals("one ( ) \\", TokenQueue.unescape("one \\( \\) \\\\"));
42 |     }
43 |     
44 |     @Test public void chompToIgnoreCase() {
45 |         String mixedCaseClose = "<textarea>one < two </TEXTarea>";
46 |         TokenQueue queue = new TokenQueue(mixedCaseClose);
47 |         String chomped = queue.chompToIgnoreCase("</textarea");
48 |         assertEquals("<textarea>one < two ", chomped);
49 |         // when the marker never occurs, the whole input is returned
50 |         queue = new TokenQueue("<textarea> one two < three </oops>");
51 |         chomped = queue.chompToIgnoreCase("</textarea");
52 |         assertEquals("<textarea> one two < three </oops>", chomped);
53 |     }
54 | 
55 |     @Test public void addFirst() {
56 |         TokenQueue queue = new TokenQueue("One Two");
57 |         queue.consumeWord(); // drops "One"
58 |         queue.addFirst("Three");
59 |         assertEquals("Three Two", queue.remainder());
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.parser;
  2 | 
  3 | import org.jsoup.Jsoup;
  4 | import org.jsoup.TextUtil;
  5 | import org.jsoup.helper.StringUtil;
  6 | import org.jsoup.nodes.Document;
  7 | import org.jsoup.nodes.Node;
  8 | import org.jsoup.nodes.TextNode;
  9 | import org.jsoup.nodes.XmlDeclaration;
 10 | import org.junit.Ignore;
 11 | import org.junit.Test;
 12 | 
 13 | import java.io.File;
 14 | import java.io.FileInputStream;
 15 | import java.io.IOException;
 16 | import java.io.InputStream;
 17 | import java.net.URISyntaxException;
 18 | import java.nio.charset.Charset;
 19 | import java.util.List;
 20 | 
 21 | import static org.jsoup.nodes.Document.OutputSettings.Syntax;
 22 | import static org.junit.Assert.assertEquals;
 23 | import static org.junit.Assert.assertFalse;
 24 | 
 25 | /**
 26 |  * Tests XmlTreeBuilder.
 27 |  *
 28 |  * @author Jonathan Hedley
 29 |  */
 30 | public class XmlTreeBuilderTest {
 31 |     @Test // parses a small document with XmlTreeBuilder, checks the serialised form and resolves a relative href against the base URI
 32 |     public void testSimpleXmlParse() {
 33 |         String xml = "<doc id=2 href='/bar'>Foo <br /><link>One</link><link>Two</link></doc>";
 34 |         XmlTreeBuilder tb = new XmlTreeBuilder();
 35 |         Document doc = tb.parse(xml, "http://foo.com/");
 36 |         assertEquals("<doc id=\"2\" href=\"/bar\">Foo <br /><link>One</link><link>Two</link></doc>",
 37 |                 TextUtil.stripNewlines(doc.html()));
 38 |         assertEquals("http://foo.com/bar", doc.getElementById("2").absUrl("href")); // expected value goes first so a failure message reads the right way round
 39 |     }
 40 | 
 41 |     @Test
 42 |     public void testPopToClose() {
 43 |         // expectation: </val> closes the inner val ("Two"), and the stray </bar> is ignored
 44 |         String source = "<doc><val>One<val>Two</val></bar>Three</doc>";
 45 |         Document parsed = new XmlTreeBuilder().parse(source, "http://foo.com/");
 46 |         String rendered = TextUtil.stripNewlines(parsed.html());
 47 |         // "Three" therefore ends up inside the outer val
 48 |         assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", rendered);
 49 |     }
 50 | 
 51 |     @Test
 52 |     public void testCommentAndDocType() {
 53 |         String source = "<!DOCTYPE HTML><!-- a comment -->One <qux />Two";
 54 |         Document parsed = new XmlTreeBuilder().parse(source, "http://foo.com/");
 55 |         String rendered = TextUtil.stripNewlines(parsed.html());
 56 |         // doctype, comment, text and the empty element must all serialise back as given
 57 |         assertEquals("<!DOCTYPE HTML><!-- a comment -->One <qux />Two", rendered);
 58 |     }
 59 | 
 60 |     @Test
 61 |     public void testSupplyParserToJsoupClass() {
 62 |         String source = "<doc><val>One<val>Two</val></bar>Three</doc>";
 63 |         Document parsed = Jsoup.parse(source, "http://foo.com/", Parser.xmlParser());
 64 |         String rendered = TextUtil.stripNewlines(parsed.html()); // output of the XML parser passed to Jsoup.parse
 65 |         assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", rendered);
 66 |     }
 67 | 
 68 |     @Ignore
 69 |     @Test
 70 |     public void testSupplyParserToConnection() throws IOException {
 71 |         String feedUrl = "http://direct.infohound.net/tools/jsoup-xml-test.xml";
 72 | 
 73 |         // fetch the same URL three times: explicit xml parser, explicit html parser, and no parser given
 74 |         Document viaXml = Jsoup.connect(feedUrl).parser(Parser.xmlParser()).get();
 75 |         Document viaHtml = Jsoup.connect(feedUrl).parser(Parser.htmlParser()).get();
 76 |         Document viaAuto = Jsoup.connect(feedUrl).get(); // check connection auto detects xml, uses xml parser
 77 | 
 78 |         String renderedXml = TextUtil.stripNewlines(viaXml.html());
 79 |         assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", renderedXml);
 80 |         assertFalse(viaHtml.equals(viaXml));
 81 |         assertEquals(viaXml, viaAuto);
 82 |         assertEquals(1, viaHtml.select("head").size()); // html parser normalises
 83 |         assertEquals(0, viaXml.select("head").size()); // xml parser does not
 84 |         assertEquals(0, viaAuto.select("head").size()); // xml parser does not
 85 |     }
 86 | 
 87 |     @Test // parses the classpath fixture /htmltests/xml-test.xml from a stream with the XML parser
 88 |     public void testSupplyParserToDataStream() throws IOException, URISyntaxException {
 89 |         InputStream inStream = new FileInputStream(new File(XmlTreeBuilder.class.getResource("/htmltests/xml-test.xml").toURI()));
 90 |         try {
 91 |             Document doc = Jsoup.parse(inStream, null, "http://foo.com", Parser.xmlParser());
 92 |             assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", TextUtil.stripNewlines(doc.html()));
 93 |         } finally { inStream.close(); } // this test opened the stream, so it closes it even when parsing or the assertion fails
 94 |     }
 95 | 
 96 |     @Test
 97 |     public void testDoesNotForceSelfClosingKnownTags() {
 98 |         // The html parser turns "<br>one</br>" into two br elements; the xml parser gives br no special treatment and keeps it as written.
 99 |         String markup = "<br>one</br>";
100 |         assertEquals("<br>one\n<br>", Jsoup.parse(markup).body().html());
101 | 
102 |         Document asXml = Jsoup.parse(markup, "", Parser.xmlParser());
103 |         assertEquals("<br>one</br>", asXml.html());
104 |     }
105 | 
106 |     @Test public void handlesXmlDeclarationAsDeclaration() {
107 |         String source = "<?xml encoding='UTF-8' ?><body>One</body><!-- comment -->";
108 |         Document parsed = Jsoup.parse(source, "", Parser.xmlParser());
109 |         String rendered = StringUtil.normaliseWhitespace(parsed.outerHtml());
110 |         assertEquals("<?xml encoding=\"UTF-8\"?> <body> One </body> <!-- comment -->", rendered);
111 |         assertEquals("#declaration", parsed.childNode(0).nodeName()); // first child is the declaration node
112 |         assertEquals("#comment", parsed.childNode(2).nodeName()); // third child is the trailing comment
113 |     }
114 | 
115 |     @Test public void xmlFragment() {
116 |         String fragment = "<one src='/foo/' />Two<three><four /></three>";
117 |         List<Node> topLevel = Parser.parseXmlFragment(fragment, "http://example.com/");
118 |         assertEquals(3, topLevel.size()); // <one>, the text "Two", and <three>
119 |         Node firstNode = topLevel.get(0);
120 |         assertEquals("http://example.com/foo/", firstNode.absUrl("src"));
121 |         assertEquals("one", firstNode.nodeName());
122 |         assertEquals("Two", ((TextNode)topLevel.get(1)).text());
123 |     }
124 | 
125 |     @Test public void xmlParseDefaultsToHtmlOutputSyntax() { // NOTE(review): despite "Html" in the name, the assertion below expects Syntax.xml
126 |         Document doc = Jsoup.parse("x", "", Parser.xmlParser());
127 |         assertEquals(Syntax.xml, doc.outputSettings().syntax()); // a document produced by the XML parser must report the xml output syntax
128 |     }
129 | 
130 |     @Test
131 |     public void testDoesHandleEOFInTag() {
132 |         String truncated = "<img src=asdf onerror=\"alert(1)\" x="; // input stops in the middle of the tag
133 |         String rendered = Jsoup.parse(truncated, "", Parser.xmlParser()).html();
134 |         assertEquals("<img src=\"asdf\" onerror=\"alert(1)\" x=\"\" />", rendered);
135 |     }
136 | 
137 |     /** No charset is passed to parse(), so the reported charset must come from the file itself; the stream is always closed. */
138 |     @Test public void testDetectCharsetEncodingDeclaration() throws IOException, URISyntaxException {
139 |         InputStream inStream = new FileInputStream(new File(XmlTreeBuilder.class.getResource("/htmltests/xml-charset.xml").toURI()));
140 |         try {
141 |             Document doc = Jsoup.parse(inStream, null, "http://example.com/", Parser.xmlParser());
142 |             assertEquals("ISO-8859-1", doc.charset().name());
143 |             assertEquals("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?> <data>äöåéü</data>", TextUtil.stripNewlines(doc.html()));
144 |         } finally { inStream.close(); } // release the file handle even if parsing or an assertion fails
145 |     }
146 | 
147 |     /** Every attribute of the XML declaration, including a non-standard one, can be read back and re-serialised. */
148 |     @Test public void testParseDeclarationAttributes() {
149 |         final Document parsed = Jsoup.parse("<?xml version='1' encoding='UTF-8' something='else'?><val>One</val>", "", Parser.xmlParser());
150 |         final XmlDeclaration prolog = (XmlDeclaration) parsed.childNode(0);
151 |         assertEquals("1", prolog.attr("version"));
152 |         assertEquals("UTF-8", prolog.attr("encoding"));
153 |         assertEquals("else", prolog.attr("something"));
154 |         // serialised forms come back with double-quoted attribute values
155 |         assertEquals("version=\"1\" encoding=\"UTF-8\" something=\"else\"", prolog.getWholeDeclaration());
156 |         assertEquals("<?xml version=\"1\" encoding=\"UTF-8\" something=\"else\"?>", prolog.outerHtml());
157 |     }
158 | 
159 |     /** The declaration name keeps its original case on output: "XML" is not lower-cased. */
160 |     @Test public void caseSensitiveDeclaration() {
161 |         final String upperCased = "<?XML version='1' encoding='UTF-8' something='else'?>";
162 |         final String rendered = Jsoup.parse(upperCased, "", Parser.xmlParser()).outerHtml();
163 |         assertEquals("<?XML version=\"1\" encoding=\"UTF-8\" something=\"else\"?>", rendered);
164 |     }
165 | 
166 |     /** Setting a charset on a document that uses XML output syntax emits an XML prolog ahead of the markup. */
167 |     @Test public void testCreatesValidProlog() {
168 |         final Document shell = Document.createShell("");
169 |         // switch the output syntax to XML first, then set the charset
170 |         shell.outputSettings().syntax(Syntax.xml);
171 |         shell.charset(Charset.forName("utf-8"));
172 |         // expected output: the prolog line followed by the pretty-printed empty shell
173 |         final String prolog = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
174 |         final String skeleton = "<html>\n <head></head>\n <body></body>\n</html>";
175 |         assertEquals(prolog + skeleton, shell.outerHtml());
176 |     }
177 | 
178 |     /** With the default XML parser settings, tag and attribute names keep their original case. */
179 |     @Test public void preservesCaseByDefault() {
180 |         final Document parsed = Jsoup.parse("<TEST ID=1>Check</TEST>", "", Parser.xmlParser());
181 |         final String flattened = TextUtil.stripNewlines(parsed.html());
182 |         assertEquals("<TEST ID=\"1\">Check</TEST>", flattened);
183 |     }
184 | 
185 |     /** Passing ParseSettings.htmlDefault to the XML parser yields lower-cased tag and attribute names. */
186 |     @Test public void canNormalizeCase() {
187 |         final Parser lowerCasing = Parser.xmlParser().settings(ParseSettings.htmlDefault);
188 |         final String flattened = TextUtil.stripNewlines(Jsoup.parse("<TEST ID=1>Check</TEST>", "", lowerCasing).html());
189 |         assertEquals("<test id=\"1\">Check</test>", flattened);
190 |     }
191 | }
192 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/select/CssTest.java:
--------------------------------------------------------------------------------
  1 | package org.jsoup.select;
  2 | 
  3 | import static org.junit.Assert.*;
  4 | 
  5 | import org.jsoup.Jsoup;
  6 | import org.jsoup.nodes.Document;
  7 | import org.jsoup.parser.Tag;
  8 | import org.junit.Before;
  9 | import org.junit.BeforeClass;
 10 | import org.junit.Test;
 11 | 
 12 | /** Tests for structural CSS pseudo-selectors such as :first-child, :nth-child(), :nth-of-type(), :empty and :root. */
 13 | public class CssTest {
 14 | 
 15 |     private Document html = null;
 16 |     private static String htmlString;
 17 | 
 18 |     /**
 19 |      * Builds the shared fixture markup once. Numbers are appended directly instead of going through
 20 |      * String.format("%d"), which renders digits for the default locale and may not match String.valueOf.
 21 |      */
 22 |     @BeforeClass
 23 |     public static void initClass() {
 24 |         StringBuilder sb = new StringBuilder("<html><head></head><body>");
 25 | 
 26 |         sb.append("<div id='pseudo'>"); // ten <p> children holding 1..10
 27 |         for (int i = 1; i <= 10; i++) {
 28 |             sb.append("<p>").append(i).append("</p>");
 29 |         }
 30 |         sb.append("</div>");
 31 | 
 32 |         sb.append("<div id='type'>"); // ten p/span/em/svg groups holding 1..10
 33 |         for (int i = 1; i <= 10; i++) {
 34 |             sb.append("<p>").append(i).append("</p>");
 35 |             sb.append("<span>").append(i).append("</span>");
 36 |             sb.append("<em>").append(i).append("</em>");
 37 |             sb.append("<svg>").append(i).append("</svg>");
 38 |         }
 39 |         sb.append("</div>");
 40 | 
 41 |         sb.append("<span id='onlySpan'><br /></span>");
 42 |         sb.append("<p class='empty'><!-- Comment only is still empty! --></p>");
 43 | 
 44 |         sb.append("<div id='only'>");
 45 |         sb.append("Some text before the <em>only</em> child in this div");
 46 |         sb.append("</div>");
 47 | 
 48 |         sb.append("</body></html>");
 49 |         htmlString = sb.toString();
 50 |     }
 51 | 
 52 |     /** Parses the shared fixture markup into a new Document before each test. */
 53 |     @Before
 54 |     public void init() {
 55 |         html = Jsoup.parse(htmlString);
 56 |     }
 57 | 
 58 |     @Test
 59 |     public void firstChild() {
 60 |         check(html.select("#pseudo :first-child"), "1");
 61 |         check(html.select("html:first-child"));
 62 |     }
 63 | 
 64 |     @Test
 65 |     public void lastChild() {
 66 |         check(html.select("#pseudo :last-child"), "10");
 67 |         check(html.select("html:last-child"));
 68 |     }
 69 | 
 70 |     @Test
 71 |     public void nthChild_simple() {
 72 |         for (int i = 1; i <= 10; i++) {
 73 |             check(html.select("#pseudo :nth-child(" + i + ")"), String.valueOf(i));
 74 |         }
 75 |     }
 76 | 
 77 |     @Test
 78 |     public void nthOfType_unknownTag() {
 79 |         for (int i = 1; i <= 10; i++) {
 80 |             check(html.select("#type svg:nth-of-type(" + i + ")"), String.valueOf(i));
 81 |         }
 82 |     }
 83 | 
 84 |     @Test
 85 |     public void nthLastChild_simple() {
 86 |         for (int i = 1; i <= 10; i++) {
 87 |             check(html.select("#pseudo :nth-last-child(" + i + ")"), String.valueOf(11-i));
 88 |         }
 89 |     }
 90 | 
 91 |     @Test
 92 |     public void nthOfType_simple() {
 93 |         for (int i = 1; i <= 10; i++) {
 94 |             check(html.select("#type p:nth-of-type(" + i + ")"), String.valueOf(i));
 95 |         }
 96 |     }
 97 | 
 98 |     @Test
 99 |     public void nthLastOfType_simple() {
100 |         for (int i = 1; i <= 10; i++) {
101 |             check(html.select("#type :nth-last-of-type(" + i + ")"), String.valueOf(11-i),String.valueOf(11-i),String.valueOf(11-i),String.valueOf(11-i));
102 |         }
103 |     }
104 | 
105 |     @Test
106 |     public void nthChild_advanced() {
107 |         check(html.select("#pseudo :nth-child(-5)"));
108 |         check(html.select("#pseudo :nth-child(odd)"), "1", "3", "5", "7", "9");
109 |         check(html.select("#pseudo :nth-child(2n-1)"), "1", "3", "5", "7", "9");
110 |         check(html.select("#pseudo :nth-child(2n+1)"), "1", "3", "5", "7", "9");
111 |         check(html.select("#pseudo :nth-child(2n+3)"), "3", "5", "7", "9");
112 |         check(html.select("#pseudo :nth-child(even)"), "2", "4", "6", "8", "10");
113 |         check(html.select("#pseudo :nth-child(2n)"), "2", "4", "6", "8", "10");
114 |         check(html.select("#pseudo :nth-child(3n-1)"), "2", "5", "8");
115 |         check(html.select("#pseudo :nth-child(-2n+5)"), "1", "3", "5");
116 |         check(html.select("#pseudo :nth-child(+5)"), "5");
117 |     }
118 | 
119 |     @Test
120 |     public void nthOfType_advanced() {
121 |         check(html.select("#type :nth-of-type(-5)"));
122 |         check(html.select("#type p:nth-of-type(odd)"), "1", "3", "5", "7", "9");
123 |         check(html.select("#type em:nth-of-type(2n-1)"), "1", "3", "5", "7", "9");
124 |         check(html.select("#type p:nth-of-type(2n+1)"), "1", "3", "5", "7", "9");
125 |         check(html.select("#type span:nth-of-type(2n+3)"), "3", "5", "7", "9");
126 |         check(html.select("#type p:nth-of-type(even)"), "2", "4", "6", "8", "10");
127 |         check(html.select("#type p:nth-of-type(2n)"), "2", "4", "6", "8", "10");
128 |         check(html.select("#type p:nth-of-type(3n-1)"), "2", "5", "8");
129 |         check(html.select("#type p:nth-of-type(-2n+5)"), "1", "3", "5");
130 |         check(html.select("#type :nth-of-type(+5)"), "5", "5", "5", "5");
131 |     }
132 | 
133 |     @Test
134 |     public void nthLastChild_advanced() {
135 |         check(html.select("#pseudo :nth-last-child(-5)"));
136 |         check(html.select("#pseudo :nth-last-child(odd)"), "2", "4", "6", "8", "10");
137 |         check(html.select("#pseudo :nth-last-child(2n-1)"), "2", "4", "6", "8", "10");
138 |         check(html.select("#pseudo :nth-last-child(2n+1)"), "2", "4", "6", "8", "10");
139 |         check(html.select("#pseudo :nth-last-child(2n+3)"), "2", "4", "6", "8");
140 |         check(html.select("#pseudo :nth-last-child(even)"), "1", "3", "5", "7", "9");
141 |         check(html.select("#pseudo :nth-last-child(2n)"), "1", "3", "5", "7", "9");
142 |         check(html.select("#pseudo :nth-last-child(3n-1)"), "3", "6", "9");
143 |         check(html.select("#pseudo :nth-last-child(-2n+5)"), "6", "8", "10");
144 |         check(html.select("#pseudo :nth-last-child(+5)"), "6");
145 |     }
146 | 
147 |     @Test
148 |     public void nthLastOfType_advanced() {
149 |         check(html.select("#type :nth-last-of-type(-5)"));
150 |         check(html.select("#type p:nth-last-of-type(odd)"), "2", "4", "6", "8", "10");
151 |         check(html.select("#type em:nth-last-of-type(2n-1)"), "2", "4", "6", "8", "10");
152 |         check(html.select("#type p:nth-last-of-type(2n+1)"), "2", "4", "6", "8", "10");
153 |         check(html.select("#type span:nth-last-of-type(2n+3)"), "2", "4", "6", "8");
154 |         check(html.select("#type p:nth-last-of-type(even)"), "1", "3", "5", "7", "9");
155 |         check(html.select("#type p:nth-last-of-type(2n)"), "1", "3", "5", "7", "9");
156 |         check(html.select("#type p:nth-last-of-type(3n-1)"), "3", "6", "9");
157 |         check(html.select("#type span:nth-last-of-type(-2n+5)"), "6", "8", "10");
158 |         check(html.select("#type :nth-last-of-type(+5)"), "6", "6", "6", "6");
159 |     }
160 | 
161 |     @Test
162 |     public void firstOfType() {
163 |         check(html.select("div:not(#only) :first-of-type"), "1", "1", "1", "1", "1");
164 |     }
165 | 
166 |     @Test
167 |     public void lastOfType() {
168 |         check(html.select("div:not(#only) :last-of-type"), "10", "10", "10", "10", "10");
169 |     }
170 | 
171 |     @Test
172 |     public void empty() {
173 |         final Elements sel = html.select(":empty");
174 |         assertEquals(3, sel.size());
175 |         assertEquals("head", sel.get(0).tagName());
176 |         assertEquals("br", sel.get(1).tagName());
177 |         assertEquals("p", sel.get(2).tagName());
178 |     }
179 | 
180 |     @Test
181 |     public void onlyChild() {
182 |         final Elements sel = html.select("span :only-child");
183 |         assertEquals(1, sel.size());
184 |         assertEquals("br", sel.get(0).tagName());
185 |         check(html.select("#only :only-child"), "only");
186 |     }
187 | 
188 |     @Test
189 |     public void onlyOfType() {
190 |         final Elements sel = html.select(":only-of-type");
191 |         assertEquals(6, sel.size());
192 |         assertEquals("head", sel.get(0).tagName());
193 |         assertEquals("body", sel.get(1).tagName());
194 |         assertEquals("span", sel.get(2).tagName());
195 |         assertEquals("br", sel.get(3).tagName());
196 |         assertEquals("p", sel.get(4).tagName());
197 |         assertTrue(sel.get(4).hasClass("empty"));
198 |         assertEquals("em", sel.get(5).tagName());
199 |     }
200 | 
201 |     /** Asserts that result has exactly expectedContent.length elements and that each one's own text matches, in order. */
202 |     protected void check(Elements result, String...expectedContent ) {
203 |         assertEquals("Number of elements", expectedContent.length, result.size());
204 |         for (int i = 0; i < expectedContent.length; i++) {
205 |             assertNotNull(result.get(i));
206 |             assertEquals("Expected element",expectedContent[i], result.get(i).ownText());
207 |         }
208 |     }
209 | 
210 |     @Test
211 |     public void root() {
212 |         Elements sel = html.select(":root");
213 |         assertEquals(1, sel.size());
214 |         assertNotNull(sel.get(0));
215 |         assertEquals(Tag.valueOf("html"), sel.get(0).tag());
216 |         Elements sel2 = html.select("body").select(":root");
217 |         assertEquals(1, sel2.size());
218 |         assertNotNull(sel2.get(0));
219 |         assertEquals(Tag.valueOf("body"), sel2.get(0).tag());
220 |     }
221 | }
222 | 


--------------------------------------------------------------------------------
/src/test/java/org/jsoup/select/QueryParserTest.java:
--------------------------------------------------------------------------------
 1 | package org.jsoup.select;
 2 | 
 3 | import org.junit.Test;
 4 | import static org.junit.Assert.*;
 5 | 
 6 | /**
 7 |  * Tests for the Selector Query Parser.
 8 |  *
 9 |  * @author Jonathan Hedley
10 |  */
11 | public class QueryParserTest {
12 |     @Test public void testOrGetsCorrectPrecedence() {
13 |         // tests that a selector "a b, c d, e f" evals to (a AND b) OR (c AND d) OR (e AND f)
14 |         // top level or, three child ands
15 |         Evaluator eval = QueryParser.parse("a b, c d, e f");
16 |         assertTrue(eval instanceof CombiningEvaluator.Or);
17 |         CombiningEvaluator.Or or = (CombiningEvaluator.Or) eval;
18 |         assertEquals(3, or.evaluators.size());
19 |         for (Evaluator innerEval: or.evaluators) {
20 |             assertTrue(innerEval instanceof CombiningEvaluator.And);
21 |             CombiningEvaluator.And and = (CombiningEvaluator.And) innerEval;
22 |             assertEquals(2, and.evaluators.size());
23 |             assertTrue(and.evaluators.get(0) instanceof Evaluator.Tag);
24 |             assertTrue(and.evaluators.get(1) instanceof StructuralEvaluator.Parent);
25 |         }
26 |     }
27 | 
28 |     @Test public void testParsesMultiCorrectly() {
29 |         Evaluator eval = QueryParser.parse(".foo > ol, ol > li + li");
30 |         assertTrue(eval instanceof CombiningEvaluator.Or);
31 |         CombiningEvaluator.Or or = (CombiningEvaluator.Or) eval;
32 |         assertEquals(2, or.evaluators.size());
33 | 
34 |         CombiningEvaluator.And andLeft = (CombiningEvaluator.And) or.evaluators.get(0);
35 |         CombiningEvaluator.And andRight = (CombiningEvaluator.And) or.evaluators.get(1);
36 |         // verify each OR branch separately: its string form and its number of evaluators
37 |         assertEquals("ol :ImmediateParent.foo", andLeft.toString());
38 |         assertEquals(2, andLeft.evaluators.size());
39 |         assertEquals("li :prevli :ImmediateParentol", andRight.toString());
40 |         assertEquals(2, andRight.evaluators.size());
41 |     }
42 | 
43 |     @Test(expected = Selector.SelectorParseException.class) public void exceptionOnUncloseAttribute() {
44 |         QueryParser.parse("section > a[href=\"]"); // the quote inside the attribute value is never closed
45 |     }
46 | 
47 |     @Test(expected = Selector.SelectorParseException.class)  public void testParsesSingleQuoteInContains() {
48 |         QueryParser.parse("p:contains(One \" One)"); // a lone quote inside :contains() must be rejected
49 |     }
50 | }
51 | 


--------------------------------------------------------------------------------
/src/test/resources/bomtests/bom_utf16be.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/src/test/resources/bomtests/bom_utf16be.html


--------------------------------------------------------------------------------
/src/test/resources/bomtests/bom_utf16le.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/src/test/resources/bomtests/bom_utf16le.html


--------------------------------------------------------------------------------
/src/test/resources/bomtests/bom_utf32be.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/src/test/resources/bomtests/bom_utf32be.html


--------------------------------------------------------------------------------
/src/test/resources/bomtests/bom_utf32le.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/src/test/resources/bomtests/bom_utf32le.html


--------------------------------------------------------------------------------
/src/test/resources/htmltests/README:
--------------------------------------------------------------------------------
 1 | Note
 2 | ====
 3 | 
 4 | The HTML files in this directory (htmltests) are intended to be used for testing the Jsoup parser and improving its
 5 | interoperability with real world published HTML. These files are not distributed in the core Jsoup library.
 6 | 
 7 | These files remain the copyright of their original owners.
 8 | 
 9 | If you are the copyright holder and do not wish your works to be used in this manner, please contact Jonathan Hedley
10 | (jonathan@hedley.net) and your works will be removed from this test-suite.
11 | 
12 | Sources
13 | ========
14 | 
15 | * yahoo-article-1.html    http://news.yahoo.com/s/nm/20100831/bs_nm/us_gm_china 1-Sep-2010
16 | * smh-biz-article-1.html  http://www.smh.com.au/business/the-boards-next-fear-the-female-quota-20100106-lteq.html
17 | * news-com-au-home.html   http://www.news.com.au/	11-Jan-2010
18 | * google-ipod.html		  http://www.google.com/search?hl=en&q=ipod&aq=f&oq=&aqi=g10	11-Jan-2010
19 | * yahoo-jp.html			  http://www.yahoo.co.jp/index.html	12-Jan-2010
20 | * baidu-cn-home.html	  http://www.baidu.com/ 15-Jul-2010
21 | * nyt-article-1.html      http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp
22 | 


--------------------------------------------------------------------------------
/src/test/resources/htmltests/baidu-cn-home.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/src/test/resources/htmltests/baidu-cn-home.html


--------------------------------------------------------------------------------
/src/test/resources/htmltests/baidu-variant.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/src/test/resources/htmltests/baidu-variant.html


--------------------------------------------------------------------------------
/src/test/resources/htmltests/meta-charset-1.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/src/test/resources/htmltests/meta-charset-1.html


--------------------------------------------------------------------------------
/src/test/resources/htmltests/meta-charset-2.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/src/test/resources/htmltests/meta-charset-2.html


--------------------------------------------------------------------------------
/src/test/resources/htmltests/meta-charset-3.html:
--------------------------------------------------------------------------------
1 | <html>
2 | <head></head>
3 | <body>新</body>
4 | </html>


--------------------------------------------------------------------------------
/src/test/resources/htmltests/namespaces.xhtml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8" standalone="no"?>
 2 | <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en" lang="en">
 3 | 	<head>
 4 | 		<title>Cover</title>
 5 | 		<style type="text/css">
 6 | 			img{
 7 | 				max-width:100%;
 8 | 			}
 9 | 		</style>
10 | 	</head>
11 | 	<body>
12 | 	<figure id="cover-image">
13 | 		<img src="covers/9781449328030_lrg.jpg" alt="First Edition"/>
14 | 	</figure>
15 | 	<epub:title id="epubTitle">Check</epub:title>
16 | 	<x:section xmlns:x="urn:test">
17 | 		<x:title id="xTitle">Another</x:title>
18 | 		Section Text.
19 | 	</x:section>
20 | 
21 | 	<:foo>Test</:foo>
22 | 	</body>
23 | </html>
24 | 


--------------------------------------------------------------------------------
/src/test/resources/htmltests/table-invalid-elements.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 | <body>
 3 | <table>
 4 |   <tr>
 5 |     <td>
 6 |       <table>
 7 |         <tr>
 8 |           <!--Comment-->
 9 |           <table>
10 |             <p>Why am I here?</p>
11 |             </tr>
12 |           </table>
13 |           </td>
14 |         </tr>
15 |       </table>
16 | </body>
17 | </html>
18 | 


--------------------------------------------------------------------------------
/src/test/resources/htmltests/thumb.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/src/test/resources/htmltests/thumb.jpg


--------------------------------------------------------------------------------
/src/test/resources/htmltests/xml-charset.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/src/test/resources/htmltests/xml-charset.xml


--------------------------------------------------------------------------------
/src/test/resources/htmltests/xml-test.xml:
--------------------------------------------------------------------------------
1 | <doc><val>One<val>Two</val>Three</val></doc>
2 | 


--------------------------------------------------------------------------------