├── .gitignore ├── .travis.yml ├── CHANGES ├── LICENSE ├── README.md ├── common.png ├── jsoup01.png ├── pom.xml └── src ├── main ├── java │ └── org │ │ └── jsoup │ │ ├── Connection.java │ │ ├── HttpStatusException.java │ │ ├── Jsoup.java │ │ ├── SerializationException.java │ │ ├── UnsupportedMimeTypeException.java │ │ ├── examples │ │ ├── HtmlToPlainText.java │ │ ├── ListLinks.java │ │ └── package-info.java │ │ ├── helper │ │ ├── DataUtil.java │ │ ├── DescendableLinkedList.java │ │ ├── HttpConnection.java │ │ ├── StringUtil.java │ │ ├── Validate.java │ │ └── W3CDom.java │ │ ├── nodes │ │ ├── Attribute.java │ │ ├── Attributes.java │ │ ├── BooleanAttribute.java │ │ ├── Comment.java │ │ ├── DataNode.java │ │ ├── Document.java │ │ ├── DocumentType.java │ │ ├── Element.java │ │ ├── Entities.java │ │ ├── FormElement.java │ │ ├── Node.java │ │ ├── TextNode.java │ │ ├── XmlDeclaration.java │ │ ├── entities-base.properties │ │ ├── entities-full.properties │ │ ├── entities-xhtml.properties │ │ └── package-info.java │ │ ├── package-info.java │ │ ├── parser │ │ ├── CharacterReader.java │ │ ├── HtmlTreeBuilder.java │ │ ├── HtmlTreeBuilderState.java │ │ ├── ParseError.java │ │ ├── ParseErrorList.java │ │ ├── ParseSettings.java │ │ ├── Parser.java │ │ ├── Tag.java │ │ ├── Token.java │ │ ├── TokenQueue.java │ │ ├── Tokeniser.java │ │ ├── TokeniserState.java │ │ ├── TreeBuilder.java │ │ ├── XmlTreeBuilder.java │ │ └── package-info.java │ │ ├── safety │ │ ├── Cleaner.java │ │ ├── Whitelist.java │ │ └── package-info.java │ │ └── select │ │ ├── Collector.java │ │ ├── CombiningEvaluator.java │ │ ├── Elements.java │ │ ├── Evaluator.java │ │ ├── NodeTraversor.java │ │ ├── NodeVisitor.java │ │ ├── QueryParser.java │ │ ├── Selector.java │ │ ├── StructuralEvaluator.java │ │ └── package-info.java └── javadoc │ └── overview.html └── test ├── java └── org │ └── jsoup │ ├── TextUtil.java │ ├── helper │ ├── DataUtilTest.java │ ├── HttpConnectionTest.java │ ├── StringUtilTest.java │ └── 
W3CDomTest.java │ ├── integration │ ├── Benchmark.java │ ├── ParseTest.java │ └── UrlConnectTest.java │ ├── nodes │ ├── AttributeTest.java │ ├── AttributesTest.java │ ├── BuildEntities.java │ ├── DocumentTest.java │ ├── DocumentTypeTest.java │ ├── ElementTest.java │ ├── EntitiesTest.java │ ├── FormElementTest.java │ ├── NodeTest.java │ └── TextNodeTest.java │ ├── parser │ ├── AttributeParseTest.java │ ├── CharacterReaderTest.java │ ├── HtmlParserTest.java │ ├── ParserSettingsTest.java │ ├── TagTest.java │ ├── TokenQueueTest.java │ └── XmlTreeBuilderTest.java │ ├── safety │ └── CleanerTest.java │ └── select │ ├── CssTest.java │ ├── ElementsTest.java │ ├── QueryParserTest.java │ └── SelectorTest.java └── resources ├── bomtests ├── bom_utf16be.html ├── bom_utf16le.html ├── bom_utf32be.html └── bom_utf32le.html └── htmltests ├── README ├── baidu-cn-home.html ├── baidu-variant.html ├── google-ipod.html ├── meta-charset-1.html ├── meta-charset-2.html ├── meta-charset-3.html ├── namespaces.xhtml ├── news-com-au-home.html ├── nyt-article-1.html ├── smh-biz-article-1.html ├── table-invalid-elements.html ├── thumb.jpg ├── xml-charset.xml ├── xml-test.xml ├── yahoo-article-1.html └── yahoo-jp.html /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | jsoup.iml 3 | jsoup.ipr 4 | jsoup.iws 5 | target/ 6 | .classpath 7 | .project 8 | .settings/ 9 | *Thrash* 10 | 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | jdk: 4 | - openjdk6 5 | - openjdk7 6 | - oraclejdk7 7 | - oraclejdk8 8 | 9 | cache: 10 | directories: 11 | - $HOME/.m2 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | © 2009-2017, Jonathan Hedley 4 | 5 | 
Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 开源项目Jsoup使用简介 2 | 3 | jsoup 是一款 Java 的HTML 解析器,可通过DOM,CSS选择器以及类似于JQuery的操作方法来提取和操作Html文档数据。 4 | 5 | 6 | 开源地址:[https://github.com/open-android/Jsoup](https://github.com/open-android/Jsoup "开源项目地址") 7 | 8 | * [配套视频](https://www.boxuegu.com/web/html/video.html?courseId=172&sectionId=8a2c9bed5a3a4c7e015a4aa700eb0a2a&chapterId=8a2c9bed5a3a4c7e015a4aa767150a2b&vId=8a2c9bed5a3a4c7e015a4aa7ad870a2c&videoId=D9C78456B7F047A79C33DC5901307461) 9 | 10 | * 爱生活,爱学习,更爱做代码的搬运工,分类查找更方便请下载黑马助手app 11 | 12 | ![黑马助手.png](http://upload-images.jianshu.io/upload_images/4037105-f777f1214328dcc4.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 13 | 14 | # 使用效果 15 | ![](jsoup01.png) 16 | 17 | ## 使用步骤 18 | 19 | ### 1. 
在project的build.gradle添加如下代码(如下图) 20 | 21 | allprojects { 22 | repositories { 23 | ... 24 | maven { url "https://jitpack.io" } 25 | } 26 | } 27 | 28 | ![](common.png) 29 | 30 | ### 2. 在Module的build.gradle添加依赖 31 | 32 | compile 'com.github.open-android:Jsoup:jsoup-1.10.2' 33 | 34 | ### 3.演示步骤 35 | 36 | * a.测试用html内容如下 37 | 38 | 39 | 40 | 41 | First parse 42 | 43 | 44 |

attribute parse

45 |

text parse

46 | 47 | 48 | 49 | * b.将演示代码复制到Activity的onCreate方法中 50 | 51 | //测试用html字符串 52 | String html = "First parse" 53 | + "

attribute parse

" 54 | + "

text parse

"; 55 | 56 | //Jsoup解析获得Document对象 57 | Document doc = Jsoup.parse(html); 58 | 59 | System.out.println("解析出来的html:\n"+doc.toString()); 60 | 61 | 62 | //获得head元素对象 63 | Element head = doc.head(); 64 | 65 | //DOM方式获得第一个title元素 66 | Element title = head.getElementsByTag("title").first(); 67 | 68 | //获得title元素中文本 69 | String text = title.text(); 70 | System.out.println("title标签中文本: " + text); 71 | 72 | 73 | //--------------------------------------- 74 | 75 | 76 | //获得body元素对象 77 | Element body = doc.body(); 78 | 79 | //选择器语法查找p元素 80 | Elements lists = body.select("p"); 81 | 82 | //遍历所有p元素,输出p元素文本 83 | for(Element p : lists){ 84 | System.out.println("p元素文本: " + p.text()); 85 | } 86 | 87 | 88 | //选择器语法查找第一个拥有align属性的p元素 89 | Element pElement = body.select("p[align]").first(); 90 | 91 | //获得p元素align属性值 92 | String align = pElement.attr("align"); 93 | System.out.println("p元素align属性值: " + align); 94 | 95 | 96 | > 注意:如果解析指定url需要添加网络访问权限 97 | > 98 | 99 | * 欢迎关注微信公众号 100 | 101 | ![](http://upload-images.jianshu.io/upload_images/4037105-8f737b5104dd0b5d.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 102 | -------------------------------------------------------------------------------- /common.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/common.png -------------------------------------------------------------------------------- /jsoup01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-android/jsoup/5bd7757a0e68177a6db3c3d9e4056c4fe65abd14/jsoup01.png -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | jsoup Java HTML Parser 5 | 6 | org.jsoup 7 | jsoup 8 | 1.10.3-SNAPSHOT 9 | jsoup is a Java library for working with 
real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and jquery-like methods. jsoup implements the WHATWG HTML5 specification, and parses HTML to the same DOM as modern browsers do. 10 | https://jsoup.org/ 11 | 2009 12 | 13 | GitHub 14 | http://github.com/jhy/jsoup/issues 15 | 16 | 17 | 18 | The MIT License 19 | https://jsoup.org/license 20 | repo 21 | 22 | 23 | 24 | https://github.com/jhy/jsoup 25 | scm:git:https://github.com/jhy/jsoup.git 26 | 27 | HEAD 28 | 29 | 30 | Jonathan Hedley 31 | http://jonathanhedley.com/ 32 | 33 | 34 | 35 | 36 | 37 | org.apache.maven.plugins 38 | maven-compiler-plugin 39 | 3.5.1 40 | 41 | 1.5 42 | 1.5 43 | UTF-8 44 | 45 | 46 | 47 | 48 | org.codehaus.mojo 49 | animal-sniffer-maven-plugin 50 | 1.15 51 | 52 | 53 | animal-sniffer 54 | compile 55 | 56 | check 57 | 58 | 59 | 60 | org.codehaus.mojo.signature 61 | java15 62 | 1.0 63 | 64 | 65 | 66 | 67 | 68 | 69 | org.apache.maven.plugins 70 | maven-javadoc-plugin 71 | 2.10.4 72 | 73 | -Xdoclint:none 74 | 75 | 76 | 77 | attach-javadoc 78 | verify 79 | 80 | jar 81 | 82 | 83 | 84 | 85 | 86 | org.apache.maven.plugins 87 | maven-source-plugin 88 | 3.0.1 89 | 90 | 91 | 92 | 93 | attach-sources 94 | verify 95 | 96 | jar 97 | 98 | 99 | 100 | 101 | 102 | org.apache.maven.plugins 103 | maven-jar-plugin 104 | 3.0.2 105 | 106 | 107 | ${project.build.outputDirectory}/META-INF/MANIFEST.MF 108 | 109 | 110 | 111 | 112 | org.apache.felix 113 | maven-bundle-plugin 114 | 2.5.4 115 | 116 | 117 | bundle-manifest 118 | process-classes 119 | 120 | manifest 121 | 122 | 123 | 124 | 125 | 126 | https://jsoup.org/ 127 | 128 | 129 | 130 | 131 | org.apache.maven.plugins 132 | maven-resources-plugin 133 | 3.0.1 134 | 135 | 136 | maven-release-plugin 137 | 2.5.3 138 | 139 | 140 | 141 | 142 | src/main/java 143 | 144 | **/*.properties 145 | 146 | 147 | 148 | ./ 149 | META-INF/ 150 | false 151 | 152 | LICENSE 153 | README.md 154 | CHANGES 155 | 156 | 157 | 158 | 
159 | 160 | 161 | 162 | sonatype-nexus-snapshots 163 | Sonatype Nexus Snapshots 164 | https://oss.sonatype.org/content/repositories/snapshots 165 | 166 | 167 | sonatype-nexus-staging 168 | Nexus Release Repository 169 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 170 | 171 | 172 | 173 | 174 | 175 | release-sign-artifacts 176 | 177 | 178 | performRelease 179 | true 180 | 181 | 182 | 183 | 184 | 185 | org.apache.maven.plugins 186 | maven-gpg-plugin 187 | 188 | 189 | sign-artifacts 190 | verify 191 | 192 | sign 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | junit 207 | junit 208 | 4.12 209 | test 210 | 211 | 212 | 213 | 214 | com.google.code.gson 215 | gson 216 | 2.7 217 | test 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | UTF-8 229 | 230 | 231 | 232 | 233 | jhy 234 | Jonathan Hedley 235 | jonathan@hedley.net 236 | 237 | Lead Developer 238 | 239 | +11 240 | 241 | 242 | 243 | 244 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/HttpStatusException.java: -------------------------------------------------------------------------------- 1 | package org.jsoup; 2 | 3 | import java.io.IOException; 4 | 5 | /** 6 | * Signals that a HTTP request resulted in a not OK HTTP response. 7 | */ 8 | public class HttpStatusException extends IOException { 9 | private int statusCode; 10 | private String url; 11 | 12 | public HttpStatusException(String message, int statusCode, String url) { 13 | super(message); 14 | this.statusCode = statusCode; 15 | this.url = url; 16 | } 17 | 18 | public int getStatusCode() { 19 | return statusCode; 20 | } 21 | 22 | public String getUrl() { 23 | return url; 24 | } 25 | 26 | @Override 27 | public String toString() { 28 | return super.toString() + ". 
Status=" + statusCode + ", URL=" + url; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/SerializationException.java: -------------------------------------------------------------------------------- 1 | package org.jsoup; 2 | 3 | /** 4 | * A SerializationException is raised whenever serialization of a DOM element fails. This exception usually wraps an 5 | * {@link java.io.IOException} that may be thrown due to an inaccessible output stream. 6 | */ 7 | public final class SerializationException extends RuntimeException { 8 | /** 9 | * Creates and initializes a new serialization exception with no error message and cause. 10 | */ 11 | public SerializationException() { 12 | super(); 13 | } 14 | 15 | /** 16 | * Creates and initializes a new serialization exception with the given error message and no cause. 17 | * 18 | * @param message 19 | * the error message of the new serialization exception (may be null). 20 | */ 21 | public SerializationException(String message) { 22 | super(message); 23 | } 24 | 25 | /** 26 | * Creates and initializes a new serialization exception with the specified cause and an error message of 27 | * (cause==null ? null : cause.toString()) (which typically contains the class and error message of 28 | * cause). 29 | * 30 | * @param cause 31 | * the cause of the new serialization exception (may be null). 32 | */ 33 | public SerializationException(Throwable cause) { 34 | super(cause); 35 | } 36 | 37 | /** 38 | * Creates and initializes a new serialization exception with the given error message and cause. 39 | * 40 | * @param message 41 | * the error message of the new serialization exception. 42 | * @param cause 43 | * the cause of the new serialization exception. 
44 | */ 45 | public SerializationException(String message, Throwable cause) { 46 | super(message, cause); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/UnsupportedMimeTypeException.java: -------------------------------------------------------------------------------- 1 | package org.jsoup; 2 | 3 | import java.io.IOException; 4 | 5 | /** 6 | * Signals that a HTTP response returned a mime type that is not supported. 7 | */ 8 | public class UnsupportedMimeTypeException extends IOException { 9 | private String mimeType; 10 | private String url; 11 | 12 | public UnsupportedMimeTypeException(String message, String mimeType, String url) { 13 | super(message); 14 | this.mimeType = mimeType; 15 | this.url = url; 16 | } 17 | 18 | public String getMimeType() { 19 | return mimeType; 20 | } 21 | 22 | public String getUrl() { 23 | return url; 24 | } 25 | 26 | @Override 27 | public String toString() { 28 | return super.toString() + ". Mimetype=" + mimeType + ", URL="+url; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/examples/HtmlToPlainText.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.examples; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.helper.StringUtil; 5 | import org.jsoup.helper.Validate; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.nodes.Node; 9 | import org.jsoup.nodes.TextNode; 10 | import org.jsoup.select.Elements; 11 | import org.jsoup.select.NodeTraversor; 12 | import org.jsoup.select.NodeVisitor; 13 | 14 | import java.io.IOException; 15 | 16 | /** 17 | * HTML to plain-text. This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted 18 | * plain-text. 
That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a 19 | * scrape. 20 | *

21 | * Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend. 22 | *

23 | *

24 | * To invoke from the command line, assuming you've downloaded the jsoup jar to your current directory:

25 | *

java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]

26 | * where url is the URL to fetch, and selector is an optional CSS selector. 27 | * 28 | * @author Jonathan Hedley, jonathan@hedley.net 29 | */ 30 | public class HtmlToPlainText { 31 | private static final String userAgent = "Mozilla/5.0 (jsoup)"; 32 | private static final int timeout = 5 * 1000; 33 | 34 | public static void main(String... args) throws IOException { 35 | Validate.isTrue(args.length == 1 || args.length == 2, "usage: java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]"); 36 | final String url = args[0]; 37 | final String selector = args.length == 2 ? args[1] : null; 38 | 39 | // fetch the specified URL and parse to a HTML DOM 40 | Document doc = Jsoup.connect(url).userAgent(userAgent).timeout(timeout).get(); 41 | 42 | HtmlToPlainText formatter = new HtmlToPlainText(); 43 | 44 | if (selector != null) { 45 | Elements elements = doc.select(selector); // get each element that matches the CSS selector 46 | for (Element element : elements) { 47 | String plainText = formatter.getPlainText(element); // format that element to plain text 48 | System.out.println(plainText); 49 | } 50 | } else { // format the whole doc 51 | String plainText = formatter.getPlainText(doc); 52 | System.out.println(plainText); 53 | } 54 | } 55 | 56 | /** 57 | * Format an Element to plain-text 58 | * @param element the root element to format 59 | * @return formatted text 60 | */ 61 | public String getPlainText(Element element) { 62 | FormattingVisitor formatter = new FormattingVisitor(); 63 | NodeTraversor traversor = new NodeTraversor(formatter); 64 | traversor.traverse(element); // walk the DOM, and call .head() and .tail() for each node 65 | 66 | return formatter.toString(); 67 | } 68 | 69 | // the formatting rules, implemented in a depth-first DOM traverse 70 | private class FormattingVisitor implements NodeVisitor { 71 | private static final int maxWidth = 80; 72 | private int width = 0; 73 | private StringBuilder accum = new StringBuilder(); // holds the 
accumulated text 74 | 75 | // hit when the node is first seen 76 | public void head(Node node, int depth) { 77 | String name = node.nodeName(); 78 | if (node instanceof TextNode) 79 | append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. 80 | else if (name.equals("li")) 81 | append("\n * "); 82 | else if (name.equals("dt")) 83 | append(" "); 84 | else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) 85 | append("\n"); 86 | } 87 | 88 | // hit when all of the node's children (if any) have been visited 89 | public void tail(Node node, int depth) { 90 | String name = node.nodeName(); 91 | if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5")) 92 | append("\n"); 93 | else if (name.equals("a")) 94 | append(String.format(" <%s>", node.absUrl("href"))); 95 | } 96 | 97 | // appends text to the string builder with a simple word wrap method 98 | private void append(String text) { 99 | if (text.startsWith("\n")) 100 | width = 0; // reset counter if starts with a newline. 
only from formats above, not in natural text 101 | if (text.equals(" ") && 102 | (accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n"))) 103 | return; // don't accumulate long runs of empty spaces 104 | 105 | if (text.length() + width > maxWidth) { // won't fit, needs to wrap 106 | String words[] = text.split("\\s+"); 107 | for (int i = 0; i < words.length; i++) { 108 | String word = words[i]; 109 | boolean last = i == words.length - 1; 110 | if (!last) // insert a space if not the last word 111 | word = word + " "; 112 | if (word.length() + width > maxWidth) { // wrap and reset counter 113 | accum.append("\n").append(word); 114 | width = word.length(); 115 | } else { 116 | accum.append(word); 117 | width += word.length(); 118 | } 119 | } 120 | } else { // fits as is, without need to wrap text 121 | accum.append(text); 122 | width += text.length(); 123 | } 124 | } 125 | 126 | @Override 127 | public String toString() { 128 | return accum.toString(); 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/examples/ListLinks.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.examples; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.helper.Validate; 5 | import org.jsoup.nodes.Document; 6 | import org.jsoup.nodes.Element; 7 | import org.jsoup.select.Elements; 8 | 9 | import java.io.IOException; 10 | 11 | /** 12 | * Example program to list links from a URL. 
13 | */ 14 | public class ListLinks { 15 | public static void main(String[] args) throws IOException { 16 | Validate.isTrue(args.length == 1, "usage: supply url to fetch"); 17 | String url = args[0]; 18 | print("Fetching %s...", url); 19 | 20 | Document doc = Jsoup.connect(url).get(); 21 | Elements links = doc.select("a[href]"); 22 | Elements media = doc.select("[src]"); 23 | Elements imports = doc.select("link[href]"); 24 | 25 | print("\nMedia: (%d)", media.size()); 26 | for (Element src : media) { 27 | if (src.tagName().equals("img")) 28 | print(" * %s: <%s> %sx%s (%s)", 29 | src.tagName(), src.attr("abs:src"), src.attr("width"), src.attr("height"), 30 | trim(src.attr("alt"), 20)); 31 | else 32 | print(" * %s: <%s>", src.tagName(), src.attr("abs:src")); 33 | } 34 | 35 | print("\nImports: (%d)", imports.size()); 36 | for (Element link : imports) { 37 | print(" * %s <%s> (%s)", link.tagName(),link.attr("abs:href"), link.attr("rel")); 38 | } 39 | 40 | print("\nLinks: (%d)", links.size()); 41 | for (Element link : links) { 42 | print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35)); 43 | } 44 | } 45 | 46 | private static void print(String msg, Object... args) { 47 | System.out.println(String.format(msg, args)); 48 | } 49 | 50 | private static String trim(String s, int width) { 51 | if (s.length() > width) 52 | return s.substring(0, width-1) + "."; 53 | else 54 | return s; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/examples/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | Contains example programs and use of jsoup. See the jsoup cookbook. 
3 | */ 4 | package org.jsoup.examples; -------------------------------------------------------------------------------- /src/main/java/org/jsoup/helper/DescendableLinkedList.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.helper; 2 | 3 | import java.util.Iterator; 4 | import java.util.LinkedList; 5 | import java.util.ListIterator; 6 | 7 | /** 8 | * Provides a descending iterator and other 1.6 methods to allow support on the 1.5 JRE. 9 | * @param Type of elements 10 | */ 11 | public class DescendableLinkedList extends LinkedList { 12 | 13 | /** 14 | * Create a new DescendableLinkedList. 15 | */ 16 | public DescendableLinkedList() { 17 | super(); 18 | } 19 | 20 | /** 21 | * Add a new element to the start of the list. 22 | * @param e element to add 23 | */ 24 | public void push(E e) { 25 | addFirst(e); 26 | } 27 | 28 | /** 29 | * Look at the last element, if there is one. 30 | * @return the last element, or null 31 | */ 32 | public E peekLast() { 33 | return size() == 0 ? null : getLast(); 34 | } 35 | 36 | /** 37 | * Remove and return the last element, if there is one 38 | * @return the last element, or null 39 | */ 40 | public E pollLast() { 41 | return size() == 0 ? null : removeLast(); 42 | } 43 | 44 | /** 45 | * Get an iterator that starts at the end of the list and works towards the start. 46 | * @return an iterator that starts at the end of the list and works towards the start. 47 | */ 48 | public Iterator descendingIterator() { 49 | return new DescendingIterator(size()); 50 | } 51 | 52 | private class DescendingIterator implements Iterator { 53 | private final ListIterator iter; 54 | 55 | @SuppressWarnings("unchecked") 56 | private DescendingIterator(int index) { 57 | iter = (ListIterator) listIterator(index); 58 | } 59 | 60 | /** 61 | * Check if there is another element on the list. 
62 | * @return if another element 63 | */ 64 | public boolean hasNext() { 65 | return iter.hasPrevious(); 66 | } 67 | 68 | /** 69 | * Get the next element. 70 | * @return the next element. 71 | */ 72 | public E next() { 73 | return iter.previous(); 74 | } 75 | 76 | /** 77 | * Remove the current element. 78 | */ 79 | public void remove() { 80 | iter.remove(); 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/helper/StringUtil.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.helper; 2 | 3 | import java.net.MalformedURLException; 4 | import java.net.URL; 5 | import java.util.Arrays; 6 | import java.util.Collection; 7 | import java.util.Iterator; 8 | 9 | /** 10 | * A minimal String utility class. Designed for internal jsoup use only. 11 | */ 12 | public final class StringUtil { 13 | // memoised padding up to 10 14 | private static final String[] padding = {"", " ", " ", " ", " ", " ", " ", " ", " ", " ", " "}; 15 | 16 | /** 17 | * Join a collection of strings by a separator 18 | * @param strings collection of string objects 19 | * @param sep string to place between strings 20 | * @return joined string 21 | */ 22 | public static String join(Collection strings, String sep) { 23 | return join(strings.iterator(), sep); 24 | } 25 | 26 | /** 27 | * Join a collection of strings by a separator 28 | * @param strings iterator of string objects 29 | * @param sep string to place between strings 30 | * @return joined string 31 | */ 32 | public static String join(Iterator strings, String sep) { 33 | if (!strings.hasNext()) 34 | return ""; 35 | 36 | String start = strings.next().toString(); 37 | if (!strings.hasNext()) // only one, avoid builder 38 | return start; 39 | 40 | StringBuilder sb = new StringBuilder(64).append(start); 41 | while (strings.hasNext()) { 42 | sb.append(sep); 43 | sb.append(strings.next()); 44 | } 45 | return 
sb.toString(); 46 | } 47 | 48 | /** 49 | * Returns space padding 50 | * @param width amount of padding desired 51 | * @return string of spaces * width 52 | */ 53 | public static String padding(int width) { 54 | if (width < 0) 55 | throw new IllegalArgumentException("width must be > 0"); 56 | 57 | if (width < padding.length) 58 | return padding[width]; 59 | 60 | char[] out = new char[width]; 61 | for (int i = 0; i < width; i++) 62 | out[i] = ' '; 63 | return String.valueOf(out); 64 | } 65 | 66 | /** 67 | * Tests if a string is blank: null, empty, or only whitespace (" ", \r\n, \t, etc) 68 | * @param string string to test 69 | * @return if string is blank 70 | */ 71 | public static boolean isBlank(String string) { 72 | if (string == null || string.length() == 0) 73 | return true; 74 | 75 | int l = string.length(); 76 | for (int i = 0; i < l; i++) { 77 | if (!StringUtil.isWhitespace(string.codePointAt(i))) 78 | return false; 79 | } 80 | return true; 81 | } 82 | 83 | /** 84 | * Tests if a string is numeric, i.e. contains only digit characters 85 | * @param string string to test 86 | * @return true if only digit chars, false if empty or null or contains non-digit chars 87 | */ 88 | public static boolean isNumeric(String string) { 89 | if (string == null || string.length() == 0) 90 | return false; 91 | 92 | int l = string.length(); 93 | for (int i = 0; i < l; i++) { 94 | if (!Character.isDigit(string.codePointAt(i))) 95 | return false; 96 | } 97 | return true; 98 | } 99 | 100 | /** 101 | * Tests if a code point is "whitespace" as defined in the HTML spec. 102 | * @param c code point to test 103 | * @return true if code point is whitespace, false otherwise 104 | */ 105 | public static boolean isWhitespace(int c){ 106 | return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r'; 107 | } 108 | 109 | /** 110 | * Normalise the whitespace within this string; multiple spaces collapse to a single, and all whitespace characters 111 | * (e.g. 
newline, tab) convert to a simple space 112 | * @param string content to normalise 113 | * @return normalised string 114 | */ 115 | public static String normaliseWhitespace(String string) { 116 | StringBuilder sb = new StringBuilder(string.length()); 117 | appendNormalisedWhitespace(sb, string, false); 118 | return sb.toString(); 119 | } 120 | 121 | /** 122 | * After normalizing the whitespace within a string, appends it to a string builder. 123 | * @param accum builder to append to 124 | * @param string string to normalize whitespace within 125 | * @param stripLeading set to true if you wish to remove any leading whitespace 126 | */ 127 | public static void appendNormalisedWhitespace(StringBuilder accum, String string, boolean stripLeading) { 128 | boolean lastWasWhite = false; 129 | boolean reachedNonWhite = false; 130 | 131 | int len = string.length(); 132 | int c; 133 | for (int i = 0; i < len; i+= Character.charCount(c)) { 134 | c = string.codePointAt(i); 135 | if (isWhitespace(c)) { 136 | if ((stripLeading && !reachedNonWhite) || lastWasWhite) 137 | continue; 138 | accum.append(' '); 139 | lastWasWhite = true; 140 | } 141 | else { 142 | accum.appendCodePoint(c); 143 | lastWasWhite = false; 144 | reachedNonWhite = true; 145 | } 146 | } 147 | } 148 | 149 | public static boolean in(String needle, String... haystack) { 150 | for (String hay : haystack) { 151 | if (hay.equals(needle)) 152 | return true; 153 | } 154 | return false; 155 | } 156 | 157 | public static boolean inSorted(String needle, String[] haystack) { 158 | return Arrays.binarySearch(haystack, needle) >= 0; 159 | } 160 | 161 | /** 162 | * Create a new absolute URL, from a provided existing absolute URL and a relative URL component. 163 | * @param base the existing absolute base URL 164 | * @param relUrl the relative URL to resolve. 
(If it's already absolute, it will be returned) 165 | * @return the resolved absolute URL 166 | * @throws MalformedURLException if an error occurred generating the URL 167 | */ 168 | public static URL resolve(URL base, String relUrl) throws MalformedURLException { 169 | // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired 170 | if (relUrl.startsWith("?")) 171 | relUrl = base.getPath() + relUrl; 172 | // workaround: //example.com + ./foo = //example.com/./foo, not //example.com/foo 173 | if (relUrl.indexOf('.') == 0 && base.getFile().indexOf('/') != 0) { 174 | base = new URL(base.getProtocol(), base.getHost(), base.getPort(), "/" + base.getFile()); 175 | } 176 | return new URL(base, relUrl); 177 | } 178 | 179 | /** 180 | * Create a new absolute URL, from a provided existing absolute URL and a relative URL component. 181 | * @param baseUrl the existing absolute base URL 182 | * @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned) 183 | * @return an absolute URL if one was able to be generated, or the empty string if not 184 | */ 185 | public static String resolve(final String baseUrl, final String relUrl) { 186 | URL base; 187 | try { 188 | try { 189 | base = new URL(baseUrl); 190 | } catch (MalformedURLException e) { 191 | // the base is unsuitable, but the attribute/rel may be abs on its own, so try that 192 | URL abs = new URL(relUrl); 193 | return abs.toExternalForm(); 194 | } 195 | return resolve(base, relUrl).toExternalForm(); 196 | } catch (MalformedURLException e) { 197 | return ""; 198 | } 199 | 200 | } 201 | } 202 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/helper/Validate.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.helper; 2 | 3 | /** 4 | * Simple validation methods. 
/**
 * Simple validation methods. Designed for jsoup internal use
 */
public final class Validate {

    private Validate() {}

    /**
     * Validates that the object is not null
     * @param obj object to test
     */
    public static void notNull(Object obj) {
        notNull(obj, "Object must not be null");
    }

    /**
     * Validates that the object is not null
     * @param obj object to test
     * @param msg message to output if validation fails
     */
    public static void notNull(Object obj, String msg) {
        if (obj == null) {
            fail(msg);
        }
    }

    /**
     * Validates that the value is true
     * @param val object to test
     */
    public static void isTrue(boolean val) {
        isTrue(val, "Must be true");
    }

    /**
     * Validates that the value is true
     * @param val object to test
     * @param msg message to output if validation fails
     */
    public static void isTrue(boolean val, String msg) {
        if (!val) {
            fail(msg);
        }
    }

    /**
     * Validates that the value is false
     * @param val object to test
     */
    public static void isFalse(boolean val) {
        isFalse(val, "Must be false");
    }

    /**
     * Validates that the value is false
     * @param val object to test
     * @param msg message to output if validation fails
     */
    public static void isFalse(boolean val, String msg) {
        if (val) {
            fail(msg);
        }
    }

    /**
     * Validates that the array contains no null elements
     * @param objects the array to test
     */
    public static void noNullElements(Object[] objects) {
        noNullElements(objects, "Array must not contain any null objects");
    }

    /**
     * Validates that the array contains no null elements
     * @param objects the array to test
     * @param msg message to output if validation fails
     */
    public static void noNullElements(Object[] objects, String msg) {
        for (int i = 0; i < objects.length; i++) {
            if (objects[i] == null) {
                fail(msg);
            }
        }
    }

    /**
     * Validates that the string is not empty
     * @param string the string to test
     */
    public static void notEmpty(String string) {
        notEmpty(string, "String must not be empty");
    }

    /**
     * Validates that the string is not empty
     * @param string the string to test
     * @param msg message to output if validation fails
     */
    public static void notEmpty(String string, String msg) {
        final boolean missing = string == null || string.length() == 0;
        if (missing) {
            fail(msg);
        }
    }

    /**
     Cause a failure.
     @param msg message to output.
     */
    public static void fail(String msg) {
        throw new IllegalArgumentException(msg);
    }
}
javax.xml.transform.stream.StreamResult; 20 | import java.io.StringWriter; 21 | import java.util.HashMap; 22 | 23 | /** 24 | * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document}, 25 | * for integration with toolsets that use the W3C DOM. 26 | */ 27 | public class W3CDom { 28 | protected DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 29 | 30 | /** 31 | * Convert a jsoup Document to a W3C Document. 32 | * @param in jsoup doc 33 | * @return w3c doc 34 | */ 35 | public Document fromJsoup(org.jsoup.nodes.Document in) { 36 | Validate.notNull(in); 37 | DocumentBuilder builder; 38 | try { 39 | //set the factory to be namespace-aware 40 | factory.setNamespaceAware(true); 41 | builder = factory.newDocumentBuilder(); 42 | Document out = builder.newDocument(); 43 | convert(in, out); 44 | return out; 45 | } catch (ParserConfigurationException e) { 46 | throw new IllegalStateException(e); 47 | } 48 | } 49 | 50 | /** 51 | * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output document 52 | * before converting. 53 | * @param in jsoup doc 54 | * @param out w3c doc 55 | * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Document) 56 | */ 57 | public void convert(org.jsoup.nodes.Document in, Document out) { 58 | if (!StringUtil.isBlank(in.location())) 59 | out.setDocumentURI(in.location()); 60 | 61 | org.jsoup.nodes.Element rootEl = in.child(0); // skip the #root node 62 | NodeTraversor traversor = new NodeTraversor(new W3CBuilder(out)); 63 | traversor.traverse(rootEl); 64 | } 65 | 66 | /** 67 | * Implements the conversion by walking the input. 
68 | */ 69 | protected static class W3CBuilder implements NodeVisitor { 70 | private static final String xmlnsKey = "xmlns"; 71 | private static final String xmlnsPrefix = "xmlns:"; 72 | 73 | private final Document doc; 74 | private final HashMap namespaces = new HashMap(); // prefix => urn 75 | private Element dest; 76 | 77 | public W3CBuilder(Document doc) { 78 | this.doc = doc; 79 | } 80 | 81 | public void head(org.jsoup.nodes.Node source, int depth) { 82 | if (source instanceof org.jsoup.nodes.Element) { 83 | org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source; 84 | 85 | String prefix = updateNamespaces(sourceEl); 86 | String namespace = namespaces.get(prefix); 87 | 88 | Element el = doc.createElementNS(namespace, sourceEl.tagName()); 89 | copyAttributes(sourceEl, el); 90 | if (dest == null) { // sets up the root 91 | doc.appendChild(el); 92 | } else { 93 | dest.appendChild(el); 94 | } 95 | dest = el; // descend 96 | } else if (source instanceof org.jsoup.nodes.TextNode) { 97 | org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source; 98 | Text text = doc.createTextNode(sourceText.getWholeText()); 99 | dest.appendChild(text); 100 | } else if (source instanceof org.jsoup.nodes.Comment) { 101 | org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source; 102 | Comment comment = doc.createComment(sourceComment.getData()); 103 | dest.appendChild(comment); 104 | } else if (source instanceof org.jsoup.nodes.DataNode) { 105 | org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source; 106 | Text node = doc.createTextNode(sourceData.getWholeData()); 107 | dest.appendChild(node); 108 | } else { 109 | // unhandled 110 | } 111 | } 112 | 113 | public void tail(org.jsoup.nodes.Node source, int depth) { 114 | if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) { 115 | dest = (Element) dest.getParentNode(); // undescend. cromulent. 
116 | } 117 | } 118 | 119 | private void copyAttributes(org.jsoup.nodes.Node source, Element el) { 120 | for (Attribute attribute : source.attributes()) { 121 | // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.] 122 | String key = attribute.getKey().replaceAll("[^-a-zA-Z0-9_:.]", ""); 123 | if (key.matches("[a-zA-Z_:]{1}[-a-zA-Z0-9_:.]*")) 124 | el.setAttribute(key, attribute.getValue()); 125 | } 126 | } 127 | 128 | /** 129 | * Finds any namespaces defined in this element. Returns any tag prefix. 130 | */ 131 | private String updateNamespaces(org.jsoup.nodes.Element el) { 132 | // scan the element for namespace declarations 133 | // like: xmlns="blah" or xmlns:prefix="blah" 134 | Attributes attributes = el.attributes(); 135 | for (Attribute attr : attributes) { 136 | String key = attr.getKey(); 137 | String prefix; 138 | if (key.equals(xmlnsKey)) { 139 | prefix = ""; 140 | } else if (key.startsWith(xmlnsPrefix)) { 141 | prefix = key.substring(xmlnsPrefix.length()); 142 | } else { 143 | continue; 144 | } 145 | namespaces.put(prefix, attr.getValue()); 146 | } 147 | 148 | // get the element prefix if any 149 | int pos = el.tagName().indexOf(":"); 150 | return pos > 0 ? el.tagName().substring(0, pos) : ""; 151 | } 152 | 153 | } 154 | 155 | /** 156 | * Serialize a W3C document to a String. 
157 | * @param doc Document 158 | * @return Document as string 159 | */ 160 | public String asString(Document doc) { 161 | try { 162 | DOMSource domSource = new DOMSource(doc); 163 | StringWriter writer = new StringWriter(); 164 | StreamResult result = new StreamResult(writer); 165 | TransformerFactory tf = TransformerFactory.newInstance(); 166 | Transformer transformer = tf.newTransformer(); 167 | transformer.transform(domSource, result); 168 | return writer.toString(); 169 | } catch (TransformerException e) { 170 | throw new IllegalStateException(e); 171 | } 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/nodes/Attribute.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import org.jsoup.SerializationException; 4 | import org.jsoup.helper.Validate; 5 | 6 | import java.io.IOException; 7 | import java.util.Arrays; 8 | import java.util.Map; 9 | 10 | /** 11 | A single key + value attribute. Keys are trimmed and normalised to lower-case. 12 | 13 | @author Jonathan Hedley, jonathan@hedley.net */ 14 | public class Attribute implements Map.Entry, Cloneable { 15 | private static final String[] booleanAttributes = { 16 | "allowfullscreen", "async", "autofocus", "checked", "compact", "declare", "default", "defer", "disabled", 17 | "formnovalidate", "hidden", "inert", "ismap", "itemscope", "multiple", "muted", "nohref", "noresize", 18 | "noshade", "novalidate", "nowrap", "open", "readonly", "required", "reversed", "seamless", "selected", 19 | "sortable", "truespeed", "typemustmatch" 20 | }; 21 | 22 | private String key; 23 | private String value; 24 | 25 | /** 26 | * Create a new attribute from unencoded (raw) key and value. 27 | * @param key attribute key; case is preserved. 
28 | * @param value attribute value 29 | * @see #createFromEncoded 30 | */ 31 | public Attribute(String key, String value) { 32 | Validate.notNull(key); 33 | Validate.notNull(value); 34 | this.key = key.trim(); 35 | Validate.notEmpty(key); // trimming could potentially make empty, so validate here 36 | this.value = value; 37 | } 38 | 39 | /** 40 | Get the attribute key. 41 | @return the attribute key 42 | */ 43 | public String getKey() { 44 | return key; 45 | } 46 | 47 | /** 48 | Set the attribute key; case is preserved. 49 | @param key the new key; must not be null 50 | */ 51 | public void setKey(String key) { 52 | Validate.notEmpty(key); 53 | this.key = key.trim(); 54 | } 55 | 56 | /** 57 | Get the attribute value. 58 | @return the attribute value 59 | */ 60 | public String getValue() { 61 | return value; 62 | } 63 | 64 | /** 65 | Set the attribute value. 66 | @param value the new attribute value; must not be null 67 | */ 68 | public String setValue(String value) { 69 | Validate.notNull(value); 70 | String old = this.value; 71 | this.value = value; 72 | return old; 73 | } 74 | 75 | /** 76 | Get the HTML representation of this attribute; e.g. {@code href="index.html"}. 77 | @return HTML 78 | */ 79 | public String html() { 80 | StringBuilder accum = new StringBuilder(); 81 | 82 | try { 83 | html(accum, (new Document("")).outputSettings()); 84 | } catch(IOException exception) { 85 | throw new SerializationException(exception); 86 | } 87 | return accum.toString(); 88 | } 89 | 90 | protected void html(Appendable accum, Document.OutputSettings out) throws IOException { 91 | accum.append(key); 92 | if (!shouldCollapseAttribute(out)) { 93 | accum.append("=\""); 94 | Entities.escape(accum, value, out, true, false, false); 95 | accum.append('"'); 96 | } 97 | } 98 | 99 | /** 100 | Get the string representation of this attribute, implemented as {@link #html()}. 
101 | @return string 102 | */ 103 | @Override 104 | public String toString() { 105 | return html(); 106 | } 107 | 108 | /** 109 | * Create a new Attribute from an unencoded key and a HTML attribute encoded value. 110 | * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars. 111 | * @param encodedValue HTML attribute encoded value 112 | * @return attribute 113 | */ 114 | public static Attribute createFromEncoded(String unencodedKey, String encodedValue) { 115 | String value = Entities.unescape(encodedValue, true); 116 | return new Attribute(unencodedKey, value); 117 | } 118 | 119 | protected boolean isDataAttribute() { 120 | return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length(); 121 | } 122 | 123 | /** 124 | * Collapsible if it's a boolean attribute and value is empty or same as name 125 | * 126 | * @param out output settings 127 | * @return Returns whether collapsible or not 128 | */ 129 | protected final boolean shouldCollapseAttribute(Document.OutputSettings out) { 130 | return ("".equals(value) || value.equalsIgnoreCase(key)) 131 | && out.syntax() == Document.OutputSettings.Syntax.html 132 | && isBooleanAttribute(); 133 | } 134 | 135 | protected boolean isBooleanAttribute() { 136 | return Arrays.binarySearch(booleanAttributes, key) >= 0; 137 | } 138 | 139 | @Override 140 | public boolean equals(Object o) { 141 | if (this == o) return true; 142 | if (!(o instanceof Attribute)) return false; 143 | 144 | Attribute attribute = (Attribute) o; 145 | 146 | if (key != null ? !key.equals(attribute.key) : attribute.key != null) return false; 147 | return !(value != null ? !value.equals(attribute.value) : attribute.value != null); 148 | } 149 | 150 | @Override 151 | public int hashCode() { 152 | int result = key != null ? key.hashCode() : 0; 153 | result = 31 * result + (value != null ? 
value.hashCode() : 0); 154 | return result; 155 | } 156 | 157 | @Override 158 | public Attribute clone() { 159 | try { 160 | return (Attribute) super.clone(); // only fields are immutable strings key and value, so no more deep copy required 161 | } catch (CloneNotSupportedException e) { 162 | throw new RuntimeException(e); 163 | } 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/nodes/BooleanAttribute.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | /** 4 | * A boolean attribute that is written out without any value. 5 | */ 6 | public class BooleanAttribute extends Attribute { 7 | /** 8 | * Create a new boolean attribute from unencoded (raw) key. 9 | * @param key attribute key 10 | */ 11 | public BooleanAttribute(String key) { 12 | super(key, ""); 13 | } 14 | 15 | @Override 16 | protected boolean isBooleanAttribute() { 17 | return true; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/nodes/Comment.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import java.io.IOException; 4 | 5 | /** 6 | A comment node. 7 | 8 | @author Jonathan Hedley, jonathan@hedley.net */ 9 | public class Comment extends Node { 10 | private static final String COMMENT_KEY = "comment"; 11 | 12 | /** 13 | Create a new comment node. 14 | @param data The contents of the comment 15 | @param baseUri base URI 16 | */ 17 | public Comment(String data, String baseUri) { 18 | super(baseUri); 19 | attributes.put(COMMENT_KEY, data); 20 | } 21 | 22 | public String nodeName() { 23 | return "#comment"; 24 | } 25 | 26 | /** 27 | Get the contents of the comment. 
28 | @return comment content 29 | */ 30 | public String getData() { 31 | return attributes.get(COMMENT_KEY); 32 | } 33 | 34 | void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException { 35 | if (out.prettyPrint()) 36 | indent(accum, depth, out); 37 | accum 38 | .append(""); 41 | } 42 | 43 | void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {} 44 | 45 | @Override 46 | public String toString() { 47 | return outerHtml(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/nodes/DataNode.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import java.io.IOException; 4 | 5 | /** 6 | A data node, for contents of style, script tags etc, where contents should not show in text(). 7 | 8 | @author Jonathan Hedley, jonathan@hedley.net */ 9 | public class DataNode extends Node{ 10 | private static final String DATA_KEY = "data"; 11 | 12 | /** 13 | Create a new DataNode. 14 | @param data data contents 15 | @param baseUri base URI 16 | */ 17 | public DataNode(String data, String baseUri) { 18 | super(baseUri); 19 | attributes.put(DATA_KEY, data); 20 | } 21 | 22 | public String nodeName() { 23 | return "#data"; 24 | } 25 | 26 | /** 27 | Get the data contents of this node. Will be unescaped and with original new lines, space etc. 28 | @return data 29 | */ 30 | public String getWholeData() { 31 | return attributes.get(DATA_KEY); 32 | } 33 | 34 | /** 35 | * Set the data contents of this node. 
36 | * @param data unencoded data 37 | * @return this node, for chaining 38 | */ 39 | public DataNode setWholeData(String data) { 40 | attributes.put(DATA_KEY, data); 41 | return this; 42 | } 43 | 44 | void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException { 45 | accum.append(getWholeData()); // data is not escaped in return from data nodes, so " in script, style is plain 46 | } 47 | 48 | void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {} 49 | 50 | @Override 51 | public String toString() { 52 | return outerHtml(); 53 | } 54 | 55 | /** 56 | Create a new DataNode from HTML encoded data. 57 | @param encodedData encoded data 58 | @param baseUri bass URI 59 | @return new DataNode 60 | */ 61 | public static DataNode createFromEncoded(String encodedData, String baseUri) { 62 | String data = Entities.unescape(encodedData); 63 | return new DataNode(data, baseUri); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/nodes/DocumentType.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import java.io.IOException; 4 | 5 | import org.jsoup.helper.StringUtil; 6 | import org.jsoup.nodes.Document.OutputSettings.*; 7 | 8 | /** 9 | * A {@code } node. 10 | */ 11 | public class DocumentType extends Node { 12 | public static final String PUBLIC_KEY = "PUBLIC"; 13 | public static final String SYSTEM_KEY = "SYSTEM"; 14 | private static final String NAME = "name"; 15 | private static final String PUB_SYS_KEY = "pubSysKey"; // PUBLIC or SYSTEM 16 | private static final String PUBLIC_ID = "publicId"; 17 | private static final String SYSTEM_ID = "systemId"; 18 | // todo: quirk mode from publicId and systemId 19 | 20 | /** 21 | * Create a new doctype element. 
22 | * @param name the doctype's name 23 | * @param publicId the doctype's public ID 24 | * @param systemId the doctype's system ID 25 | * @param baseUri the doctype's base URI 26 | */ 27 | public DocumentType(String name, String publicId, String systemId, String baseUri) { 28 | super(baseUri); 29 | 30 | attr(NAME, name); 31 | attr(PUBLIC_ID, publicId); 32 | if (has(PUBLIC_ID)) { 33 | attr(PUB_SYS_KEY, PUBLIC_KEY); 34 | } 35 | attr(SYSTEM_ID, systemId); 36 | } 37 | 38 | /** 39 | * Create a new doctype element. 40 | * @param name the doctype's name 41 | * @param publicId the doctype's public ID 42 | * @param systemId the doctype's system ID 43 | * @param baseUri the doctype's base URI 44 | */ 45 | public DocumentType(String name, String pubSysKey, String publicId, String systemId, String baseUri) { 46 | super(baseUri); 47 | 48 | attr(NAME, name); 49 | if (pubSysKey != null) { 50 | attr(PUB_SYS_KEY, pubSysKey); 51 | } 52 | attr(PUBLIC_ID, publicId); 53 | attr(SYSTEM_ID, systemId); 54 | } 55 | 56 | @Override 57 | public String nodeName() { 58 | return "#doctype"; 59 | } 60 | 61 | @Override 62 | void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException { 63 | if (out.syntax() == Syntax.html && !has(PUBLIC_ID) && !has(SYSTEM_ID)) { 64 | // looks like a html5 doctype, go lowercase for aesthetics 65 | accum.append("'); 78 | } 79 | 80 | @Override 81 | void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) { 82 | } 83 | 84 | private boolean has(final String attribute) { 85 | return !StringUtil.isBlank(attr(attribute)); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/nodes/FormElement.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import org.jsoup.Connection; 4 | import org.jsoup.Jsoup; 5 | import org.jsoup.helper.HttpConnection; 6 | import 
org.jsoup.helper.Validate; 7 | import org.jsoup.parser.Tag; 8 | import org.jsoup.select.Elements; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | /** 14 | * A HTML Form Element provides ready access to the form fields/controls that are associated with it. It also allows a 15 | * form to easily be submitted. 16 | */ 17 | public class FormElement extends Element { 18 | private final Elements elements = new Elements(); 19 | 20 | /** 21 | * Create a new, standalone form element. 22 | * 23 | * @param tag tag of this element 24 | * @param baseUri the base URI 25 | * @param attributes initial attributes 26 | */ 27 | public FormElement(Tag tag, String baseUri, Attributes attributes) { 28 | super(tag, baseUri, attributes); 29 | } 30 | 31 | /** 32 | * Get the list of form control elements associated with this form. 33 | * @return form controls associated with this element. 34 | */ 35 | public Elements elements() { 36 | return elements; 37 | } 38 | 39 | /** 40 | * Add a form control element to this form. 41 | * @param element form control to add 42 | * @return this form element, for chaining 43 | */ 44 | public FormElement addElement(Element element) { 45 | elements.add(element); 46 | return this; 47 | } 48 | 49 | /** 50 | * Prepare to submit this form. A Connection object is created with the request set up from the form values. You 51 | * can then set up other options (like user-agent, timeout, cookies), then execute it. 52 | * @return a connection prepared from the values of this form. 53 | * @throws IllegalArgumentException if the form's absolute action URL cannot be determined. Make sure you pass the 54 | * document's base URI when parsing. 55 | */ 56 | public Connection submit() { 57 | String action = hasAttr("action") ? absUrl("action") : baseUri(); 58 | Validate.notEmpty(action, "Could not determine a form action URL for submit. 
Ensure you set a base URI when parsing."); 59 | Connection.Method method = attr("method").toUpperCase().equals("POST") ? 60 | Connection.Method.POST : Connection.Method.GET; 61 | 62 | return Jsoup.connect(action) 63 | .data(formData()) 64 | .method(method); 65 | } 66 | 67 | /** 68 | * Get the data that this form submits. The returned list is a copy of the data, and changes to the contents of the 69 | * list will not be reflected in the DOM. 70 | * @return a list of key vals 71 | */ 72 | public List formData() { 73 | ArrayList data = new ArrayList(); 74 | 75 | // iterate the form control elements and accumulate their values 76 | for (Element el: elements) { 77 | if (!el.tag().isFormSubmittable()) continue; // contents are form listable, superset of submitable 78 | if (el.hasAttr("disabled")) continue; // skip disabled form inputs 79 | String name = el.attr("name"); 80 | if (name.length() == 0) continue; 81 | String type = el.attr("type"); 82 | 83 | if ("select".equals(el.tagName())) { 84 | Elements options = el.select("option[selected]"); 85 | boolean set = false; 86 | for (Element option: options) { 87 | data.add(HttpConnection.KeyVal.create(name, option.val())); 88 | set = true; 89 | } 90 | if (!set) { 91 | Element option = el.select("option").first(); 92 | if (option != null) 93 | data.add(HttpConnection.KeyVal.create(name, option.val())); 94 | } 95 | } else if ("checkbox".equalsIgnoreCase(type) || "radio".equalsIgnoreCase(type)) { 96 | // only add checkbox or radio if they have the checked attribute 97 | if (el.hasAttr("checked")) { 98 | final String val = el.val().length() > 0 ? 
el.val() : "on"; 99 | data.add(HttpConnection.KeyVal.create(name, val)); 100 | } 101 | } else { 102 | data.add(HttpConnection.KeyVal.create(name, el.val())); 103 | } 104 | } 105 | return data; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/nodes/TextNode.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import java.io.IOException; 4 | 5 | import org.jsoup.helper.StringUtil; 6 | import org.jsoup.helper.Validate; 7 | 8 | /** 9 | A text node. 10 | 11 | @author Jonathan Hedley, jonathan@hedley.net */ 12 | public class TextNode extends Node { 13 | /* 14 | TextNode is a node, and so by default comes with attributes and children. The attributes are seldom used, but use 15 | memory, and the child nodes are never used. So we don't have them, and override accessors to attributes to create 16 | them as needed on the fly. 17 | */ 18 | private static final String TEXT_KEY = "text"; 19 | String text; 20 | 21 | /** 22 | Create a new TextNode representing the supplied (unencoded) text). 23 | 24 | @param text raw text 25 | @param baseUri base uri 26 | @see #createFromEncoded(String, String) 27 | */ 28 | public TextNode(String text, String baseUri) { 29 | this.baseUri = baseUri; 30 | this.text = text; 31 | } 32 | 33 | public String nodeName() { 34 | return "#text"; 35 | } 36 | 37 | /** 38 | * Get the text content of this text node. 39 | * @return Unencoded, normalised text. 40 | * @see TextNode#getWholeText() 41 | */ 42 | public String text() { 43 | return normaliseWhitespace(getWholeText()); 44 | } 45 | 46 | /** 47 | * Set the text content of this text node. 
48 | * @param text unencoded text 49 | * @return this, for chaining 50 | */ 51 | public TextNode text(String text) { 52 | this.text = text; 53 | if (attributes != null) 54 | attributes.put(TEXT_KEY, text); 55 | return this; 56 | } 57 | 58 | /** 59 | Get the (unencoded) text of this text node, including any newlines and spaces present in the original. 60 | @return text 61 | */ 62 | public String getWholeText() { 63 | return attributes == null ? text : attributes.get(TEXT_KEY); 64 | } 65 | 66 | /** 67 | Test if this text node is blank -- that is, empty or only whitespace (including newlines). 68 | @return true if this document is empty or only whitespace, false if it contains any text content. 69 | */ 70 | public boolean isBlank() { 71 | return StringUtil.isBlank(getWholeText()); 72 | } 73 | 74 | /** 75 | * Split this text node into two nodes at the specified string offset. After splitting, this node will contain the 76 | * original text up to the offset, and will have a new text node sibling containing the text after the offset. 77 | * @param offset string offset point to split node at. 78 | * @return the newly created text node containing the text after the offset. 
79 | */ 80 | public TextNode splitText(int offset) { 81 | Validate.isTrue(offset >= 0, "Split offset must be not be negative"); 82 | Validate.isTrue(offset < text.length(), "Split offset must not be greater than current text length"); 83 | 84 | String head = getWholeText().substring(0, offset); 85 | String tail = getWholeText().substring(offset); 86 | text(head); 87 | TextNode tailNode = new TextNode(tail, this.baseUri()); 88 | if (parent() != null) 89 | parent().addChildren(siblingIndex()+1, tailNode); 90 | 91 | return tailNode; 92 | } 93 | 94 | void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException { 95 | if (out.prettyPrint() && ((siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) || (out.outline() && siblingNodes().size()>0 && !isBlank()) )) 96 | indent(accum, depth, out); 97 | 98 | boolean normaliseWhite = out.prettyPrint() && parent() instanceof Element 99 | && !Element.preserveWhitespace(parent()); 100 | Entities.escape(accum, getWholeText(), out, false, normaliseWhite, false); 101 | } 102 | 103 | void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {} 104 | 105 | @Override 106 | public String toString() { 107 | return outerHtml(); 108 | } 109 | 110 | /** 111 | * Create a new TextNode from HTML encoded (aka escaped) data. 112 | * @param encodedText Text containing encoded HTML (e.g. &lt;) 113 | * @param baseUri Base uri 114 | * @return TextNode containing unencoded data (e.g. 
<) 115 | */ 116 | public static TextNode createFromEncoded(String encodedText, String baseUri) { 117 | String text = Entities.unescape(encodedText); 118 | return new TextNode(text, baseUri); 119 | } 120 | 121 | static String normaliseWhitespace(String text) { 122 | text = StringUtil.normaliseWhitespace(text); 123 | return text; 124 | } 125 | 126 | static String stripLeadingWhitespace(String text) { 127 | return text.replaceFirst("^\\s+", ""); 128 | } 129 | 130 | static boolean lastCharIsWhitespace(StringBuilder sb) { 131 | return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; 132 | } 133 | 134 | // attribute fiddling. create on first access. 135 | private void ensureAttributes() { 136 | if (attributes == null) { 137 | attributes = new Attributes(); 138 | attributes.put(TEXT_KEY, text); 139 | } 140 | } 141 | 142 | @Override 143 | public String attr(String attributeKey) { 144 | ensureAttributes(); 145 | return super.attr(attributeKey); 146 | } 147 | 148 | @Override 149 | public Attributes attributes() { 150 | ensureAttributes(); 151 | return super.attributes(); 152 | } 153 | 154 | @Override 155 | public Node attr(String attributeKey, String attributeValue) { 156 | ensureAttributes(); 157 | return super.attr(attributeKey, attributeValue); 158 | } 159 | 160 | @Override 161 | public boolean hasAttr(String attributeKey) { 162 | ensureAttributes(); 163 | return super.hasAttr(attributeKey); 164 | } 165 | 166 | @Override 167 | public Node removeAttr(String attributeKey) { 168 | ensureAttributes(); 169 | return super.removeAttr(attributeKey); 170 | } 171 | 172 | @Override 173 | public String absUrl(String attributeKey) { 174 | ensureAttributes(); 175 | return super.absUrl(attributeKey); 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/nodes/XmlDeclaration.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import 
org.jsoup.helper.Validate; 4 | 5 | import java.io.IOException; 6 | 7 | /** 8 | An XML Declaration. 9 | 10 | @author Jonathan Hedley, jonathan@hedley.net */ 11 | public class XmlDeclaration extends Node { 12 | private final String name; 13 | private final boolean isProcessingInstruction; // "); 58 | } 59 | 60 | void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {} 61 | 62 | @Override 63 | public String toString() { 64 | return outerHtml(); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/nodes/entities-base.properties: -------------------------------------------------------------------------------- 1 | AElig=5i;1c 2 | AMP=12;2 3 | Aacute=5d;17 4 | Acirc=5e;18 5 | Agrave=5c;16 6 | Aring=5h;1b 7 | Atilde=5f;19 8 | Auml=5g;1a 9 | COPY=4p;h 10 | Ccedil=5j;1d 11 | ETH=5s;1m 12 | Eacute=5l;1f 13 | Ecirc=5m;1g 14 | Egrave=5k;1e 15 | Euml=5n;1h 16 | GT=1q;6 17 | Iacute=5p;1j 18 | Icirc=5q;1k 19 | Igrave=5o;1i 20 | Iuml=5r;1l 21 | LT=1o;4 22 | Ntilde=5t;1n 23 | Oacute=5v;1p 24 | Ocirc=5w;1q 25 | Ograve=5u;1o 26 | Oslash=60;1u 27 | Otilde=5x;1r 28 | Ouml=5y;1s 29 | QUOT=y;0 30 | REG=4u;n 31 | THORN=66;20 32 | Uacute=62;1w 33 | Ucirc=63;1x 34 | Ugrave=61;1v 35 | Uuml=64;1y 36 | Yacute=65;1z 37 | aacute=69;23 38 | acirc=6a;24 39 | acute=50;u 40 | aelig=6e;28 41 | agrave=68;22 42 | amp=12;3 43 | aring=6d;27 44 | atilde=6b;25 45 | auml=6c;26 46 | brvbar=4m;e 47 | ccedil=6f;29 48 | cedil=54;y 49 | cent=4i;a 50 | copy=4p;i 51 | curren=4k;c 52 | deg=4w;q 53 | divide=6v;2p 54 | eacute=6h;2b 55 | ecirc=6i;2c 56 | egrave=6g;2a 57 | eth=6o;2i 58 | euml=6j;2d 59 | frac12=59;13 60 | frac14=58;12 61 | frac34=5a;14 62 | gt=1q;7 63 | iacute=6l;2f 64 | icirc=6m;2g 65 | iexcl=4h;9 66 | igrave=6k;2e 67 | iquest=5b;15 68 | iuml=6n;2h 69 | laquo=4r;k 70 | lt=1o;5 71 | macr=4v;p 72 | micro=51;v 73 | middot=53;x 74 | nbsp=4g;8 75 | not=4s;l 76 | ntilde=6p;2j 77 | oacute=6r;2l 78 | ocirc=6s;2m 79 | 
ograve=6q;2k 80 | ordf=4q;j 81 | ordm=56;10 82 | oslash=6w;2q 83 | otilde=6t;2n 84 | ouml=6u;2o 85 | para=52;w 86 | plusmn=4x;r 87 | pound=4j;b 88 | quot=y;1 89 | raquo=57;11 90 | reg=4u;o 91 | sect=4n;f 92 | shy=4t;m 93 | sup1=55;z 94 | sup2=4y;s 95 | sup3=4z;t 96 | szlig=67;21 97 | thorn=72;2w 98 | times=5z;1t 99 | uacute=6y;2s 100 | ucirc=6z;2t 101 | ugrave=6x;2r 102 | uml=4o;g 103 | uuml=70;2u 104 | yacute=71;2v 105 | yen=4l;d 106 | yuml=73;2x 107 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/nodes/entities-xhtml.properties: -------------------------------------------------------------------------------- 1 | amp=12;1 2 | gt=1q;3 3 | lt=1o;2 4 | quot=y;0 5 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/nodes/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | HTML document structure nodes. 3 | */ 4 | package org.jsoup.nodes; -------------------------------------------------------------------------------- /src/main/java/org/jsoup/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | Contains the main {@link org.jsoup.Jsoup} class, which provides convenient static access to the jsoup functionality. 3 | */ 4 | package org.jsoup; -------------------------------------------------------------------------------- /src/main/java/org/jsoup/parser/ParseError.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.parser; 2 | 3 | /** 4 | * A Parse Error records an error in the input HTML that occurs in either the tokenisation or the tree building phase. 
5 | */ 6 | public class ParseError { 7 | private int pos; 8 | private String errorMsg; 9 | 10 | ParseError(int pos, String errorMsg) { 11 | this.pos = pos; 12 | this.errorMsg = errorMsg; 13 | } 14 | 15 | ParseError(int pos, String errorFormat, Object... args) { 16 | this.errorMsg = String.format(errorFormat, args); 17 | this.pos = pos; 18 | } 19 | 20 | /** 21 | * Retrieve the error message. 22 | * @return the error message. 23 | */ 24 | public String getErrorMessage() { 25 | return errorMsg; 26 | } 27 | 28 | /** 29 | * Retrieves the offset of the error. 30 | * @return error offset within input 31 | */ 32 | public int getPosition() { 33 | return pos; 34 | } 35 | 36 | @Override 37 | public String toString() { 38 | return pos + ": " + errorMsg; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/parser/ParseErrorList.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.parser; 2 | 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * A container for ParseErrors. 
7 | * 8 | * @author Jonathan Hedley 9 | */ 10 | public class ParseErrorList extends ArrayList{ 11 | private static final int INITIAL_CAPACITY = 16; 12 | private final int maxSize; 13 | 14 | ParseErrorList(int initialCapacity, int maxSize) { 15 | super(initialCapacity); 16 | this.maxSize = maxSize; 17 | } 18 | 19 | boolean canAddError() { 20 | return size() < maxSize; 21 | } 22 | 23 | int getMaxSize() { 24 | return maxSize; 25 | } 26 | 27 | public static ParseErrorList noTracking() { 28 | return new ParseErrorList(0, 0); 29 | } 30 | 31 | public static ParseErrorList tracking(int maxSize) { 32 | return new ParseErrorList(INITIAL_CAPACITY, maxSize); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/parser/ParseSettings.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.parser; 2 | 3 | import org.jsoup.nodes.Attribute; 4 | import org.jsoup.nodes.Attributes; 5 | 6 | /** 7 | * Controls parser settings, to optionally preserve tag and/or attribute name case. 8 | */ 9 | public class ParseSettings { 10 | /** 11 | * HTML default settings: both tag and attribute names are lower-cased during parsing. 12 | */ 13 | public static final ParseSettings htmlDefault; 14 | /** 15 | * Preserve both tag and attribute case. 16 | */ 17 | public static final ParseSettings preserveCase; 18 | 19 | static { 20 | htmlDefault = new ParseSettings(false, false); 21 | preserveCase = new ParseSettings(true, true); 22 | } 23 | 24 | private final boolean preserveTagCase; 25 | private final boolean preserveAttributeCase; 26 | 27 | /** 28 | * Define parse settings. 29 | * @param tag preserve tag case? 30 | * @param attribute preserve attribute name case? 
31 | */ 32 | public ParseSettings(boolean tag, boolean attribute) { 33 | preserveTagCase = tag; 34 | preserveAttributeCase = attribute; 35 | } 36 | 37 | String normalizeTag(String name) { 38 | name = name.trim(); 39 | if (!preserveTagCase) 40 | name = name.toLowerCase(); 41 | return name; 42 | } 43 | 44 | String normalizeAttribute(String name) { 45 | name = name.trim(); 46 | if (!preserveAttributeCase) 47 | name = name.toLowerCase(); 48 | return name; 49 | } 50 | 51 | Attributes normalizeAttributes(Attributes attributes) { 52 | if (!preserveAttributeCase) { 53 | for (Attribute attr : attributes) { 54 | attr.setKey(attr.getKey().toLowerCase()); 55 | } 56 | } 57 | return attributes; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/parser/Parser.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.parser; 2 | 3 | import org.jsoup.nodes.Document; 4 | import org.jsoup.nodes.Element; 5 | import org.jsoup.nodes.Node; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the more convenient parse methods 11 | * in {@link org.jsoup.Jsoup}. 12 | */ 13 | public class Parser { 14 | private static final int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled. 15 | 16 | private TreeBuilder treeBuilder; 17 | private int maxErrors = DEFAULT_MAX_ERRORS; 18 | private ParseErrorList errors; 19 | private ParseSettings settings; 20 | 21 | /** 22 | * Create a new Parser, using the specified TreeBuilder 23 | * @param treeBuilder TreeBuilder to use to parse input into Documents. 24 | */ 25 | public Parser(TreeBuilder treeBuilder) { 26 | this.treeBuilder = treeBuilder; 27 | settings = treeBuilder.defaultSettings(); 28 | } 29 | 30 | public Document parseInput(String html, String baseUri) { 31 | errors = isTrackErrors() ? 
ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking(); 32 | return treeBuilder.parse(html, baseUri, errors, settings); 33 | } 34 | 35 | // gets & sets 36 | /** 37 | * Get the TreeBuilder currently in use. 38 | * @return current TreeBuilder. 39 | */ 40 | public TreeBuilder getTreeBuilder() { 41 | return treeBuilder; 42 | } 43 | 44 | /** 45 | * Update the TreeBuilder used when parsing content. 46 | * @param treeBuilder current TreeBuilder 47 | * @return this, for chaining 48 | */ 49 | public Parser setTreeBuilder(TreeBuilder treeBuilder) { 50 | this.treeBuilder = treeBuilder; 51 | return this; 52 | } 53 | 54 | /** 55 | * Check if parse error tracking is enabled. 56 | * @return current track error state. 57 | */ 58 | public boolean isTrackErrors() { 59 | return maxErrors > 0; 60 | } 61 | 62 | /** 63 | * Enable or disable parse error tracking for the next parse. 64 | * @param maxErrors the maximum number of errors to track. Set to 0 to disable. 65 | * @return this, for chaining 66 | */ 67 | public Parser setTrackErrors(int maxErrors) { 68 | this.maxErrors = maxErrors; 69 | return this; 70 | } 71 | 72 | /** 73 | * Retrieve the parse errors, if any, from the last parse. 74 | * @return list of parse errors, up to the size of the maximum errors tracked. 75 | */ 76 | public List getErrors() { 77 | return errors; 78 | } 79 | 80 | public Parser settings(ParseSettings settings) { 81 | this.settings = settings; 82 | return this; 83 | } 84 | 85 | public ParseSettings settings() { 86 | return settings; 87 | } 88 | 89 | // static parse functions below 90 | /** 91 | * Parse HTML into a Document. 92 | * 93 | * @param html HTML to parse 94 | * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 
95 | * 96 | * @return parsed Document 97 | */ 98 | public static Document parse(String html, String baseUri) { 99 | TreeBuilder treeBuilder = new HtmlTreeBuilder(); 100 | return treeBuilder.parse(html, baseUri, ParseErrorList.noTracking(), treeBuilder.defaultSettings()); 101 | } 102 | 103 | /** 104 | * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 105 | * 106 | * @param fragmentHtml the fragment of HTML to parse 107 | * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This 108 | * provides stack context (for implicit element creation). 109 | * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 110 | * 111 | * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. 112 | */ 113 | public static List parseFragment(String fragmentHtml, Element context, String baseUri) { 114 | HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); 115 | return treeBuilder.parseFragment(fragmentHtml, context, baseUri, ParseErrorList.noTracking(), treeBuilder.defaultSettings()); 116 | } 117 | 118 | /** 119 | * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 120 | * 121 | * @param fragmentHtml the fragment of HTML to parse 122 | * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This 123 | * provides stack context (for implicit element creation). 124 | * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 125 | * @param errorList list to add errors to 126 | * 127 | * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. 
128 | */ 129 | public static List parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) { 130 | HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); 131 | return treeBuilder.parseFragment(fragmentHtml, context, baseUri, errorList, treeBuilder.defaultSettings()); 132 | } 133 | 134 | /** 135 | * Parse a fragment of XML into a list of nodes. 136 | * 137 | * @param fragmentXml the fragment of XML to parse 138 | * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 139 | * @return list of nodes parsed from the input XML. 140 | */ 141 | public static List parseXmlFragment(String fragmentXml, String baseUri) { 142 | XmlTreeBuilder treeBuilder = new XmlTreeBuilder(); 143 | return treeBuilder.parseFragment(fragmentXml, baseUri, ParseErrorList.noTracking(), treeBuilder.defaultSettings()); 144 | } 145 | 146 | /** 147 | * Parse a fragment of HTML into the {@code body} of a Document. 148 | * 149 | * @param bodyHtml fragment of HTML 150 | * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 
151 | * 152 | * @return Document, with empty head, and HTML parsed into body 153 | */ 154 | public static Document parseBodyFragment(String bodyHtml, String baseUri) { 155 | Document doc = Document.createShell(baseUri); 156 | Element body = doc.body(); 157 | List nodeList = parseFragment(bodyHtml, body, baseUri); 158 | Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified when re-parented 159 | for (int i = nodes.length - 1; i > 0; i--) { 160 | nodes[i].remove(); 161 | } 162 | for (Node node : nodes) { 163 | body.appendChild(node); 164 | } 165 | return doc; 166 | } 167 | 168 | /** 169 | * Utility method to unescape HTML entities from a string 170 | * @param string HTML escaped string 171 | * @param inAttribute if the string is to be unescaped in strict mode (as attributes are) 172 | * @return an unescaped string 173 | */ 174 | public static String unescapeEntities(String string, boolean inAttribute) { 175 | Tokeniser tokeniser = new Tokeniser(new CharacterReader(string), ParseErrorList.noTracking()); 176 | return tokeniser.unescapeEntities(inAttribute); 177 | } 178 | 179 | /** 180 | * @param bodyHtml HTML to parse 181 | * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 182 | * 183 | * @return parsed Document 184 | * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead. 185 | */ 186 | public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) { 187 | return parse(bodyHtml, baseUri); 188 | } 189 | 190 | // builders 191 | 192 | /** 193 | * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document, 194 | * based on a knowledge of the semantics of the incoming tags. 195 | * @return a new HTML parser. 196 | */ 197 | public static Parser htmlParser() { 198 | return new Parser(new HtmlTreeBuilder()); 199 | } 200 | 201 | /** 202 | * Create a new XML parser. 
This parser assumes no knowledge of the incoming tags and does not treat it as HTML, 203 | * rather creates a simple tree directly from the input. 204 | * @return a new simple XML parser. 205 | */ 206 | public static Parser xmlParser() { 207 | return new Parser(new XmlTreeBuilder()); 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/parser/TreeBuilder.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.parser; 2 | 3 | import org.jsoup.helper.Validate; 4 | import org.jsoup.nodes.Attributes; 5 | import org.jsoup.nodes.Document; 6 | import org.jsoup.nodes.Element; 7 | 8 | import java.util.ArrayList; 9 | 10 | /** 11 | * @author Jonathan Hedley 12 | */ 13 | abstract class TreeBuilder { 14 | CharacterReader reader; 15 | Tokeniser tokeniser; 16 | protected Document doc; // current doc we are building into 17 | protected ArrayList stack; // the stack of open elements 18 | protected String baseUri; // current base uri, for creating new elements 19 | protected Token currentToken; // currentToken is used only for error tracking. 
20 | protected ParseErrorList errors; // null when not tracking errors 21 | protected ParseSettings settings; 22 | 23 | private Token.StartTag start = new Token.StartTag(); // start tag to process 24 | private Token.EndTag end = new Token.EndTag(); 25 | 26 | abstract ParseSettings defaultSettings(); 27 | 28 | protected void initialiseParse(String input, String baseUri, ParseErrorList errors, ParseSettings settings) { 29 | Validate.notNull(input, "String input must not be null"); 30 | Validate.notNull(baseUri, "BaseURI must not be null"); 31 | 32 | doc = new Document(baseUri); 33 | this.settings = settings; 34 | reader = new CharacterReader(input); 35 | this.errors = errors; 36 | tokeniser = new Tokeniser(reader, errors); 37 | stack = new ArrayList(32); 38 | this.baseUri = baseUri; 39 | } 40 | 41 | Document parse(String input, String baseUri, ParseErrorList errors, ParseSettings settings) { 42 | initialiseParse(input, baseUri, errors, settings); 43 | runParser(); 44 | return doc; 45 | } 46 | 47 | protected void runParser() { 48 | while (true) { 49 | Token token = tokeniser.read(); 50 | process(token); 51 | token.reset(); 52 | 53 | if (token.type == Token.TokenType.EOF) 54 | break; 55 | } 56 | } 57 | 58 | protected abstract boolean process(Token token); 59 | 60 | protected boolean processStartTag(String name) { 61 | if (currentToken == start) { // don't recycle an in-use token 62 | return process(new Token.StartTag().name(name)); 63 | } 64 | return process(start.reset().name(name)); 65 | } 66 | 67 | public boolean processStartTag(String name, Attributes attrs) { 68 | if (currentToken == start) { // don't recycle an in-use token 69 | return process(new Token.StartTag().nameAttr(name, attrs)); 70 | } 71 | start.reset(); 72 | start.nameAttr(name, attrs); 73 | return process(start); 74 | } 75 | 76 | protected boolean processEndTag(String name) { 77 | if (currentToken == end) { // don't recycle an in-use token 78 | return process(new Token.EndTag().name(name)); 79 | } 80 
| return process(end.reset().name(name)); 81 | } 82 | 83 | 84 | protected Element currentElement() { 85 | int size = stack.size(); 86 | return size > 0 ? stack.get(size-1) : null; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/parser/XmlTreeBuilder.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.parser; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.helper.Validate; 5 | import org.jsoup.nodes.*; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the 11 | * document. 12 | *

Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}

13 | * 14 | * @author Jonathan Hedley 15 | */ 16 | public class XmlTreeBuilder extends TreeBuilder { 17 | ParseSettings defaultSettings() { 18 | return ParseSettings.preserveCase; 19 | } 20 | 21 | Document parse(String input, String baseUri) { 22 | return parse(input, baseUri, ParseErrorList.noTracking(), ParseSettings.preserveCase); 23 | } 24 | 25 | @Override 26 | protected void initialiseParse(String input, String baseUri, ParseErrorList errors, ParseSettings settings) { 27 | super.initialiseParse(input, baseUri, errors, settings); 28 | stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack) 29 | doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml); 30 | } 31 | 32 | @Override 33 | protected boolean process(Token token) { 34 | // start tag, end tag, doctype, comment, character, eof 35 | switch (token.type) { 36 | case StartTag: 37 | insert(token.asStartTag()); 38 | break; 39 | case EndTag: 40 | popStackToClose(token.asEndTag()); 41 | break; 42 | case Comment: 43 | insert(token.asComment()); 44 | break; 45 | case Character: 46 | insert(token.asCharacter()); 47 | break; 48 | case Doctype: 49 | insert(token.asDoctype()); 50 | break; 51 | case EOF: // could put some normalisation here if desired 52 | break; 53 | default: 54 | Validate.fail("Unexpected token type: " + token.type); 55 | } 56 | return true; 57 | } 58 | 59 | private void insertNode(Node node) { 60 | currentElement().appendChild(node); 61 | } 62 | 63 | Element insert(Token.StartTag startTag) { 64 | Tag tag = Tag.valueOf(startTag.name(), settings); 65 | // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html. 66 | Element el = new Element(tag, baseUri, settings.normalizeAttributes(startTag.attributes)); 67 | insertNode(el); 68 | if (startTag.isSelfClosing()) { 69 | tokeniser.acknowledgeSelfClosingFlag(); 70 | if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output. see above. 
71 | tag.setSelfClosing(); 72 | } else { 73 | stack.add(el); 74 | } 75 | return el; 76 | } 77 | /** Inserts a comment node; a bogus comment whose data starts with "!" or "?" is re-parsed as an element and, when that yields an element, inserted as an XmlDeclaration carrying its attributes instead. */ 78 | void insert(Token.Comment commentToken) { 79 | Comment comment = new Comment(commentToken.getData(), baseUri); 80 | Node insert = comment; 81 | if (commentToken.bogus) { // xml declarations are emitted as bogus comments (which is right for html, but not xml) 82 | // so we do a bit of a hack and parse the data as an element to pull the attributes out 83 | String data = comment.getData(); 84 | if (data.length() > 1 && (data.startsWith("!") || data.startsWith("?"))) { 85 | Document doc = Jsoup.parse("<" + data.substring(1, data.length() -1) + ">", baseUri, Parser.xmlParser()); 86 | if (doc.children().size() > 0) { /* guard: if the data did not parse to an element, child(0) would throw; keep the plain comment instead */ 87 | Element el = doc.child(0); insert = new XmlDeclaration(settings.normalizeTag(el.tagName()), comment.baseUri(), data.startsWith("!")); 88 | insert.attributes().addAll(el.attributes()); } 89 | } 90 | } 91 | insertNode(insert); 92 | } 93 | 94 | void insert(Token.Character characterToken) { 95 | Node node = new TextNode(characterToken.getData(), baseUri); 96 | insertNode(node); 97 | } 98 | 99 | void insert(Token.Doctype d) { 100 | DocumentType doctypeNode = new DocumentType(settings.normalizeTag(d.getName()), d.getPubSysKey(), d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri); 101 | insertNode(doctypeNode); 102 | } 103 | 104 | /** 105 | * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not 106 | * found, skips. 
107 | * 108 | * @param endTag 109 | */ 110 | private void popStackToClose(Token.EndTag endTag) { 111 | String elName = endTag.name(); 112 | Element firstFound = null; 113 | 114 | for (int pos = stack.size() -1; pos >= 0; pos--) { 115 | Element next = stack.get(pos); 116 | if (next.nodeName().equals(elName)) { 117 | firstFound = next; 118 | break; 119 | } 120 | } 121 | if (firstFound == null) 122 | return; // not found, skip 123 | 124 | for (int pos = stack.size() -1; pos >= 0; pos--) { 125 | Element next = stack.get(pos); 126 | stack.remove(pos); 127 | if (next == firstFound) 128 | break; 129 | } 130 | } 131 | 132 | List parseFragment(String inputFragment, String baseUri, ParseErrorList errors, ParseSettings settings) { 133 | initialiseParse(inputFragment, baseUri, errors, settings); 134 | runParser(); 135 | return doc.childNodes(); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/parser/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | Contains the HTML parser, tag specifications, and HTML tokeniser. 
3 | */ 4 | package org.jsoup.parser; 5 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/safety/Cleaner.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.safety; 2 | 3 | import org.jsoup.helper.Validate; 4 | import org.jsoup.nodes.Attribute; 5 | import org.jsoup.nodes.Attributes; 6 | import org.jsoup.nodes.DataNode; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.nodes.Node; 10 | import org.jsoup.nodes.TextNode; 11 | import org.jsoup.parser.ParseErrorList; 12 | import org.jsoup.parser.Parser; 13 | import org.jsoup.parser.Tag; 14 | import org.jsoup.select.NodeTraversor; 15 | import org.jsoup.select.NodeVisitor; 16 | 17 | import java.util.List; 18 | 19 | 20 | /** 21 | The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes 22 | that you are expecting; no junk, and no cross-site scripting attacks! 23 |

24 | The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain 25 | HTML that is allowed by the whitelist. 26 |

27 |

28 | It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the 29 | canned white-lists only allow body contained tags. 30 |

31 |

32 | Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}. 33 |

34 | */ 35 | public class Cleaner { 36 | private Whitelist whitelist; 37 | 38 | /** 39 | Create a new cleaner, that sanitizes documents using the supplied whitelist. 40 | @param whitelist white-list to clean with 41 | */ 42 | public Cleaner(Whitelist whitelist) { 43 | Validate.notNull(whitelist); 44 | this.whitelist = whitelist; 45 | } 46 | 47 | /** 48 | Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist. 49 | The original document is not modified. Only elements from the dirty document's body are used. 50 | @param dirtyDocument Untrusted base document to clean. 51 | @return cleaned document. 52 | */ 53 | public Document clean(Document dirtyDocument) { 54 | Validate.notNull(dirtyDocument); 55 | 56 | Document clean = Document.createShell(dirtyDocument.baseUri()); 57 | if (dirtyDocument.body() != null) // frameset documents won't have a body. the clean doc will have empty body. 58 | copySafeNodes(dirtyDocument.body(), clean.body()); 59 | 60 | return clean; 61 | } 62 | 63 | /** 64 | Determines if the input document body is valid, against the whitelist. It is considered valid if all the tags and attributes 65 | in the input HTML are allowed by the whitelist, and that there is no content in the head. 66 |

67 | This method can be used as a validator for user input. An invalid document will still be cleaned successfully 68 | using the {@link #clean(Document)} method. If using as a validator, it is recommended to still clean the document 69 | to ensure enforced attributes are set correctly, and that the output is tidied. 70 |

71 | @param dirtyDocument document to test 72 | @return true if no tags or attributes need to be removed; false if they do 73 | */ 74 | public boolean isValid(Document dirtyDocument) { 75 | Validate.notNull(dirtyDocument); 76 | 77 | Document clean = Document.createShell(dirtyDocument.baseUri()); 78 | int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body()); 79 | return numDiscarded == 0 80 | && dirtyDocument.head().childNodes().size() == 0; // because we only look at the body, but we start from a shell, make sure there's nothing in the head 81 | } 82 | 83 | public boolean isValidBodyHtml(String bodyHtml) { 84 | Document clean = Document.createShell(""); 85 | Document dirty = Document.createShell(""); 86 | ParseErrorList errorList = ParseErrorList.tracking(1); 87 | List nodes = Parser.parseFragment(bodyHtml, dirty.body(), "", errorList); 88 | dirty.body().insertChildren(0, nodes); 89 | int numDiscarded = copySafeNodes(dirty.body(), clean.body()); 90 | return numDiscarded == 0 && errorList.size() == 0; 91 | } 92 | 93 | /** 94 | Iterates the input and copies trusted nodes (tags, attributes, text) into the destination. 
95 | */ 96 | private final class CleaningVisitor implements NodeVisitor { 97 | private int numDiscarded = 0; 98 | private final Element root; 99 | private Element destination; // current element to append nodes to 100 | 101 | private CleaningVisitor(Element root, Element destination) { 102 | this.root = root; 103 | this.destination = destination; 104 | } 105 | 106 | public void head(Node source, int depth) { 107 | if (source instanceof Element) { 108 | Element sourceEl = (Element) source; 109 | 110 | if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs 111 | ElementMeta meta = createSafeElement(sourceEl); 112 | Element destChild = meta.el; 113 | destination.appendChild(destChild); 114 | 115 | numDiscarded += meta.numAttribsDiscarded; 116 | destination = destChild; 117 | } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. 118 | numDiscarded++; 119 | } 120 | } else if (source instanceof TextNode) { 121 | TextNode sourceText = (TextNode) source; 122 | TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri()); 123 | destination.appendChild(destText); 124 | } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) { 125 | DataNode sourceData = (DataNode) source; 126 | DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri()); 127 | destination.appendChild(destData); 128 | } else { // else, we don't care about comments, xml proc instructions, etc 129 | numDiscarded++; 130 | } 131 | } 132 | 133 | public void tail(Node source, int depth) { 134 | if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) { 135 | destination = destination.parent(); // would have descended, so pop destination stack 136 | } 137 | } 138 | } 139 | 140 | private int copySafeNodes(Element source, Element dest) { 141 | CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest); 142 | NodeTraversor traversor = new 
NodeTraversor(cleaningVisitor); 143 | traversor.traverse(source); 144 | return cleaningVisitor.numDiscarded; 145 | } 146 | 147 | private ElementMeta createSafeElement(Element sourceEl) { 148 | String sourceTag = sourceEl.tagName(); 149 | Attributes destAttrs = new Attributes(); 150 | Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs); 151 | int numDiscarded = 0; 152 | 153 | Attributes sourceAttrs = sourceEl.attributes(); 154 | for (Attribute sourceAttr : sourceAttrs) { 155 | if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) 156 | destAttrs.put(sourceAttr); 157 | else 158 | numDiscarded++; 159 | } 160 | Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag); 161 | destAttrs.addAll(enforcedAttrs); 162 | 163 | return new ElementMeta(dest, numDiscarded); 164 | } 165 | 166 | private static class ElementMeta { 167 | Element el; 168 | int numAttribsDiscarded; 169 | 170 | ElementMeta(Element el, int numAttribsDiscarded) { 171 | this.el = el; 172 | this.numAttribsDiscarded = numAttribsDiscarded; 173 | } 174 | } 175 | 176 | } 177 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/safety/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | Contains the jsoup HTML cleaner, and whitelist definitions. 3 | */ 4 | package org.jsoup.safety; 5 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/select/Collector.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.select; 2 | 3 | import org.jsoup.nodes.Element; 4 | import org.jsoup.nodes.Node; 5 | 6 | /** 7 | * Collects a list of elements that match the supplied criteria. 
8 | * 9 | * @author Jonathan Hedley 10 | */ 11 | public class Collector { 12 | 13 | private Collector() { 14 | } 15 | 16 | /** 17 | Build a list of elements, by visiting root and every descendant of root, and testing it against the evaluator. 18 | @param eval Evaluator to test elements against 19 | @param root root of tree to descend 20 | @return list of matches; empty if none 21 | */ 22 | public static Elements collect (Evaluator eval, Element root) { 23 | Elements elements = new Elements(); 24 | new NodeTraversor(new Accumulator(root, elements, eval)).traverse(root); 25 | return elements; 26 | } 27 | 28 | private static class Accumulator implements NodeVisitor { 29 | private final Element root; 30 | private final Elements elements; 31 | private final Evaluator eval; 32 | 33 | Accumulator(Element root, Elements elements, Evaluator eval) { 34 | this.root = root; 35 | this.elements = elements; 36 | this.eval = eval; 37 | } 38 | 39 | public void head(Node node, int depth) { 40 | if (node instanceof Element) { 41 | Element el = (Element) node; 42 | if (eval.matches(root, el)) 43 | elements.add(el); 44 | } 45 | } 46 | 47 | public void tail(Node node, int depth) { 48 | // void 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/select/CombiningEvaluator.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.select; 2 | 3 | import org.jsoup.helper.StringUtil; 4 | import org.jsoup.nodes.Element; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.Collection; 9 | 10 | /** 11 | * Base combining (and, or) evaluator. 
12 | */ 13 | abstract class CombiningEvaluator extends Evaluator { 14 | /* child evaluators, in the order they were added */ final ArrayList<Evaluator> evaluators; 15 | /* cached evaluators.size(); refreshed only by updateNumEvaluators() */ int num = 0; 16 | 17 | CombiningEvaluator() { 18 | super(); 19 | evaluators = new ArrayList<Evaluator>(); 20 | } 21 | 22 | CombiningEvaluator(Collection<Evaluator> evaluators) { 23 | this(); 24 | this.evaluators.addAll(evaluators); 25 | updateNumEvaluators(); 26 | } 27 | 28 | /* most recently added evaluator, or null when there are none */ Evaluator rightMostEvaluator() { 29 | return num > 0 ? evaluators.get(num - 1) : null; 30 | } 31 | 32 | /* swaps the last evaluator; index is num - 1, so this throws when no evaluator has been added */ void replaceRightMostEvaluator(Evaluator replacement) { 33 | evaluators.set(num - 1, replacement); 34 | } 35 | 36 | void updateNumEvaluators() { 37 | // used so we don't need to bash on size() for every match test 38 | num = evaluators.size(); 39 | } 40 | 41 | /* matches only when every child evaluator matches (vacuously true when empty) */ static final class And extends CombiningEvaluator { 42 | And(Collection<Evaluator> evaluators) { 43 | super(evaluators); 44 | } 45 | 46 | And(Evaluator... evaluators) { 47 | this(Arrays.asList(evaluators)); 48 | } 49 | 50 | @Override 51 | public boolean matches(Element root, Element node) { 52 | for (int i = 0; i < num; i++) { 53 | Evaluator s = evaluators.get(i); 54 | if (!s.matches(root, node)) 55 | return false; 56 | } 57 | return true; 58 | } 59 | 60 | @Override 61 | public String toString() { 62 | return StringUtil.join(evaluators, " "); 63 | } 64 | } 65 | 66 | /* matches when at least one child evaluator matches (false when empty) */ static final class Or extends CombiningEvaluator { 67 | /** 68 | * Create a new Or evaluator. The initial evaluators are ANDed together and used as the first clause of the OR. 69 | * @param evaluators initial OR clause (these are wrapped into an AND evaluator). 70 | */ 71 | Or(Collection<Evaluator> evaluators) { 72 | super(); 73 | /* num is still 0 straight after super(), so the size of the argument must be tested instead */ if (evaluators.size() > 1) 74 | this.evaluators.add(new And(evaluators)); 75 | else // 0 or 1 76 | this.evaluators.addAll(evaluators); 77 | updateNumEvaluators(); 78 | } 79 | 80 | Or(Evaluator...
evaluators) { this(Arrays.asList(evaluators)); } 81 | 82 | Or() { 83 | super(); 84 | } 85 | 86 | /* appends another alternative clause and refreshes the cached count */ public void add(Evaluator e) { 87 | evaluators.add(e); 88 | updateNumEvaluators(); 89 | } 90 | 91 | @Override 92 | public boolean matches(Element root, Element node) { 93 | for (int i = 0; i < num; i++) { 94 | Evaluator s = evaluators.get(i); 95 | if (s.matches(root, node)) 96 | return true; 97 | } 98 | return false; 99 | } 100 | 101 | @Override 102 | public String toString() { 103 | return String.format(":or%s", evaluators); 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/select/NodeTraversor.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.select; 2 | 3 | import org.jsoup.nodes.Node; 4 | 5 | /** 6 | * Depth-first node traversor. Use to iterate through all nodes under and including the specified root node. 7 | *

8 | * This implementation does not use recursion, so a deep DOM does not risk blowing the stack. 9 | *

10 | */ 11 | public class NodeTraversor { 12 | private NodeVisitor visitor; 13 | 14 | /** 15 | * Create a new traversor. 16 | * @param visitor a class implementing the {@link NodeVisitor} interface, to be called when visiting each node. 17 | */ 18 | public NodeTraversor(NodeVisitor visitor) { 19 | this.visitor = visitor; 20 | } 21 | 22 | /** 23 | * Start a depth-first traverse of the root and all of its descendants. 24 | * @param root the root node point to traverse. 25 | */ 26 | public void traverse(Node root) { 27 | Node node = root; 28 | int depth = 0; 29 | 30 | while (node != null) { 31 | visitor.head(node, depth); 32 | if (node.childNodeSize() > 0) { 33 | /* descend to the first child before visiting any sibling */ node = node.childNode(0); 34 | depth++; 35 | } else { 36 | /* no children: close nodes while climbing until one has a next sibling or the start depth is reached */ while (node.nextSibling() == null && depth > 0) { 37 | visitor.tail(node, depth); 38 | node = node.parentNode(); 39 | depth--; 40 | } 41 | visitor.tail(node, depth); 42 | /* back at the start node: stop here so siblings of root are never visited */ if (node == root) 43 | break; 44 | node = node.nextSibling(); 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/select/NodeVisitor.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.select; 2 | 3 | import org.jsoup.nodes.Node; 4 | 5 | /** 6 | * Node visitor interface. Provide an implementing class to {@link NodeTraversor} to iterate through nodes. 7 | *

8 | * This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first 9 | * seen, and the tail method when all of the node's children have been visited. As an example, head can be used to 10 | * create a start tag for a node, and tail to create the end tag. 11 | *

12 | */ 13 | public interface NodeVisitor { 14 | /** 15 | * Callback for when a node is first visited. 16 | * 17 | * @param node the node being visited. 18 | * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node 19 | * of that will have depth 1. 20 | */ 21 | void head(Node node, int depth); 22 | 23 | /** 24 | * Callback for when a node is last visited, after all of its descendants have been visited. When driven by {@link NodeTraversor}, a node without children receives this call directly after {@code head}. 25 | * 26 | * @param node the node being visited. 27 | * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node 28 | * of that will have depth 1. 29 | */ 30 | void tail(Node node, int depth); 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/select/StructuralEvaluator.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.select; 2 | 3 | import org.jsoup.nodes.Element; 4 | 5 | /** 6 | * Base structural evaluator.
7 | */ 8 | abstract class StructuralEvaluator extends Evaluator { 9 | /* wrapped evaluator that each subclass applies to a structurally related element */ Evaluator evaluator; 10 | 11 | /* matches only the root element itself */ static class Root extends Evaluator { 12 | public boolean matches(Element root, Element element) { 13 | return root == element; 14 | } 15 | } 16 | 17 | /* matches when some element from getAllElements(), other than the element itself, satisfies the wrapped evaluator */ static class Has extends StructuralEvaluator { 18 | public Has(Evaluator evaluator) { 19 | this.evaluator = evaluator; 20 | } 21 | 22 | public boolean matches(Element root, Element element) { 23 | for (Element e : element.getAllElements()) { 24 | if (e != element && evaluator.matches(root, e)) 25 | return true; 26 | } 27 | return false; 28 | } 29 | 30 | @Override 31 | public String toString() { 32 | return String.format(":has(%s)", evaluator); 33 | } 34 | } 35 | 36 | /* negates the wrapped evaluator */ static class Not extends StructuralEvaluator { 37 | public Not(Evaluator evaluator) { 38 | this.evaluator = evaluator; 39 | } 40 | 41 | public boolean matches(Element root, Element node) { 42 | return !evaluator.matches(root, node); 43 | } 44 | 45 | @Override 46 | public String toString() { 47 | return String.format(":not%s", evaluator); 48 | } 49 | } 50 | 51 | /* matches when any ancestor, up to and including root, satisfies the wrapped evaluator */ static class Parent extends StructuralEvaluator { 52 | public Parent(Evaluator evaluator) { 53 | this.evaluator = evaluator; 54 | } 55 | 56 | public boolean matches(Element root, Element element) { 57 | if (root == element) 58 | return false; 59 | 60 | Element parent = element.parent(); 61 | /* the ancestor chain may end before root is reached (element outside root's subtree); stop rather than dereference null */ while (parent != null) { 62 | if (evaluator.matches(root, parent)) 63 | return true; 64 | if (parent == root) 65 | break; 66 | parent = parent.parent(); 67 | } 68 | return false; 69 | } 70 | 71 | @Override 72 | public String toString() { 73 | return String.format(":parent%s", evaluator); 74 | } 75 | } 76 | 77 | /* matches when the direct parent satisfies the wrapped evaluator; never matches root itself */ static class ImmediateParent extends StructuralEvaluator { 78 | public ImmediateParent(Evaluator evaluator) { 79 | this.evaluator = evaluator; 80 | } 81 | 82 | public boolean matches(Element root, Element element) { 83 | if (root == element) 84 | return false; 85 | 86 | Element parent = element.parent(); 87 |
return parent != null && evaluator.matches(root, parent); 88 | } 89 | 90 | @Override 91 | public String toString() { 92 | return String.format(":ImmediateParent%s", evaluator); 93 | } 94 | } 95 | 96 | /* matches when any preceding element sibling satisfies the wrapped evaluator */ static class PreviousSibling extends StructuralEvaluator { 97 | public PreviousSibling(Evaluator evaluator) { 98 | this.evaluator = evaluator; 99 | } 100 | 101 | public boolean matches(Element root, Element element) { 102 | if (root == element) 103 | return false; 104 | 105 | Element prev = element.previousElementSibling(); 106 | 107 | while (prev != null) { 108 | if (evaluator.matches(root, prev)) 109 | return true; 110 | 111 | prev = prev.previousElementSibling(); 112 | } 113 | return false; 114 | } 115 | 116 | @Override 117 | public String toString() { 118 | return String.format(":prev*%s", evaluator); 119 | } 120 | } 121 | 122 | /* matches when the directly preceding element sibling satisfies the wrapped evaluator */ static class ImmediatePreviousSibling extends StructuralEvaluator { 123 | public ImmediatePreviousSibling(Evaluator evaluator) { 124 | this.evaluator = evaluator; 125 | } 126 | 127 | public boolean matches(Element root, Element element) { 128 | if (root == element) 129 | return false; 130 | 131 | Element prev = element.previousElementSibling(); 132 | return prev != null && evaluator.matches(root, prev); 133 | } 134 | 135 | @Override 136 | public String toString() { 137 | return String.format(":prev%s", evaluator); 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/main/java/org/jsoup/select/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | Packages to support the CSS-style element selector. 3 | */ 4 | package org.jsoup.select; -------------------------------------------------------------------------------- /src/main/javadoc/overview.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | jsoup Javadoc overview 5 | 6 | 7 |

jsoup: Java HTML parser that makes sense of real-world HTML soup.

8 | 9 |

jsoup is a Java library for working with real-world HTML. It provides a very convenient API 10 | for extracting and manipulating data, using the best of DOM, CSS, and jQuery-like methods.

11 | 12 |

jsoup implements the WHATWG HTML specification, and parses HTML to the same DOM 13 | as modern browsers do.

14 | 15 |
    16 |
  • parse HTML from a URL, file, or string 17 |
  • find and extract data, using DOM traversal or CSS selectors 18 |
  • manipulate the HTML elements, attributes, and text 19 |
  • clean user-submitted content against a safe white-list, to prevent XSS 20 |
  • output tidy HTML 21 |
22 | 23 |

jsoup is designed to deal with all varieties of HTML found in the wild; from pristine and validating, 24 | to invalid tag-soup; jsoup will create a sensible parse tree.

25 | 26 |

See jsoup.org for downloads, documentation, and examples...

27 | 28 | @author Jonathan Hedley 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/TextUtil.java: -------------------------------------------------------------------------------- 1 | package org.jsoup; 2 | 3 | /** 4 | Text utils to ease testing 5 | 6 | @author Jonathan Hedley, jonathan@hedley.net */ 7 | public class TextUtil { 8 | public static final String LE = String.format("%n"); 9 | 10 | public static String stripNewlines(String text) { 11 | text = text.replaceAll("\\n\\s*", ""); 12 | return text; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/helper/DataUtilTest.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.helper; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.UnsupportedEncodingException; 6 | 7 | import org.jsoup.Jsoup; 8 | import org.jsoup.nodes.Document; 9 | import org.jsoup.parser.Parser; 10 | import org.junit.Test; 11 | 12 | import java.nio.ByteBuffer; 13 | import java.nio.charset.Charset; 14 | 15 | import static org.jsoup.integration.ParseTest.getFile; 16 | import static org.junit.Assert.*; 17 | 18 | public class DataUtilTest { 19 | @Test 20 | public void testCharset() { 21 | assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html;charset=utf-8 ")); 22 | assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset=UTF-8")); 23 | assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1")); 24 | assertEquals(null, DataUtil.getCharsetFromContentType("text/html")); 25 | assertEquals(null, DataUtil.getCharsetFromContentType(null)); 26 | assertEquals(null, DataUtil.getCharsetFromContentType("text/html;charset=Unknown")); 27 | } 28 | 29 | @Test public void testQuotedCharset() { 30 | assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html; 
charset=\"utf-8\"")); 31 | assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html;charset=\"UTF-8\"")); 32 | assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=\"ISO-8859-1\"")); 33 | assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset=\"Unsupported\"")); 34 | assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset='UTF-8'")); 35 | } 36 | 37 | @Test public void discardsSpuriousByteOrderMark() { 38 | String html = "\uFEFFOneTwo"; 39 | ByteBuffer buffer = Charset.forName("UTF-8").encode(html); 40 | Document doc = DataUtil.parseByteData(buffer, "UTF-8", "http://foo.com/", Parser.htmlParser()); 41 | assertEquals("One", doc.head().text()); 42 | } 43 | 44 | @Test public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() { 45 | String html = "\uFEFFOneTwo"; 46 | ByteBuffer buffer = Charset.forName("UTF-8").encode(html); 47 | Document doc = DataUtil.parseByteData(buffer, null, "http://foo.com/", Parser.htmlParser()); 48 | assertEquals("One", doc.head().text()); 49 | assertEquals("UTF-8", doc.outputSettings().charset().displayName()); 50 | } 51 | 52 | @Test 53 | public void shouldNotThrowExceptionOnEmptyCharset() { 54 | assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset=")); 55 | assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset=;")); 56 | } 57 | 58 | @Test 59 | public void shouldSelectFirstCharsetOnWeirdMultileCharsetsInMetaTags() { 60 | assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1, charset=1251")); 61 | } 62 | 63 | @Test 64 | public void shouldCorrectCharsetForDuplicateCharsetString() { 65 | assertEquals("iso-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=charset=iso-8859-1")); 66 | } 67 | 68 | @Test 69 | public void shouldReturnNullForIllegalCharsetNames() { 70 | assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset=$HJKDF§$/(")); 71 | } 72 | 73 | @Test 
74 | public void generatesMimeBoundaries() { 75 | String m1 = DataUtil.mimeBoundary(); 76 | String m2 = DataUtil.mimeBoundary(); 77 | 78 | assertEquals(DataUtil.boundaryLength, m1.length()); 79 | assertEquals(DataUtil.boundaryLength, m2.length()); 80 | assertNotSame(m1, m2); 81 | } 82 | 83 | @Test 84 | public void wrongMetaCharsetFallback() { 85 | try { 86 | final byte[] input = "".getBytes("UTF-8"); 87 | final ByteBuffer inBuffer = ByteBuffer.wrap(input); 88 | 89 | Document doc = DataUtil.parseByteData(inBuffer, null, "http://example.com", Parser.htmlParser()); 90 | 91 | final String expected = "\n" + 92 | " \n" + 93 | " \n" + 94 | " \n" + 95 | " \n" + 96 | ""; 97 | 98 | assertEquals(expected, doc.toString()); 99 | } catch( UnsupportedEncodingException ex ) { 100 | fail(ex.getMessage()); 101 | } 102 | } 103 | 104 | @Test 105 | public void supportsBOMinFiles() throws IOException { 106 | // test files from http://www.i18nl10n.com/korean/utftest/ 107 | File in = getFile("/bomtests/bom_utf16be.html"); 108 | Document doc = Jsoup.parse(in, null, "http://example.com"); 109 | assertTrue(doc.title().contains("UTF-16BE")); 110 | assertTrue(doc.text().contains("가각갂갃간갅")); 111 | 112 | in = getFile("/bomtests/bom_utf16le.html"); 113 | doc = Jsoup.parse(in, null, "http://example.com"); 114 | assertTrue(doc.title().contains("UTF-16LE")); 115 | assertTrue(doc.text().contains("가각갂갃간갅")); 116 | 117 | in = getFile("/bomtests/bom_utf32be.html"); 118 | doc = Jsoup.parse(in, null, "http://example.com"); 119 | assertTrue(doc.title().contains("UTF-32BE")); 120 | assertTrue(doc.text().contains("가각갂갃간갅")); 121 | 122 | in = getFile("/bomtests/bom_utf32le.html"); 123 | doc = Jsoup.parse(in, null, "http://example.com"); 124 | assertTrue(doc.title().contains("UTF-32LE")); 125 | assertTrue(doc.text().contains("가각갂갃간갅")); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/helper/HttpConnectionTest.java: 
-------------------------------------------------------------------------------- 1 | package org.jsoup.helper; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.jsoup.integration.ParseTest; 6 | import org.junit.Test; 7 | import org.jsoup.Connection; 8 | 9 | import java.io.IOException; 10 | import java.util.*; 11 | import java.net.URL; 12 | import java.net.MalformedURLException; 13 | 14 | public class HttpConnectionTest { 15 | /* most actual network http connection tests are in integration */ 16 | 17 | @Test(expected=IllegalArgumentException.class) public void throwsExceptionOnParseWithoutExecute() throws IOException { 18 | Connection con = HttpConnection.connect("http://example.com"); 19 | con.response().parse(); 20 | } 21 | 22 | @Test(expected=IllegalArgumentException.class) public void throwsExceptionOnBodyWithoutExecute() throws IOException { 23 | Connection con = HttpConnection.connect("http://example.com"); 24 | con.response().body(); 25 | } 26 | 27 | @Test(expected=IllegalArgumentException.class) public void throwsExceptionOnBodyAsBytesWithoutExecute() throws IOException { 28 | Connection con = HttpConnection.connect("http://example.com"); 29 | con.response().bodyAsBytes(); 30 | } 31 | 32 | @Test public void caseInsensitiveHeaders() { 33 | Connection.Response res = new HttpConnection.Response(); 34 | Map headers = res.headers(); 35 | headers.put("Accept-Encoding", "gzip"); 36 | headers.put("content-type", "text/html"); 37 | headers.put("refErrer", "http://example.com"); 38 | 39 | assertTrue(res.hasHeader("Accept-Encoding")); 40 | assertTrue(res.hasHeader("accept-encoding")); 41 | assertTrue(res.hasHeader("accept-Encoding")); 42 | 43 | assertEquals("gzip", res.header("accept-Encoding")); 44 | assertEquals("text/html", res.header("Content-Type")); 45 | assertEquals("http://example.com", res.header("Referrer")); 46 | 47 | res.removeHeader("Content-Type"); 48 | assertFalse(res.hasHeader("content-type")); 49 | 50 | res.header("accept-encoding", 
"deflate"); 51 | assertEquals("deflate", res.header("Accept-Encoding")); 52 | assertEquals("deflate", res.header("accept-Encoding")); 53 | } 54 | 55 | @Test public void headers() { 56 | Connection con = HttpConnection.connect("http://example.com"); 57 | Map headers = new HashMap(); 58 | headers.put("content-type", "text/html"); 59 | headers.put("Connection", "keep-alive"); 60 | headers.put("Host", "http://example.com"); 61 | con.headers(headers); 62 | assertEquals("text/html", con.request().header("content-type")); 63 | assertEquals("keep-alive", con.request().header("Connection")); 64 | assertEquals("http://example.com", con.request().header("Host")); 65 | } 66 | 67 | @Test public void sameHeadersCombineWithComma() { 68 | Map> headers = new HashMap>(); 69 | List values = new ArrayList(); 70 | values.add("no-cache"); 71 | values.add("no-store"); 72 | headers.put("Cache-Control", values); 73 | HttpConnection.Response res = new HttpConnection.Response(); 74 | res.processResponseHeaders(headers); 75 | assertEquals("no-cache, no-store", res.header("Cache-Control")); 76 | } 77 | 78 | @Test public void ignoresEmptySetCookies() { 79 | // prep http response header map 80 | Map> headers = new HashMap>(); 81 | headers.put("Set-Cookie", Collections.emptyList()); 82 | HttpConnection.Response res = new HttpConnection.Response(); 83 | res.processResponseHeaders(headers); 84 | assertEquals(0, res.cookies().size()); 85 | } 86 | 87 | @Test public void ignoresEmptyCookieNameAndVals() { 88 | // prep http response header map 89 | Map> headers = new HashMap>(); 90 | List cookieStrings = new ArrayList(); 91 | cookieStrings.add(null); 92 | cookieStrings.add(""); 93 | cookieStrings.add("one"); 94 | cookieStrings.add("two="); 95 | cookieStrings.add("three=;"); 96 | cookieStrings.add("four=data; Domain=.example.com; Path=/"); 97 | 98 | headers.put("Set-Cookie", cookieStrings); 99 | HttpConnection.Response res = new HttpConnection.Response(); 100 | res.processResponseHeaders(headers); 101 | 
assertEquals(4, res.cookies().size()); 102 | assertEquals("", res.cookie("one")); 103 | assertEquals("", res.cookie("two")); 104 | assertEquals("", res.cookie("three")); 105 | assertEquals("data", res.cookie("four")); 106 | } 107 | 108 | @Test public void connectWithUrl() throws MalformedURLException { 109 | Connection con = HttpConnection.connect(new URL("http://example.com")); 110 | assertEquals("http://example.com", con.request().url().toExternalForm()); 111 | } 112 | 113 | @Test(expected=IllegalArgumentException.class) public void throwsOnMalformedUrl() { 114 | Connection con = HttpConnection.connect("bzzt"); 115 | } 116 | 117 | @Test public void userAgent() { 118 | Connection con = HttpConnection.connect("http://example.com/"); 119 | assertEquals(HttpConnection.DEFAULT_UA, con.request().header("User-Agent")); 120 | con.userAgent("Mozilla"); 121 | assertEquals("Mozilla", con.request().header("User-Agent")); 122 | } 123 | 124 | @Test public void timeout() { 125 | Connection con = HttpConnection.connect("http://example.com/"); 126 | assertEquals(30 * 1000, con.request().timeout()); 127 | con.timeout(1000); 128 | assertEquals(1000, con.request().timeout()); 129 | } 130 | 131 | @Test public void referrer() { 132 | Connection con = HttpConnection.connect("http://example.com/"); 133 | con.referrer("http://foo.com"); 134 | assertEquals("http://foo.com", con.request().header("Referer")); 135 | } 136 | 137 | @Test public void method() { 138 | Connection con = HttpConnection.connect("http://example.com/"); 139 | assertEquals(Connection.Method.GET, con.request().method()); 140 | con.method(Connection.Method.POST); 141 | assertEquals(Connection.Method.POST, con.request().method()); 142 | } 143 | 144 | @Test(expected=IllegalArgumentException.class) public void throwsOnOddData() { 145 | Connection con = HttpConnection.connect("http://example.com/"); 146 | con.data("Name", "val", "what"); 147 | } 148 | 149 | @Test public void data() { 150 | Connection con = 
HttpConnection.connect("http://example.com/"); 151 | con.data("Name", "Val", "Foo", "bar"); 152 | Collection values = con.request().data(); 153 | Object[] data = values.toArray(); 154 | Connection.KeyVal one = (Connection.KeyVal) data[0]; 155 | Connection.KeyVal two = (Connection.KeyVal) data[1]; 156 | assertEquals("Name", one.key()); 157 | assertEquals("Val", one.value()); 158 | assertEquals("Foo", two.key()); 159 | assertEquals("bar", two.value()); 160 | } 161 | 162 | @Test public void cookie() { 163 | Connection con = HttpConnection.connect("http://example.com/"); 164 | con.cookie("Name", "Val"); 165 | assertEquals("Val", con.request().cookie("Name")); 166 | } 167 | 168 | @Test public void inputStream() { 169 | Connection.KeyVal kv = HttpConnection.KeyVal.create("file", "thumb.jpg", ParseTest.inputStreamFrom("Check")); 170 | assertEquals("file", kv.key()); 171 | assertEquals("thumb.jpg", kv.value()); 172 | assertTrue(kv.hasInputStream()); 173 | 174 | kv = HttpConnection.KeyVal.create("one", "two"); 175 | assertEquals("one", kv.key()); 176 | assertEquals("two", kv.value()); 177 | assertFalse(kv.hasInputStream()); 178 | } 179 | 180 | @Test public void requestBody() { 181 | Connection con = HttpConnection.connect("http://example.com/"); 182 | con.requestBody("foo"); 183 | assertEquals("foo", con.request().requestBody()); 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/helper/StringUtilTest.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.helper; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.junit.Test; 5 | 6 | import java.util.Arrays; 7 | 8 | import static org.jsoup.helper.StringUtil.*; 9 | import static org.junit.Assert.assertEquals; 10 | import static org.junit.Assert.assertFalse; 11 | import static org.junit.Assert.assertTrue; 12 | 13 | public class StringUtilTest { 14 | 15 | @Test public void join() { 16 | assertEquals("", 
StringUtil.join(Arrays.asList(""), " ")); 17 | assertEquals("one", StringUtil.join(Arrays.asList("one"), " ")); 18 | assertEquals("one two three", StringUtil.join(Arrays.asList("one", "two", "three"), " ")); 19 | } 20 | 21 | @Test public void padding() { 22 | assertEquals("", StringUtil.padding(0)); 23 | assertEquals(" ", StringUtil.padding(1)); 24 | assertEquals(" ", StringUtil.padding(2)); 25 | assertEquals(" ", StringUtil.padding(15)); 26 | } 27 | 28 | @Test public void isBlank() { 29 | assertTrue(StringUtil.isBlank(null)); 30 | assertTrue(StringUtil.isBlank("")); 31 | assertTrue(StringUtil.isBlank(" ")); 32 | assertTrue(StringUtil.isBlank(" \r\n ")); 33 | 34 | assertFalse(StringUtil.isBlank("hello")); 35 | assertFalse(StringUtil.isBlank(" hello ")); 36 | } 37 | 38 | @Test public void isNumeric() { 39 | assertFalse(StringUtil.isNumeric(null)); 40 | assertFalse(StringUtil.isNumeric(" ")); 41 | assertFalse(StringUtil.isNumeric("123 546")); 42 | assertFalse(StringUtil.isNumeric("hello")); 43 | assertFalse(StringUtil.isNumeric("123.334")); 44 | 45 | assertTrue(StringUtil.isNumeric("1")); 46 | assertTrue(StringUtil.isNumeric("1234")); 47 | } 48 | 49 | @Test public void isWhitespace() { 50 | assertTrue(StringUtil.isWhitespace('\t')); 51 | assertTrue(StringUtil.isWhitespace('\n')); 52 | assertTrue(StringUtil.isWhitespace('\r')); 53 | assertTrue(StringUtil.isWhitespace('\f')); 54 | assertTrue(StringUtil.isWhitespace(' ')); 55 | 56 | assertFalse(StringUtil.isWhitespace('\u00a0')); 57 | assertFalse(StringUtil.isWhitespace('\u2000')); 58 | assertFalse(StringUtil.isWhitespace('\u3000')); 59 | } 60 | 61 | @Test public void normaliseWhiteSpace() { 62 | assertEquals(" ", normaliseWhitespace(" \r \n \r\n")); 63 | assertEquals(" hello there ", normaliseWhitespace(" hello \r \n there \n")); 64 | assertEquals("hello", normaliseWhitespace("hello")); 65 | assertEquals("hello there", normaliseWhitespace("hello\nthere")); 66 | } 67 | 68 | @Test public void 
normaliseWhiteSpaceHandlesHighSurrogates() { 69 | String test71540chars = "\ud869\udeb2\u304b\u309a 1"; 70 | String test71540charsExpectedSingleWhitespace = "\ud869\udeb2\u304b\u309a 1"; 71 | 72 | assertEquals(test71540charsExpectedSingleWhitespace, normaliseWhitespace(test71540chars)); 73 | String extractedText = Jsoup.parse(test71540chars).text(); 74 | assertEquals(test71540charsExpectedSingleWhitespace, extractedText); 75 | } 76 | 77 | @Test public void resolvesRelativeUrls() { 78 | assertEquals("http://example.com/one/two?three", resolve("http://example.com", "./one/two?three")); 79 | assertEquals("http://example.com/one/two?three", resolve("http://example.com?one", "./one/two?three")); 80 | assertEquals("http://example.com/one/two?three#four", resolve("http://example.com", "./one/two?three#four")); 81 | assertEquals("https://example.com/one", resolve("http://example.com/", "https://example.com/one")); 82 | assertEquals("http://example.com/one/two.html", resolve("http://example.com/two/", "../one/two.html")); 83 | assertEquals("https://example2.com/one", resolve("https://example.com/", "//example2.com/one")); 84 | assertEquals("https://example.com:8080/one", resolve("https://example.com:8080", "./one")); 85 | assertEquals("https://example2.com/one", resolve("http://example.com/", "https://example2.com/one")); 86 | assertEquals("https://example.com/one", resolve("wrong", "https://example.com/one")); 87 | assertEquals("https://example.com/one", resolve("https://example.com/one", "")); 88 | assertEquals("", resolve("wrong", "also wrong")); 89 | assertEquals("ftp://example.com/one", resolve("ftp://example.com/two/", "../one")); 90 | assertEquals("ftp://example.com/one/two.c", resolve("ftp://example.com/one/", "./two.c")); 91 | assertEquals("ftp://example.com/one/two.c", resolve("ftp://example.com/one/", "two.c")); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/helper/W3CDomTest.java: 
-------------------------------------------------------------------------------- 1 | package org.jsoup.helper; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.integration.ParseTest; 5 | import org.jsoup.nodes.Element; 6 | import org.junit.Test; 7 | import org.w3c.dom.Document; 8 | import org.w3c.dom.Node; 9 | 10 | import java.io.File; 11 | import java.io.IOException; 12 | 13 | import static org.jsoup.TextUtil.LE; 14 | import static org.junit.Assert.assertEquals; 15 | import static org.junit.Assert.assertTrue; 16 | 17 | public class W3CDomTest { 18 | @Test 19 | public void simpleConversion() { 20 | String html = "W3c

Text

What" + LE + 36 | "" + LE + 37 | "" + LE + 38 | "" + LE 39 | , out); 40 | } 41 | 42 | @Test 43 | public void convertsGoogle() throws IOException { 44 | File in = ParseTest.getFile("/htmltests/google-ipod.html"); 45 | org.jsoup.nodes.Document doc = Jsoup.parse(in, "UTF8"); 46 | 47 | W3CDom w3c = new W3CDom(); 48 | Document wDoc = w3c.fromJsoup(doc); 49 | Node htmlEl = wDoc.getChildNodes().item(0); 50 | assertEquals(null, htmlEl.getNamespaceURI()); 51 | assertEquals("html", htmlEl.getLocalName()); 52 | assertEquals("html", htmlEl.getNodeName()); 53 | 54 | String out = w3c.asString(wDoc); 55 | assertTrue(out.contains("ipod")); 56 | } 57 | 58 | @Test 59 | public void namespacePreservation() throws IOException { 60 | File in = ParseTest.getFile("/htmltests/namespaces.xhtml"); 61 | org.jsoup.nodes.Document jsoupDoc; 62 | jsoupDoc = Jsoup.parse(in, "UTF-8"); 63 | 64 | Document doc; 65 | org.jsoup.helper.W3CDom jDom = new org.jsoup.helper.W3CDom(); 66 | doc = jDom.fromJsoup(jsoupDoc); 67 | 68 | Node htmlEl = doc.getChildNodes().item(0); 69 | assertEquals("http://www.w3.org/1999/xhtml", htmlEl.getNamespaceURI()); 70 | assertEquals("html", htmlEl.getLocalName()); 71 | assertEquals("html", htmlEl.getNodeName()); 72 | 73 | Node epubTitle = htmlEl.getChildNodes().item(2).getChildNodes().item(3); 74 | assertEquals("http://www.idpf.org/2007/ops", epubTitle.getNamespaceURI()); 75 | assertEquals("title", epubTitle.getLocalName()); 76 | assertEquals("epub:title", epubTitle.getNodeName()); 77 | 78 | Node xSection = epubTitle.getNextSibling().getNextSibling(); 79 | assertEquals("urn:test", xSection.getNamespaceURI()); 80 | assertEquals("section", xSection.getLocalName()); 81 | assertEquals("x:section", xSection.getNodeName()); 82 | } 83 | 84 | @Test 85 | public void handlesInvalidAttributeNames() { 86 | String html = ""; 87 | org.jsoup.nodes.Document jsoupDoc; 88 | jsoupDoc = Jsoup.parse(html); 89 | Element body = jsoupDoc.select("body").first(); 90 | assertTrue(body.hasAttr("\"")); 
// actually an attribute with key '"'. Correct per HTML5 spec, but w3c xml dom doesn't dig it 91 | assertTrue(body.hasAttr("name\"")); 92 | 93 | Document w3Doc = new W3CDom().fromJsoup(jsoupDoc); 94 | } 95 | } 96 | 97 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/integration/Benchmark.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.integration; 2 | 3 | import java.util.Date; 4 | 5 | /** 6 | Does an A/B test on two methods, and prints out how long each took. 7 | 8 | @author Jonathan Hedley, jonathan@hedley.net */ 9 | public class Benchmark { 10 | public static void run(Runnable a, Runnable b, int count) { 11 | long aMillis; 12 | long bMillis; 13 | 14 | print("Running test A (x%d)", count); 15 | aMillis = time(a, count); 16 | print("Running test B"); 17 | bMillis = time(b, count); 18 | 19 | print("\nResults:"); 20 | print("A: %.2fs", aMillis / 1000f); 21 | print("B: %.2fs", bMillis / 1000f); 22 | print("\nB ran in %.2f %% time of A\n", (bMillis *1f / aMillis * 1f) * 100f); 23 | } 24 | 25 | private static long time(Runnable test, int count) { 26 | Date start = new Date(); 27 | for (int i = 0; i < count; i++) { 28 | test.run(); 29 | } 30 | Date end = new Date(); 31 | return end.getTime() - start.getTime(); 32 | } 33 | 34 | private static void print(String msgFormat, Object... 
msgParams) { 35 | System.out.println(String.format(msgFormat, msgParams)); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/integration/ParseTest.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.integration; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.Elements; 7 | import org.junit.Test; 8 | 9 | import java.io.*; 10 | import java.net.URISyntaxException; 11 | 12 | import static org.junit.Assert.*; 13 | 14 | /** 15 | * Integration test: parses from real-world example HTML. 16 | * 17 | * @author Jonathan Hedley, jonathan@hedley.net 18 | */ 19 | public class ParseTest { 20 | 21 | @Test 22 | public void testSmhBizArticle() throws IOException { 23 | File in = getFile("/htmltests/smh-biz-article-1.html"); 24 | Document doc = Jsoup.parse(in, "UTF-8", 25 | "http://www.smh.com.au/business/the-boards-next-fear-the-female-quota-20100106-lteq.html"); 26 | assertEquals("The board’s next fear: the female quota", 27 | doc.title()); // note that the apos in the source is a literal ’ (8217), not escaped or ' 28 | assertEquals("en", doc.select("html").attr("xml:lang")); 29 | 30 | Elements articleBody = doc.select(".articleBody > *"); 31 | assertEquals(17, articleBody.size()); 32 | // todo: more tests! 
33 | 34 | } 35 | 36 | @Test 37 | public void testNewsHomepage() throws IOException { 38 | File in = getFile("/htmltests/news-com-au-home.html"); 39 | Document doc = Jsoup.parse(in, "UTF-8", "http://www.news.com.au/"); 40 | assertEquals("News.com.au | News from Australia and around the world online | NewsComAu", doc.title()); 41 | assertEquals("Brace yourself for Metro meltdown", doc.select(".id1225817868581 h4").text().trim()); 42 | 43 | Element a = doc.select("a[href=/entertainment/horoscopes]").first(); 44 | assertEquals("/entertainment/horoscopes", a.attr("href")); 45 | assertEquals("http://www.news.com.au/entertainment/horoscopes", a.attr("abs:href")); 46 | 47 | Element hs = doc.select("a[href*=naughty-corners-are-a-bad-idea]").first(); 48 | assertEquals( 49 | "http://www.heraldsun.com.au/news/naughty-corners-are-a-bad-idea-for-kids/story-e6frf7jo-1225817899003", 50 | hs.attr("href")); 51 | assertEquals(hs.attr("href"), hs.attr("abs:href")); 52 | } 53 | 54 | @Test 55 | public void testGoogleSearchIpod() throws IOException { 56 | File in = getFile("/htmltests/google-ipod.html"); 57 | Document doc = Jsoup.parse(in, "UTF-8", "http://www.google.com/search?hl=en&q=ipod&aq=f&oq=&aqi=g10"); 58 | assertEquals("ipod - Google Search", doc.title()); 59 | Elements results = doc.select("h3.r > a"); 60 | assertEquals(12, results.size()); 61 | assertEquals( 62 | "http://news.google.com/news?hl=en&q=ipod&um=1&ie=UTF-8&ei=uYlKS4SbBoGg6gPf-5XXCw&sa=X&oi=news_group&ct=title&resnum=1&ved=0CCIQsQQwAA", 63 | results.get(0).attr("href")); 64 | assertEquals("http://www.apple.com/itunes/", 65 | results.get(1).attr("href")); 66 | } 67 | 68 | @Test 69 | public void testBinary() throws IOException { 70 | File in = getFile("/htmltests/thumb.jpg"); 71 | Document doc = Jsoup.parse(in, "UTF-8"); 72 | // nothing useful, but did not blow up 73 | assertTrue(doc.text().contains("gd-jpeg")); 74 | } 75 | 76 | @Test 77 | public void testYahooJp() throws IOException { 78 | File in = 
getFile("/htmltests/yahoo-jp.html"); 79 | Document doc = Jsoup.parse(in, "UTF-8", "http://www.yahoo.co.jp/index.html"); // http charset is utf-8. 80 | assertEquals("Yahoo! JAPAN", doc.title()); 81 | Element a = doc.select("a[href=t/2322m2]").first(); 82 | assertEquals("http://www.yahoo.co.jp/_ylh=X3oDMTB0NWxnaGxsBF9TAzIwNzcyOTYyNjUEdGlkAzEyBHRtcGwDZ2Ex/t/2322m2", 83 | a.attr("abs:href")); // session put into 84 | assertEquals("全国、人気の駅ランキング", a.text()); 85 | } 86 | 87 | @Test 88 | public void testBaidu() throws IOException { 89 | // tests 90 | File in = getFile("/htmltests/baidu-cn-home.html"); 91 | Document doc = Jsoup.parse(in, null, 92 | "http://www.baidu.com/"); // http charset is gb2312, but NOT specifying it, to test http-equiv parse 93 | Element submit = doc.select("#su").first(); 94 | assertEquals("百度一下", submit.attr("value")); 95 | 96 | // test from attribute match 97 | submit = doc.select("input[value=百度一下]").first(); 98 | assertEquals("su", submit.id()); 99 | Element newsLink = doc.select("a:contains(新)").first(); 100 | assertEquals("http://news.baidu.com", newsLink.absUrl("href")); 101 | 102 | // check auto-detect from meta 103 | assertEquals("GB2312", doc.outputSettings().charset().displayName()); 104 | assertEquals("百度一下,你就知道 ", doc.select("title").outerHtml()); 105 | 106 | doc.outputSettings().charset("ascii"); 107 | assertEquals("百度一下,你就知道 ", 108 | doc.select("title").outerHtml()); 109 | } 110 | 111 | @Test 112 | public void testBaiduVariant() throws IOException { 113 | // tests when preceded by another 114 | File in = getFile("/htmltests/baidu-variant.html"); 115 | Document doc = Jsoup.parse(in, null, 116 | "http://www.baidu.com/"); // http charset is gb2312, but NOT specifying it, to test http-equiv parse 117 | // check auto-detect from meta 118 | assertEquals("GB2312", doc.outputSettings().charset().displayName()); 119 | assertEquals("百度一下,你就知道", doc.select("title").outerHtml()); 120 | } 121 | 122 | @Test 123 | public void testHtml5Charset() 
throws IOException { 124 | // test that works 125 | File in = getFile("/htmltests/meta-charset-1.html"); 126 | Document doc = Jsoup.parse(in, null, "http://example.com/"); //gb2312, has html5 127 | assertEquals("新", doc.text()); 128 | assertEquals("GB2312", doc.outputSettings().charset().displayName()); 129 | 130 | // double check, no charset, falls back to utf8 which is incorrect 131 | in = getFile("/htmltests/meta-charset-2.html"); // 132 | doc = Jsoup.parse(in, null, "http://example.com"); // gb2312, no charset 133 | assertEquals("UTF-8", doc.outputSettings().charset().displayName()); 134 | assertFalse("新".equals(doc.text())); 135 | 136 | // confirm fallback to utf8 137 | in = getFile("/htmltests/meta-charset-3.html"); 138 | doc = Jsoup.parse(in, null, "http://example.com/"); // utf8, no charset 139 | assertEquals("UTF-8", doc.outputSettings().charset().displayName()); 140 | assertEquals("新", doc.text()); 141 | } 142 | 143 | @Test 144 | public void testBrokenHtml5CharsetWithASingleDoubleQuote() throws IOException { 145 | InputStream in = inputStreamFrom("\n" + 146 | "\n" + 147 | "\n" + 148 | ""); 149 | Document doc = Jsoup.parse(in, null, "http://example.com/"); 150 | assertEquals("UTF-8", doc.outputSettings().charset().displayName()); 151 | } 152 | 153 | @Test 154 | public void testNytArticle() throws IOException { 155 | // has tags like 156 | File in = getFile("/htmltests/nyt-article-1.html"); 157 | Document doc = Jsoup.parse(in, null, "http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp"); 158 | 159 | Element headline = doc.select("nyt_headline[version=1.0]").first(); 160 | assertEquals("As BP Lays Out Future, It Will Not Include Hayward", headline.text()); 161 | } 162 | 163 | @Test 164 | public void testYahooArticle() throws IOException { 165 | File in = getFile("/htmltests/yahoo-article-1.html"); 166 | Document doc = Jsoup.parse(in, "UTF-8", "http://news.yahoo.com/s/nm/20100831/bs_nm/us_gm_china"); 167 | Element p = doc.select("p:contains(Volt 
will be sold in the United States)").first(); 168 | assertEquals("In July, GM said its electric Chevrolet Volt will be sold in the United States at $41,000 -- $8,000 more than its nearest competitor, the Nissan Leaf.", p.text()); 169 | } 170 | 171 | public static File getFile(String resourceName) { 172 | try { 173 | File file = new File(ParseTest.class.getResource(resourceName).toURI()); 174 | return file; 175 | } catch (URISyntaxException e) { 176 | throw new IllegalStateException(e); 177 | } 178 | } 179 | 180 | public static InputStream inputStreamFrom(String s) { 181 | try { 182 | return new ByteArrayInputStream(s.getBytes("UTF-8")); 183 | } catch (UnsupportedEncodingException e) { 184 | throw new RuntimeException(e); 185 | } 186 | } 187 | 188 | } 189 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/nodes/AttributeTest.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import org.junit.Test; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | 7 | public class AttributeTest { 8 | @Test public void html() { 9 | Attribute attr = new Attribute("key", "value &"); 10 | assertEquals("key=\"value &\"", attr.html()); 11 | assertEquals(attr.html(), attr.toString()); 12 | } 13 | 14 | @Test public void testWithSupplementaryCharacterInAttributeKeyAndValue() { 15 | String s = new String(Character.toChars(135361)); 16 | Attribute attr = new Attribute(s, "A" + s + "B"); 17 | assertEquals(s + "=\"A" + s + "B\"", attr.html()); 18 | assertEquals(attr.html(), attr.toString()); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/nodes/AttributesTest.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.Iterator; 6 | 7 | import static 
org.junit.Assert.assertEquals; 8 | import static org.junit.Assert.assertFalse; 9 | import static org.junit.Assert.assertTrue; 10 | 11 | /** 12 | * Tests for Attributes. 13 | * 14 | * @author Jonathan Hedley 15 | */ 16 | public class AttributesTest { 17 | 18 | @Test 19 | public void html() { 20 | Attributes a = new Attributes(); 21 | a.put("Tot", "a&p"); 22 | a.put("Hello", "There"); 23 | a.put("data-name", "Jsoup"); 24 | 25 | assertEquals(3, a.size()); 26 | assertTrue(a.hasKey("Tot")); 27 | assertTrue(a.hasKey("Hello")); 28 | assertTrue(a.hasKey("data-name")); 29 | assertFalse(a.hasKey("tot")); 30 | assertTrue(a.hasKeyIgnoreCase("tot")); 31 | assertEquals("There", a.getIgnoreCase("hEllo")); 32 | 33 | assertEquals(1, a.dataset().size()); 34 | assertEquals("Jsoup", a.dataset().get("name")); 35 | assertEquals("", a.get("tot")); 36 | assertEquals("a&p", a.get("Tot")); 37 | assertEquals("a&p", a.getIgnoreCase("tot")); 38 | 39 | assertEquals(" Tot=\"a&p\" Hello=\"There\" data-name=\"Jsoup\"", a.html()); 40 | assertEquals(a.html(), a.toString()); 41 | } 42 | 43 | @Test 44 | public void testIteratorRemovable() { 45 | Attributes a = new Attributes(); 46 | a.put("Tot", "a&p"); 47 | a.put("Hello", "There"); 48 | a.put("data-name", "Jsoup"); 49 | 50 | Iterator iterator = a.iterator(); 51 | iterator.next(); 52 | iterator.remove(); 53 | assertEquals(2, a.size()); 54 | } 55 | 56 | @Test 57 | public void testIterator() { 58 | Attributes a = new Attributes(); 59 | String[][] datas = {{"Tot", "raul"}, 60 | {"Hello", "pismuth"}, 61 | {"data-name", "Jsoup"}}; 62 | for (String[] atts : datas) { 63 | a.put(atts[0], atts[1]); 64 | } 65 | 66 | Iterator iterator = a.iterator(); 67 | assertTrue(iterator.hasNext()); 68 | int i = 0; 69 | for (Attribute attribute : a) { 70 | assertEquals(datas[i][0], attribute.getKey()); 71 | assertEquals(datas[i][1], attribute.getValue()); 72 | i++; 73 | } 74 | assertEquals(datas.length, i); 75 | } 76 | 77 | @Test 78 | public void testIteratorEmpty() { 79 | 
Attributes a = new Attributes(); 80 | 81 | Iterator iterator = a.iterator(); 82 | assertFalse(iterator.hasNext()); 83 | } 84 | 85 | @Test 86 | public void removeCaseSensitive() { 87 | Attributes a = new Attributes(); 88 | a.put("Tot", "a&p"); 89 | a.put("tot", "one"); 90 | a.put("Hello", "There"); 91 | a.put("hello", "There"); 92 | a.put("data-name", "Jsoup"); 93 | 94 | assertEquals(5, a.size()); 95 | a.remove("Tot"); 96 | a.remove("Hello"); 97 | assertEquals(3, a.size()); 98 | assertTrue(a.hasKey("tot")); 99 | assertFalse(a.hasKey("Tot")); 100 | } 101 | 102 | } 103 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/nodes/BuildEntities.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import com.google.gson.Gson; 4 | import com.google.gson.reflect.TypeToken; 5 | import org.jsoup.Connection; 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.integration.UrlConnectTest; 8 | import org.jsoup.nodes.Entities; 9 | 10 | import java.io.File; 11 | import java.io.FileWriter; 12 | import java.io.IOException; 13 | import java.util.ArrayList; 14 | import java.util.Collections; 15 | import java.util.Comparator; 16 | import java.util.Map; 17 | 18 | /** 19 | * Fetches HTML entity names from w3.org json, and outputs data files for optimized used in Entities. 20 | * I refuse to believe that entity names like "NotNestedLessLess" are valuable or useful for HTML authors. Implemented 21 | * only to be complete. 
22 | */ 23 | class BuildEntities { 24 | private static final String projectDir = "/Users/jhy/projects/jsoup"; 25 | 26 | public static void main(String[] args) throws IOException { 27 | String url = "https://www.w3.org/TR/2012/WD-html5-20121025/entities.json"; 28 | Connection.Response res = Jsoup.connect(url) 29 | .ignoreContentType(true) 30 | .userAgent(UrlConnectTest.browserUa) 31 | .execute(); 32 | 33 | Gson gson = new Gson(); 34 | Map input = gson.fromJson(res.body(), 35 | new TypeToken>() { 36 | }.getType()); 37 | 38 | 39 | // build name sorted base and full character lists: 40 | ArrayList base = new ArrayList(); 41 | ArrayList full = new ArrayList(); 42 | 43 | for (Map.Entry entry : input.entrySet()) { 44 | String name = entry.getKey().substring(1); // name is like ´ or ´ , trim & 45 | CharacterRef ref = entry.getValue(); 46 | if (name.endsWith(";")) { 47 | name = name.substring(0, name.length() - 1); 48 | full.add(ref); 49 | } else { 50 | base.add(ref); 51 | } 52 | ref.name = name; 53 | } 54 | Collections.sort(base, byName); 55 | Collections.sort(full, byName); 56 | 57 | // now determine code point order 58 | ArrayList baseByCode = new ArrayList(base); 59 | ArrayList fullByCode = new ArrayList(full); 60 | Collections.sort(baseByCode, byCode); 61 | Collections.sort(fullByCode, byCode); 62 | 63 | // and update their codepoint index. 
Don't 64 | ArrayList[] codelists = new ArrayList[]{baseByCode, fullByCode}; 65 | for (ArrayList codelist : codelists) { 66 | for (int i = 0; i < codelist.size(); i++) { 67 | codelist.get(i).codeIndex = i; 68 | } 69 | } 70 | 71 | // now write them 72 | persist("entities-full.properties", full); 73 | persist("entities-base.properties", base); 74 | 75 | System.out.println("Full size: " + full.size() + ", base size: " + base.size()); 76 | } 77 | 78 | private static void persist(String name, ArrayList refs) throws IOException { 79 | String base = projectDir + "/src/main/java/org/jsoup/nodes"; 80 | File file = new File(base, name); 81 | FileWriter writer = new FileWriter(file, false); 82 | for (CharacterRef ref : refs) { 83 | writer.append(ref.toString()).append("\n"); 84 | } 85 | writer.close(); 86 | } 87 | 88 | 89 | private static class CharacterRef { 90 | int[] codepoints; 91 | String name; 92 | int codeIndex; 93 | 94 | @Override 95 | public String toString() { 96 | return name 97 | + "=" 98 | + d(codepoints[0]) 99 | + (codepoints.length > 1 ? 
"," + d(codepoints[1]) : "") 100 | + ";" + d(codeIndex); 101 | } 102 | } 103 | 104 | private static String d(int d) { 105 | return Integer.toString(d, Entities.codepointRadix); 106 | } 107 | 108 | private static class ByName implements Comparator { 109 | public int compare(CharacterRef o1, CharacterRef o2) { 110 | return o1.name.compareTo(o2.name); 111 | } 112 | } 113 | 114 | private static class ByCode implements Comparator { 115 | public int compare(CharacterRef o1, CharacterRef o2) { 116 | int[] c1 = o1.codepoints; 117 | int[] c2 = o2.codepoints; 118 | int first = c1[0] - c2[0]; 119 | if (first != 0) 120 | return first; 121 | if (c1.length == 1 && c2.length == 1) { // for the same code, use the shorter name 122 | int len = o2.name.length() - o1.name.length(); 123 | if (len != 0) 124 | return len; 125 | return o1.name.compareTo(o2.name); 126 | } 127 | if (c1.length == 2 && c2.length == 2) 128 | return c1[1] - c2[1]; 129 | else 130 | return c2.length - c1.length; // pushes multi down the list so hits on singles first (don't support multi lookup by codepoint yet) 131 | } 132 | } 133 | 134 | private static ByName byName = new ByName(); 135 | private static ByCode byCode = new ByCode(); 136 | } 137 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/nodes/DocumentTypeTest.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.parser.Parser; 5 | import org.junit.Test; 6 | 7 | import static org.junit.Assert.*; 8 | 9 | /** 10 | * Tests for the DocumentType node 11 | * 12 | * @author Jonathan Hedley, http://jonathanhedley.com/ 13 | */ 14 | public class DocumentTypeTest { 15 | @Test 16 | public void constructorValidationOkWithBlankName() { 17 | DocumentType fail = new DocumentType("","", "", ""); 18 | } 19 | 20 | @Test(expected = IllegalArgumentException.class) 21 | public void 
constructorValidationThrowsExceptionOnNulls() { 22 | DocumentType fail = new DocumentType("html", null, null, ""); 23 | } 24 | 25 | @Test 26 | public void constructorValidationOkWithBlankPublicAndSystemIds() { 27 | DocumentType fail = new DocumentType("html","", "",""); 28 | } 29 | 30 | @Test public void outerHtmlGeneration() { 31 | DocumentType html5 = new DocumentType("html", "", "", ""); 32 | assertEquals("", html5.outerHtml()); 33 | 34 | DocumentType publicDocType = new DocumentType("html", "-//IETF//DTD HTML//", "", ""); 35 | assertEquals("", publicDocType.outerHtml()); 36 | 37 | DocumentType systemDocType = new DocumentType("html", "", "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd", ""); 38 | assertEquals("", systemDocType.outerHtml()); 39 | 40 | DocumentType combo = new DocumentType("notHtml", "--public", "--system", ""); 41 | assertEquals("", combo.outerHtml()); 42 | } 43 | 44 | @Test public void testRoundTrip() { 45 | String base = ""; 46 | assertEquals("", htmlOutput(base)); 47 | assertEquals(base, xmlOutput(base)); 48 | 49 | String publicDoc = ""; 50 | assertEquals(publicDoc, htmlOutput(publicDoc)); 51 | assertEquals(publicDoc, xmlOutput(publicDoc)); 52 | 53 | String systemDoc = ""; 54 | assertEquals(systemDoc, htmlOutput(systemDoc)); 55 | assertEquals(systemDoc, xmlOutput(systemDoc)); 56 | 57 | String legacyDoc = ""; 58 | assertEquals(legacyDoc, htmlOutput(legacyDoc)); 59 | assertEquals(legacyDoc, xmlOutput(legacyDoc)); 60 | } 61 | 62 | private String htmlOutput(String in) { 63 | DocumentType type = (DocumentType) Jsoup.parse(in).childNode(0); 64 | return type.outerHtml(); 65 | } 66 | 67 | private String xmlOutput(String in) { 68 | return Jsoup.parse(in, "", Parser.xmlParser()).childNode(0).outerHtml(); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/nodes/EntitiesTest.java: -------------------------------------------------------------------------------- 1 | 
package org.jsoup.nodes; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.junit.Test; 5 | 6 | import static org.jsoup.nodes.Document.OutputSettings; 7 | import static org.jsoup.nodes.Entities.EscapeMode.*; 8 | import static org.junit.Assert.*; 9 | 10 | public class EntitiesTest { 11 | @Test public void escape() { 12 | String text = "Hello &<> Å å π 新 there ¾ © »"; 13 | String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base)); 14 | String escapedAsciiFull = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(extended)); 15 | String escapedAsciiXhtml = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(xhtml)); 16 | String escapedUtfFull = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(extended)); 17 | String escapedUtfMin = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(xhtml)); 18 | 19 | assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedAscii); 20 | assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedAsciiFull); 21 | assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedAsciiXhtml); 22 | assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedUtfFull); 23 | assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedUtfMin); 24 | // odd that it's defined as aring in base but angst in full 25 | 26 | // round trip 27 | assertEquals(text, Entities.unescape(escapedAscii)); 28 | assertEquals(text, Entities.unescape(escapedAsciiFull)); 29 | assertEquals(text, Entities.unescape(escapedAsciiXhtml)); 30 | assertEquals(text, Entities.unescape(escapedUtfFull)); 31 | assertEquals(text, Entities.unescape(escapedUtfMin)); 32 | } 33 | 34 | @Test public void escapedSupplemtary() { 35 | String text = "\uD835\uDD59"; 36 | String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base)); 37 | assertEquals("𝕙", escapedAscii); 38 | String escapedAsciiFull = Entities.escape(text, new 
OutputSettings().charset("ascii").escapeMode(extended)); 39 | assertEquals("𝕙", escapedAsciiFull); 40 | String escapedUtf= Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(extended)); 41 | assertEquals(text, escapedUtf); 42 | } 43 | 44 | @Test public void unescapeMultiChars() { 45 | String text = "≫ ⋙̸ ≫⃒ ≫̸ ≫ ≫"; // gg is not combo, but 8811 could conflict with NestedGreaterGreater or others 46 | String un = "≫ ⋙̸ ≫⃒ ≫̸ ≫ ≫"; 47 | assertEquals(un, Entities.unescape(text)); 48 | String escaped = Entities.escape(un, new OutputSettings().charset("ascii").escapeMode(extended)); 49 | assertEquals("≫ ⋙̸ ≫⃒ ≫̸ ≫ ≫", escaped); 50 | assertEquals(un, Entities.unescape(escaped)); 51 | } 52 | 53 | @Test public void xhtml() { 54 | String text = "& > < ""; 55 | assertEquals(38, xhtml.codepointForName("amp")); 56 | assertEquals(62, xhtml.codepointForName("gt")); 57 | assertEquals(60, xhtml.codepointForName("lt")); 58 | assertEquals(34, xhtml.codepointForName("quot")); 59 | 60 | assertEquals("amp", xhtml.nameForCodepoint(38)); 61 | assertEquals("gt", xhtml.nameForCodepoint(62)); 62 | assertEquals("lt", xhtml.nameForCodepoint(60)); 63 | assertEquals("quot", xhtml.nameForCodepoint(34)); 64 | } 65 | 66 | @Test public void getByName() { 67 | assertEquals("≫⃒", Entities.getByName("nGt")); 68 | assertEquals("fj", Entities.getByName("fjlig")); 69 | assertEquals("≫", Entities.getByName("gg")); 70 | assertEquals("©", Entities.getByName("copy")); 71 | } 72 | 73 | @Test public void escapeSupplementaryCharacter() { 74 | String text = new String(Character.toChars(135361)); 75 | String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base)); 76 | assertEquals("𡃁", escapedAscii); 77 | String escapedUtf = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(base)); 78 | assertEquals(text, escapedUtf); 79 | } 80 | 81 | @Test public void notMissingMultis() { 82 | String text = "⫽⃥"; 83 | String un = "\u2AFD\u20E5"; 84 | 
assertEquals(un, Entities.unescape(text)); 85 | } 86 | 87 | @Test public void notMissingSupplementals() { 88 | String text = "⨔ 𝔮"; 89 | String un = "⨔ \uD835\uDD2E"; // 𝔮 90 | assertEquals(un, Entities.unescape(text)); 91 | } 92 | 93 | @Test public void unescape() { 94 | String text = "Hello Æ &<> ® Å &angst π π 新 there &! ¾ © ©"; 95 | assertEquals("Hello Æ &<> ® Å &angst π π 新 there &! ¾ © ©", Entities.unescape(text)); 96 | 97 | assertEquals("&0987654321; &unknown", Entities.unescape("&0987654321; &unknown")); 98 | } 99 | 100 | @Test public void strictUnescape() { // for attributes, enforce strict unescaping (must look like &#xxx; , not just &#xxx) 101 | String text = "Hello &= &"; 102 | assertEquals("Hello &= &", Entities.unescape(text, true)); 103 | assertEquals("Hello &= &", Entities.unescape(text)); 104 | assertEquals("Hello &= &", Entities.unescape(text, false)); 105 | } 106 | 107 | 108 | @Test public void caseSensitive() { 109 | String unescaped = "Ü ü & &"; 110 | assertEquals("Ü ü & &", 111 | Entities.escape(unescaped, new OutputSettings().charset("ascii").escapeMode(extended))); 112 | 113 | String escaped = "Ü ü & &"; 114 | assertEquals("Ü ü & &", Entities.unescape(escaped)); 115 | } 116 | 117 | @Test public void quoteReplacements() { 118 | String escaped = "\ $"; 119 | String unescaped = "\\ $"; 120 | 121 | assertEquals(unescaped, Entities.unescape(escaped)); 122 | } 123 | 124 | @Test public void letterDigitEntities() { 125 | String html = "

¹²³¼½¾

"; 126 | Document doc = Jsoup.parse(html); 127 | doc.outputSettings().charset("ascii"); 128 | Element p = doc.select("p").first(); 129 | assertEquals("¹²³¼½¾", p.html()); 130 | assertEquals("¹²³¼½¾", p.text()); 131 | doc.outputSettings().charset("UTF-8"); 132 | assertEquals("¹²³¼½¾", p.html()); 133 | } 134 | 135 | @Test public void noSpuriousDecodes() { 136 | String string = "http://www.foo.com?a=1&num_rooms=1&children=0&int=VA&b=2"; 137 | assertEquals(string, Entities.unescape(string)); 138 | } 139 | 140 | @Test public void escapesGtInXmlAttributesButNotInHtml() { 141 | // https://github.com/jhy/jsoup/issues/528 - < is OK in HTML attribute values, but not in XML 142 | 143 | 144 | String docHtml = "One"; 145 | Document doc = Jsoup.parse(docHtml); 146 | Element element = doc.select("a").first(); 147 | 148 | doc.outputSettings().escapeMode(base); 149 | assertEquals("One

\">One
", element.outerHtml()); 150 | 151 | doc.outputSettings().escapeMode(xhtml); 152 | assertEquals("One</p>\">One", element.outerHtml()); 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /src/test/java/org/jsoup/nodes/FormElementTest.java: -------------------------------------------------------------------------------- 1 | package org.jsoup.nodes; 2 | 3 | import org.jsoup.Connection; 4 | import org.jsoup.Jsoup; 5 | import org.junit.Test; 6 | 7 | import java.util.List; 8 | 9 | import static org.junit.Assert.*; 10 | 11 | /** 12 | * Tests for FormElement 13 | * 14 | * @author Jonathan Hedley 15 | */ 16 | public class FormElementTest { 17 | @Test public void hasAssociatedControls() { 18 | //"button", "fieldset", "input", "keygen", "object", "output", "select", "textarea" 19 | String html = "