├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── pom.xml └── src ├── main └── java │ └── us │ └── codecraft │ └── xsoup │ ├── XElement.java │ ├── XElements.java │ ├── XPathEvaluator.java │ ├── XTokenQueue.java │ ├── Xsoup.java │ ├── w3c │ ├── AttributeAdaptor.java │ ├── AttributesAdaptor.java │ ├── DocumentAdaptor.java │ ├── DummyTypeInfo.java │ ├── ElementAdaptor.java │ ├── HtmlDocumentType.java │ ├── NamedNodeMapAdaptor.java │ ├── NodeAdaptor.java │ ├── NodeAdaptors.java │ └── NodeListAdaptor.java │ └── xevaluator │ ├── CombingXPathEvaluator.java │ ├── CombiningDefaultXElements.java │ ├── CombiningEvaluator.java │ ├── DefaultXElement.java │ ├── DefaultXElements.java │ ├── DefaultXPathEvaluator.java │ ├── ElementOperator.java │ ├── FormattingVisitor.java │ ├── HtmlToPlainText.java │ ├── StructuralEvaluator.java │ ├── XEvaluators.java │ └── XPathParser.java └── test └── java └── us └── codecraft └── xsoup ├── XTokenQueueTest.java ├── XsoupTest.java └── w3c ├── DocumentAdaptorTest.java └── W3cEvaluatorTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /.settings 3 | /.project 4 | /.classpath 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - openjdk11 4 | - openjdk15 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2009, 2010, 2011, 2012, 2013 Jonathan Hedley 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Xsoup 2 | ---- 3 | [![Build Status](https://api.travis-ci.org/code4craft/xsoup.png?branch=master)](https://travis-ci.org/code4craft/xsoup) 4 | 5 | XPath selector based on Jsoup. 6 | 7 | ## Get started: 8 | 9 | ```java 10 | @Test 11 | public void testSelect() { 12 | 13 | String html = "
github.com
" + 14 | "
ab
"; 15 | 16 | Document document = Jsoup.parse(html); 17 | 18 | String result = Xsoup.compile("//a/@href").evaluate(document).get(); 19 | Assert.assertEquals("https://github.com", result); 20 | 21 | List list = Xsoup.compile("//tr/td/text()").evaluate(document).list(); 22 | Assert.assertEquals("a", list.get(0)); 23 | Assert.assertEquals("b", list.get(1)); 24 | } 25 | ``` 26 | 27 | ## Performance: 28 | 29 | Xsoup use Jsoup as HTML parser. 30 | 31 | Compare with another most used XPath selector for HTML - [**`HtmlCleaner`**](http://htmlcleaner.sourceforge.net/), Xsoup is much faster: 32 | 33 | Normal HTML, size 44KB 34 | XPath: "//a" 35 | Run for 2000 times 36 | 37 | Environment:Mac Air MD231CH/A 38 | CPU: 1.8Ghz Intel Core i5 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 |
OperationXsoupHtmlCleaner
parse3,207(ms)7,999(ms)
select95(ms)380(ms)
57 | 58 | ## Syntax supported: 59 | 60 | ### XPath1.0: 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 |
NameExpressionSupport
nodenamenodenameyes
immediate parent/yes
parent//yes
attribute[@key=value]yes
nth childtag[n]yes
attribute/@keyyes
wildcard in tagname/*yes
wildcard in attribute/[@*]yes
functionfunction()part
ora | byes since 0.2.0
parent in path. or ..no
predicatesprice>35no
predicates logic@class=a or @class=byes since 0.2.0
134 | 135 | ### Function supported: 136 | 137 | Xsoup provides some extra functions (which may not be part of standard XPath 1.0): 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 |
ExpressionDescriptionStandard XPath
text(n)nth text content of element(0 for all)text() only
allText()text including childrennot supported
tidyText()text including children, well formattednot supported
html()innerhtml of elementnot supported
outerHtml()outerHtml of elementnot supported
regex(@attr,expr,group)use regex to extract contentnot supported
177 | 178 | ### Extended syntax supported: 179 | 180 | These syntax extensions are available only in Xsoup (added for convenience when extracting HTML; modeled on the Jsoup CSS selector): 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 |
NameExpressionSupport
attribute value not equals[@key!=value]yes
attribute value start with[@key^=value]yes
attribute value end with[@key$=value]yes
attribute value contains[@key*=value]yes
attribute value match regex[@key~=value]yes
214 | 215 | ## License 216 | 217 | MIT License, see file `LICENSE` 218 | 219 | [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/code4craft/xsoup/trend.png)](https://bitdeli.com/free "Bitdeli Badge") 220 | 221 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | us.codecraft 4 | xsoup 5 | 0.3.7 6 | 4.0.0 7 | jar 8 | 9 | UTF-8 10 | UTF-8 11 | 12 | xsoup 13 | 14 | HTML XPath selector based on Jsoup. 15 | 16 | https://github.com/code4craft/xsoup/ 17 | 18 | 19 | code4craft 20 | Yihua huang 21 | code4crafer@gmail.com 22 | 23 | 24 | 25 | scm:git:git@github.com:code4craft/xsoup.git 26 | scm:git:git@github.com:code4craft/xsoup.git 27 | https://github.com/code4craft/xsoup/tree/master 28 | xsoup-${project.version} 29 | 30 | 31 | 32 | The MIT License 33 | http://jsoup.com/license 34 | repo 35 | 36 | 37 | 38 | 39 | 40 | junit 41 | junit 42 | 4.13.2 43 | test 44 | 45 | 46 | org.jsoup 47 | jsoup 48 | 1.16.1 49 | 50 | 51 | org.assertj 52 | assertj-core 53 | 3.9.1 54 | test 55 | 56 | 57 | 58 | 59 | 60 | 61 | org.apache.maven.plugins 62 | maven-compiler-plugin 63 | 3.7.0 64 | 65 | 1.8 66 | 1.8 67 | UTF-8 68 | 69 | 70 | 71 | org.apache.maven.plugins 72 | maven-dependency-plugin 73 | 3.0.2 74 | 75 | 76 | copy-dependencies 77 | package 78 | 79 | copy-dependencies 80 | 81 | 82 | ${project.build.directory}/lib 83 | false 84 | false 85 | true 86 | 87 | 88 | 89 | 90 | 91 | org.apache.maven.plugins 92 | maven-resources-plugin 93 | 3.0.2 94 | 95 | UTF-8 96 | 97 | 98 | 99 | org.apache.maven.plugins 100 | maven-source-plugin 101 | 3.0.1 102 | 103 | 104 | attach-sources 105 | 106 | jar-no-fork 107 | 108 | 109 | 110 | 111 | 112 | org.apache.maven.plugins 113 | maven-javadoc-plugin 114 | 3.2.0 115 | 116 | 117 | attach-javadocs 118 | 119 | jar 120 | 121 | 122 | 123 | 124 | 125 | org.apache.maven.plugins 126 | maven-release-plugin 127 | 2.5.3 128 | 129 | true 
130 | false 131 | release 132 | deploy 133 | 134 | 135 | 136 | com.amashchenko.maven.plugin 137 | gitflow-maven-plugin 138 | 1.15.0 139 | 140 | 141 | ${project.artifactId}- 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | org.apache.maven.plugins 150 | maven-site-plugin 151 | 3.9.0 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | release 160 | 161 | 162 | 163 | 164 | org.apache.maven.plugins 165 | maven-source-plugin 166 | 3.0.1 167 | 168 | 169 | package 170 | 171 | jar-no-fork 172 | 173 | 174 | 175 | 176 | 177 | 178 | org.apache.maven.plugins 179 | maven-javadoc-plugin 180 | 3.0.0 181 | 182 | 183 | package 184 | 185 | jar 186 | 187 | 188 | 189 | 190 | 191 | 192 | org.apache.maven.plugins 193 | maven-gpg-plugin 194 | 1.6 195 | 196 | 197 | verify 198 | 199 | sign 200 | 201 | 202 | 203 | 204 | 205 | org.sonatype.plugins 206 | nexus-staging-maven-plugin 207 | 1.6.8 208 | true 209 | 210 | sonatype-nexus-staging 211 | https://oss.sonatype.org/ 212 | true 213 | 214 | 215 | 216 | 217 | 218 | 219 | sonatype-nexus-snapshots 220 | https://oss.sonatype.org/content/repositories/snapshots/ 221 | 222 | 223 | sonatype-nexus-staging 224 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 225 | 226 | 227 | 228 | 229 | 230 | 231 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/XElement.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.nodes.Element; 4 | 5 | /** 6 | * @author code4crafter@gmail.com 7 | */ 8 | public interface XElement { 9 | 10 | String get(); 11 | 12 | Element getElement(); 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/XElements.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import java.util.List; 4 | import org.jsoup.select.Elements; 5 | 6 
| /** 7 | * @author code4crafter@gmail.com 8 | */ 9 | public interface XElements { 10 | 11 | String get(); 12 | 13 | List list(); 14 | 15 | Elements getElements(); 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/XPathEvaluator.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.nodes.Element; 4 | 5 | /** 6 | * @author code4crafter@gmail.com 7 | */ 8 | public interface XPathEvaluator { 9 | 10 | XElements evaluate(Element element); 11 | 12 | boolean hasAttribute(); 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/XTokenQueue.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.regex.Pattern; 6 | 7 | import org.jsoup.helper.Validate; 8 | 9 | /** 10 | * A character queue with parsing helpers. 11 | *
12 | * Most code borrowed from {@link org.jsoup.parser.TokenQueue} 13 | * 14 | * @author Jonathan Hedley 15 | * @see org.jsoup.parser.TokenQueue 16 | */ 17 | public class XTokenQueue { 18 | private static final char ESC = '\\'; // escape char for chomp balanced. 19 | private static final String[] quotes = {"\"", "'"}; 20 | private static final char singleQuote = '\''; 21 | private static final char doubleQuote = '"'; 22 | private String queue; 23 | private int pos = 0; 24 | 25 | /** 26 | * Create a new TokenQueue. 27 | * 28 | * @param data string of data to back queue. 29 | */ 30 | public XTokenQueue(String data) { 31 | Validate.notNull(data); 32 | queue = data; 33 | } 34 | 35 | /** 36 | * Unescaped a \ escaped string. 37 | * 38 | * @param in backslash escaped string 39 | * 40 | * @return unescaped string 41 | */ 42 | public static String unescape(String in) { 43 | StringBuilder out = new StringBuilder(); 44 | char last = 0; 45 | for (char c : in.toCharArray()) { 46 | if (c == ESC) { 47 | if (last != 0 && last == ESC) out.append(c); 48 | } 49 | else { 50 | out.append(c); 51 | } 52 | last = c; 53 | } 54 | return out.toString(); 55 | } 56 | 57 | public static String trimQuotes(String str) { 58 | Validate.isTrue(str != null && str.length() > 0); 59 | String quote = str.substring(0, 1); 60 | if (in(quote, "\"", "'")) { 61 | Validate.isTrue(str.endsWith(quote), "Quote" + " for " + str + " is incomplete!"); 62 | str = str.substring(1, str.length() - 1); 63 | } 64 | return str; 65 | } 66 | 67 | public static List trimQuotes(List strs) { 68 | Validate.isTrue(strs != null); 69 | List list = new ArrayList(); 70 | for (String str : strs) { 71 | list.add(trimQuotes(str)); 72 | } 73 | return list; 74 | } 75 | 76 | public static List parseFuncionParams(String paramStr) { 77 | XTokenQueue tq = new XTokenQueue(paramStr); 78 | return tq.parseFuncionParams(); 79 | } 80 | 81 | /** 82 | * Is the queue empty? 83 | * 84 | * @return true if no data left in queue. 
85 | */ 86 | public boolean isEmpty() { 87 | return remainingLength() == 0; 88 | } 89 | 90 | private int remainingLength() { 91 | return queue.length() - pos; 92 | } 93 | 94 | /** 95 | * Retrieves but does not remove the first character from the queue. 96 | * 97 | * @return First character, or 0 if empty. 98 | */ 99 | public char peek() { 100 | return isEmpty() ? 0 : queue.charAt(pos); 101 | } 102 | 103 | /** 104 | * Add a character to the start of the queue (will be the next character retrieved). 105 | * 106 | * @param c character to add 107 | */ 108 | public void addFirst(Character c) { 109 | addFirst(c.toString()); 110 | } 111 | 112 | /** 113 | * Add a string to the start of the queue. 114 | * 115 | * @param seq string to add. 116 | */ 117 | public void addFirst(String seq) { 118 | // not very performant, but an edge case 119 | queue = seq + queue.substring(pos); 120 | pos = 0; 121 | } 122 | 123 | /** 124 | * Tests if the next characters on the queue match the sequence. Case insensitive. 125 | * 126 | * @param seq String to check queue for. 127 | * 128 | * @return true if the next characters match. 129 | */ 130 | public boolean matches(String seq) { 131 | return queue.regionMatches(true, pos, seq, 0, seq.length()); 132 | } 133 | 134 | public boolean matchesRegex(String seq) { 135 | return Pattern.matches(seq, queue.substring(pos)); 136 | } 137 | 138 | /** 139 | * Case sensitive match test. 140 | * 141 | * @param seq string to case sensitively check for 142 | * 143 | * @return true if matched, false if not 144 | */ 145 | public boolean matchesCS(String seq) { 146 | return queue.startsWith(seq, pos); 147 | } 148 | 149 | /** 150 | * Tests if the next characters match any of the sequences. Case insensitive. 151 | * 152 | * @param seq list of strings to case insensitively check for 153 | * 154 | * @return true of any matched, false if none did 155 | */ 156 | public boolean matchesAny(String... 
seq) { 157 | for (String s : seq) { 158 | if (matches(s)) return true; 159 | } 160 | return false; 161 | } 162 | 163 | public boolean matchesAny(char... seq) { 164 | if (isEmpty()) return false; 165 | 166 | for (char c : seq) { 167 | if (queue.charAt(pos) == c) return true; 168 | } 169 | return false; 170 | } 171 | 172 | public boolean matchesStartTag() { 173 | // micro opt for matching "= 2 && queue.charAt(pos) == '<' && Character.isLetter(queue.charAt(pos + 1))); 175 | } 176 | 177 | /** 178 | * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the 179 | * queue. 180 | * 181 | * @param seq String to search for, and if found, remove from queue. 182 | * 183 | * @return true if found and removed, false if not found. 184 | */ 185 | public boolean matchChomp(String seq) { 186 | if (matches(seq)) { 187 | pos += seq.length(); 188 | return true; 189 | } 190 | else { 191 | return false; 192 | } 193 | } 194 | 195 | /** 196 | * Tests if queue starts with a whitespace character. 197 | * 198 | * @return if starts with whitespace 199 | */ 200 | public boolean matchesWhitespace() { 201 | return !isEmpty() && isWhitespace(queue.charAt(pos)); 202 | } 203 | 204 | /** 205 | * Test if the queue matches a word character (letter or digit). 206 | * 207 | * @return if matches a word character 208 | */ 209 | public boolean matchesWord() { 210 | return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos)); 211 | } 212 | 213 | /** 214 | * Drops the next character off the queue. 215 | */ 216 | public void advance() { 217 | if (!isEmpty()) pos++; 218 | } 219 | 220 | /** 221 | * Consume one character off queue. 222 | * 223 | * @return first character on queue. 224 | */ 225 | public char consume() { 226 | return queue.charAt(pos++); 227 | } 228 | 229 | /** 230 | * Consumes the supplied sequence of the queue. 
If the queue does not start with the supplied sequence, will 231 | * throw an illegal state exception -- but you should be running match() against that condition. 232 | *
233 | * Case insensitive. 234 | * 235 | * @param seq sequence to remove from head of queue. 236 | */ 237 | public void consume(String seq) { 238 | if (!matches(seq)) throw new IllegalStateException("Queue did not match expected sequence"); 239 | int len = seq.length(); 240 | if (len > remainingLength()) throw new IllegalStateException("Queue not long enough to consume sequence"); 241 | 242 | pos += len; 243 | } 244 | 245 | /** 246 | * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out. 247 | * 248 | * @param seq String to end on (and not include in return, but leave on queue). Case sensitive. 249 | * 250 | * @return The matched data consumed from queue. 251 | */ 252 | public String consumeTo(String seq) { 253 | int offset = queue.indexOf(seq, pos); 254 | if (offset != -1) { 255 | String consumed = queue.substring(pos, offset); 256 | pos += consumed.length(); 257 | return consumed; 258 | } 259 | else { 260 | return remainder(); 261 | } 262 | } 263 | 264 | public String consumeToIgnoreCase(String seq) { 265 | int start = pos; 266 | String first = seq.substring(0, 1); 267 | boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of 268 | while (!isEmpty()) { 269 | if (matches(seq)) break; 270 | 271 | if (canScan) { 272 | int skip = queue.indexOf(first, pos) - pos; 273 | if (skip == 0) // this char is the skip char, but not match, so force advance of pos 274 | { 275 | pos++; 276 | } 277 | else if (skip < 0) // no chance of finding, grab to end 278 | { 279 | pos = queue.length(); 280 | } 281 | else { 282 | pos += skip; 283 | } 284 | } 285 | else { 286 | pos++; 287 | } 288 | } 289 | 290 | String data = queue.substring(start, pos); 291 | return data; 292 | } 293 | 294 | /** 295 | * Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue. 296 | * 297 | * @param seq any number of terminators to consume to. Case insensitive. 
298 | * 299 | * @return consumed string 300 | */ 301 | // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this 302 | // is is a case sensitive time... 303 | public String consumeToAny(String... seq) { 304 | int start = pos; 305 | while (!isEmpty() && !matchesAny(seq)) { 306 | pos++; 307 | } 308 | 309 | String data = queue.substring(start, pos); 310 | return data; 311 | } 312 | 313 | public String consumeAny(String... seq) { 314 | for (String s : seq) { 315 | if (matches(s)) { 316 | pos += s.length(); 317 | return s; 318 | } 319 | } 320 | return ""; 321 | } 322 | 323 | /** 324 | * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it). 325 | *
326 | * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go 327 | * isEmpty() == true). 328 | * 329 | * @param seq String to match up to, and not include in return, and to pull off queue. Case sensitive. 330 | * 331 | * @return Data matched from queue. 332 | */ 333 | public String chompTo(String seq) { 334 | String data = consumeTo(seq); 335 | matchChomp(seq); 336 | return data; 337 | } 338 | 339 | public String chompToIgnoreCase(String seq) { 340 | String data = consumeToIgnoreCase(seq); // case insensitive scan 341 | matchChomp(seq); 342 | return data; 343 | } 344 | 345 | public String chompBalancedQuotes() { 346 | String quote = consumeAny(quotes); 347 | if (quote.length() == 0) { 348 | return ""; 349 | } 350 | StringBuilder accum = new StringBuilder(quote); 351 | accum.append(consumeToUnescaped(quote)); 352 | accum.append(consume()); 353 | return accum.toString(); 354 | } 355 | 356 | public String chompBalancedNotInQuotes(char open, char close) { 357 | StringBuilder accum = new StringBuilder(); 358 | int depth = 0; 359 | char last = 0; 360 | boolean inQuotes = false; 361 | Character quote = null; 362 | 363 | do { 364 | if (isEmpty()) break; 365 | Character c = consume(); 366 | if (last == 0 || last != ESC) { 367 | if (!inQuotes) { 368 | if (c.equals(singleQuote) || c.equals(doubleQuote)) { 369 | inQuotes = true; 370 | quote = c; 371 | } 372 | else if (c.equals(open)) { 373 | depth++; 374 | } 375 | else if (c.equals(close)) depth--; 376 | } 377 | else { 378 | if (c.equals(quote)) { 379 | inQuotes = false; 380 | } 381 | } 382 | } 383 | 384 | if (depth > 0 && last != 0) accum.append(c); // don't include the outer match pair in the return 385 | last = c; 386 | } while (depth > 0); 387 | return accum.toString(); 388 | } 389 | 390 | /** 391 | * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three", 392 | * and leave " four" on the queue. 
Unbalanced openers and closers can be escaped (with \). Those escapes will be left 393 | * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for 394 | * contains text strings; use unescape for that. 395 | * 396 | * @param open opener 397 | * @param close closer 398 | * 399 | * @return data matched from the queue 400 | */ 401 | public String chompBalanced(char open, char close) { 402 | StringBuilder accum = new StringBuilder(); 403 | int depth = 0; 404 | char last = 0; 405 | 406 | do { 407 | if (isEmpty()) break; 408 | Character c = consume(); 409 | if (last == 0 || last != ESC) { 410 | if (c.equals(open)) { 411 | depth++; 412 | } 413 | else if (c.equals(close)) depth--; 414 | } 415 | 416 | if (depth > 0 && last != 0) accum.append(c); // don't include the outer match pair in the return 417 | last = c; 418 | } while (depth > 0); 419 | return accum.toString(); 420 | } 421 | 422 | /** 423 | * Pulls the next run of whitespace characters of the queue. 424 | * 425 | * @return seen 426 | */ 427 | public boolean consumeWhitespace() { 428 | boolean seen = false; 429 | while (matchesWhitespace()) { 430 | pos++; 431 | seen = true; 432 | } 433 | return seen; 434 | } 435 | 436 | /** 437 | * Retrieves the next run of word type (letter or digit) off the queue. 438 | * 439 | * @return String of word characters from queue, or empty string if none. 
440 | */ 441 | public String consumeWord() { 442 | int start = pos; 443 | while (matchesWord()) pos++; 444 | return queue.substring(start, pos); 445 | } 446 | 447 | /** 448 | * Consume an tag name off the queue (word or :, _, -) 449 | * 450 | * @return tag name 451 | */ 452 | public String consumeTagName() { 453 | int start = pos; 454 | while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) pos++; 455 | 456 | return queue.substring(start, pos); 457 | } 458 | 459 | /** 460 | * Consume a CSS element selector (tag name, but | instead of : for namespaces, to not conflict with :pseudo selects). 461 | * 462 | * @return tag name 463 | */ 464 | public String consumeElementSelector() { 465 | int start = pos; 466 | while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-'))) pos++; 467 | 468 | return queue.substring(start, pos); 469 | } 470 | 471 | public void unConsume(int length) { 472 | Validate.isTrue(length <= pos, "length " + length + " is larger than consumed chars " + pos); 473 | pos -= length; 474 | } 475 | 476 | public void unConsume(String word) { 477 | unConsume(word.length()); 478 | } 479 | 480 | /** 481 | * Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _) 482 | * http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier 483 | * 484 | * @return identifier 485 | */ 486 | public String consumeCssIdentifier() { 487 | int start = pos; 488 | while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) pos++; 489 | 490 | return queue.substring(start, pos); 491 | } 492 | 493 | /** 494 | * Consume an attribute key off the queue (letter, digit, -, _, :") 495 | * 496 | * @return attribute key 497 | */ 498 | public String consumeAttributeKey() { 499 | int start = pos; 500 | while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) pos++; 501 | 502 | return queue.substring(start, pos); 503 | } 504 | 505 | /** 506 | * Consume and return whatever is left on the queue. 507 | * 508 | * @return remained of queue. 
509 | */ 510 | public String remainder() { 511 | StringBuilder accum = new StringBuilder(); 512 | while (!isEmpty()) { 513 | accum.append(consume()); 514 | } 515 | return accum.toString(); 516 | } 517 | 518 | @Override 519 | public String toString() { 520 | return queue.substring(pos); 521 | } 522 | 523 | public boolean containsAny(String... seq) { 524 | for (String s : seq) { 525 | if (queue.contains(s)) { 526 | return true; 527 | } 528 | } 529 | return false; 530 | } 531 | 532 | public String consumeToUnescaped(String str) { 533 | String s = consumeToAny(str); 534 | if (s.length() > 0 && s.charAt(s.length() - 1) == '\\') { 535 | s += consume(); 536 | s += consumeToUnescaped(str); 537 | } 538 | Validate.isTrue(pos < queue.length(), "Unclosed quotes! " + queue); 539 | return s; 540 | } 541 | 542 | public List parseFuncionParams() { 543 | List params = new ArrayList(); 544 | StringBuilder accum = new StringBuilder(); 545 | while (!isEmpty()) { 546 | consumeWhitespace(); 547 | if (matchChomp(",")) { 548 | params.add(accum.toString()); 549 | accum = new StringBuilder(); 550 | } 551 | else if (matchesAny(quotes)) { 552 | String quoteUsed = consumeAny(quotes); 553 | accum.append(quoteUsed); 554 | accum.append(consumeToUnescaped(quoteUsed)); 555 | accum.append(consume()); 556 | } 557 | else { 558 | accum.append(consumeToAny("\"", "'", ",")); 559 | } 560 | } 561 | if (accum.length() > 0) { 562 | params.add(accum.toString()); 563 | } 564 | return params; 565 | } 566 | 567 | /** 568 | * Tests if a code point is "whitespace" as defined in the HTML spec. Used for output HTML. 569 | * Copied from jsoup's org.jsoup.internal.StringUtil. 
570 | * @param c code point to test 571 | * @return true if code point is whitespace, false otherwise 572 | * @see #isActuallyWhitespace(int) 573 | */ 574 | private static boolean isWhitespace(int c){ 575 | return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r'; 576 | } 577 | 578 | // also copied from jsoup's org.jsoup.internal.StringUtil. 579 | private static boolean in(final String needle, final String... haystack) { 580 | final int len = haystack.length; 581 | for (int i = 0; i < len; i++) { 582 | if (haystack[i].equals(needle)) 583 | return true; 584 | } 585 | return false; 586 | } 587 | } 588 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/Xsoup.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.NodeTraversor; 7 | import us.codecraft.xsoup.w3c.NodeAdaptors; 8 | import us.codecraft.xsoup.xevaluator.FormattingVisitor; 9 | import us.codecraft.xsoup.xevaluator.XPathParser; 10 | 11 | /** 12 | * @author code4crafter@gmail.com 13 | */ 14 | public class Xsoup { 15 | 16 | /*------------- XEvaluator --------------- */ 17 | 18 | public static XElements select(Element element, String xpathStr) { 19 | return XPathParser.parse(xpathStr).evaluate(element); 20 | } 21 | 22 | public static XElements select(String html, String xpathStr) { 23 | return XPathParser.parse(xpathStr).evaluate(Jsoup.parse(html)); 24 | } 25 | 26 | public static XPathEvaluator compile(String xpathStr) { 27 | return XPathParser.parse(xpathStr); 28 | } 29 | 30 | /*------------- W3cAdaptor --------------- */ 31 | 32 | public static org.w3c.dom.Element convertElement(Element element) { 33 | return NodeAdaptors.getElement(element); 34 | } 35 | 36 | public static org.w3c.dom.Document convertDocument(Document document) { 37 | return 
NodeAdaptors.getDocument(document); 38 | } 39 | 40 | public static String HtmlToPlainText(Element element) { 41 | FormattingVisitor formatter = new FormattingVisitor(); 42 | NodeTraversor.traverse(formatter, element); 43 | return formatter.toString(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/w3c/AttributeAdaptor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.w3c; 2 | 3 | import org.jsoup.nodes.Attribute; 4 | import org.w3c.dom.Attr; 5 | import org.w3c.dom.DOMException; 6 | import org.w3c.dom.Document; 7 | import org.w3c.dom.Element; 8 | import org.w3c.dom.NamedNodeMap; 9 | import org.w3c.dom.Node; 10 | import org.w3c.dom.NodeList; 11 | import org.w3c.dom.TypeInfo; 12 | 13 | /** 14 | * @author code4crafer@gmail.com 15 | */ 16 | public class AttributeAdaptor extends NodeAdaptor implements Attr { 17 | 18 | private Attribute attribute; 19 | 20 | private org.jsoup.nodes.Element element; 21 | 22 | public AttributeAdaptor(Attribute attribute, org.jsoup.nodes.Element element) { 23 | this.attribute = attribute; 24 | this.element = element; 25 | } 26 | 27 | @Override 28 | public String getName() { 29 | return attribute.getKey(); 30 | } 31 | 32 | @Override 33 | public boolean getSpecified() { 34 | return false; 35 | } 36 | 37 | @Override 38 | public String getValue() { 39 | return attribute.getValue(); 40 | } 41 | 42 | @Override 43 | public void setValue(String value) throws DOMException { 44 | throw new UnsupportedOperationException(); 45 | } 46 | 47 | @Override 48 | public Element getOwnerElement() { 49 | return NodeAdaptors.getElement(element); 50 | } 51 | 52 | @Override 53 | public TypeInfo getSchemaTypeInfo() { 54 | return new DummyTypeInfo(); 55 | } 56 | 57 | @Override 58 | public boolean isId() { 59 | return false; 60 | } 61 | 62 | @Override 63 | public String getNodeName() { 64 | return attribute.getKey(); 
65 | } 66 | 67 | @Override 68 | public String getNodeValue() throws DOMException { 69 | return attribute.getValue(); 70 | } 71 | 72 | @Override 73 | public short getNodeType() { 74 | return ATTRIBUTE_NODE; 75 | } 76 | 77 | @Override 78 | public Node getParentNode() { 79 | return new ElementAdaptor(element); 80 | } 81 | 82 | @Override 83 | public NodeList getChildNodes() { 84 | return null; 85 | } 86 | 87 | @Override 88 | public Node getFirstChild() { 89 | return null; 90 | } 91 | 92 | @Override 93 | public Node getLastChild() { 94 | return null; 95 | } 96 | 97 | @Override 98 | public Node getPreviousSibling() { 99 | return null; 100 | } 101 | 102 | @Override 103 | public Node getNextSibling() { 104 | return null; 105 | } 106 | 107 | @Override 108 | public NamedNodeMap getAttributes() { 109 | return null; 110 | } 111 | 112 | @Override 113 | public Document getOwnerDocument() { 114 | return new DocumentAdaptor(element.ownerDocument()); 115 | } 116 | 117 | @Override 118 | public boolean hasChildNodes() { 119 | return false; 120 | } 121 | 122 | @Override 123 | public Node cloneNode(boolean deep) { 124 | throw new UnsupportedOperationException(); 125 | } 126 | 127 | @Override 128 | public boolean hasAttributes() { 129 | return false; 130 | } 131 | 132 | @Override 133 | public short compareDocumentPosition(Node other) throws DOMException { 134 | return 0; 135 | } 136 | 137 | @Override 138 | public String getTextContent() throws DOMException { 139 | return attribute.getValue(); 140 | } 141 | 142 | @Override 143 | public boolean isSameNode(Node other) { 144 | return false; 145 | } 146 | 147 | @Override 148 | public boolean isEqualNode(Node arg) { 149 | return false; 150 | } 151 | 152 | @Override 153 | public Object getUserData(String key) { 154 | return null; 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/w3c/AttributesAdaptor.java: 
-------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.w3c; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import org.jsoup.nodes.Attribute; 6 | import org.jsoup.nodes.Attributes; 7 | import org.jsoup.nodes.Element; 8 | import org.w3c.dom.Attr; 9 | 10 | /** 11 | * @author code4crafer@gmail.com 12 | */ 13 | public class AttributesAdaptor { 14 | 15 | private Attributes attributes; 16 | 17 | private org.jsoup.nodes.Element element; 18 | 19 | private List attrList; 20 | 21 | public AttributesAdaptor(Attributes attributes, Element element) { 22 | this.attributes = attributes; 23 | this.element = element; 24 | attrList = new ArrayList(); 25 | for (Attribute attribute : attributes) { 26 | attrList.add(new AttributeAdaptor(attribute, element)); 27 | } 28 | } 29 | 30 | public List get() { 31 | return attrList; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/w3c/DocumentAdaptor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.w3c; 2 | 3 | import java.nio.charset.Charset; 4 | import org.w3c.dom.Attr; 5 | import org.w3c.dom.CDATASection; 6 | import org.w3c.dom.Comment; 7 | import org.w3c.dom.DOMConfiguration; 8 | import org.w3c.dom.DOMException; 9 | import org.w3c.dom.DOMImplementation; 10 | import org.w3c.dom.Document; 11 | import org.w3c.dom.DocumentFragment; 12 | import org.w3c.dom.DocumentType; 13 | import org.w3c.dom.Element; 14 | import org.w3c.dom.EntityReference; 15 | import org.w3c.dom.Node; 16 | import org.w3c.dom.ProcessingInstruction; 17 | import org.w3c.dom.Text; 18 | 19 | /** 20 | * @author code4crafer@gmail.com 21 | */ 22 | public class DocumentAdaptor extends ElementAdaptor implements Document { 23 | 24 | private org.jsoup.nodes.Document document; 25 | 26 | public DocumentAdaptor(org.jsoup.nodes.Document document) { 27 | 
super(document); 28 | this.document = document; 29 | } 30 | 31 | @Override 32 | public DocumentType getDoctype() { 33 | return new HtmlDocumentType(document); 34 | } 35 | 36 | @Override 37 | public DOMImplementation getImplementation() { 38 | return null; 39 | } 40 | 41 | @Override 42 | public short getNodeType() { 43 | return DOCUMENT_NODE; 44 | } 45 | 46 | @Override 47 | public Element getDocumentElement() { 48 | return this; 49 | } 50 | 51 | @Override 52 | public Element getElementById(String elementId) { 53 | return NodeAdaptors.getElement(document.getElementById(elementId)); 54 | } 55 | 56 | @Override 57 | public String getInputEncoding() { 58 | return Charset.defaultCharset().name(); 59 | } 60 | 61 | @Override 62 | public String getXmlEncoding() { 63 | return Charset.defaultCharset().name(); 64 | } 65 | 66 | @Override 67 | public boolean getXmlStandalone() { 68 | return false; 69 | } 70 | 71 | @Override 72 | public void setXmlStandalone(boolean xmlStandalone) throws DOMException { 73 | throw new UnsupportedOperationException(); 74 | } 75 | 76 | @Override 77 | public String getXmlVersion() { 78 | //TODO 79 | return null; 80 | } 81 | 82 | @Override 83 | public void setXmlVersion(String xmlVersion) throws DOMException { 84 | throw new UnsupportedOperationException(); 85 | } 86 | 87 | @Override 88 | public boolean getStrictErrorChecking() { 89 | return false; 90 | } 91 | 92 | @Override 93 | public void setStrictErrorChecking(boolean strictErrorChecking) { 94 | throw new UnsupportedOperationException(); 95 | } 96 | 97 | @Override 98 | public String getDocumentURI() { 99 | return document.baseUri(); 100 | } 101 | 102 | @Override 103 | public void setDocumentURI(String documentURI) { 104 | throw new UnsupportedOperationException(); 105 | } 106 | 107 | @Override 108 | public Node adoptNode(Node source) throws DOMException { 109 | throw new UnsupportedOperationException(); 110 | } 111 | 112 | @Override 113 | public DOMConfiguration getDomConfig() { 114 | return null; 
115 | } 116 | 117 | @Override 118 | public void normalizeDocument() { 119 | 120 | } 121 | 122 | @Override 123 | public Node renameNode(Node n, String namespaceURI, String qualifiedName) throws DOMException { 124 | throw new UnsupportedOperationException(); 125 | } 126 | 127 | @Override 128 | public Element createElement(String tagName) throws DOMException { 129 | throw new UnsupportedOperationException(); 130 | } 131 | 132 | @Override 133 | public DocumentFragment createDocumentFragment() { 134 | throw new UnsupportedOperationException(); 135 | } 136 | 137 | @Override 138 | public Text createTextNode(String data) { 139 | throw new UnsupportedOperationException(); 140 | } 141 | 142 | @Override 143 | public Comment createComment(String data) { 144 | throw new UnsupportedOperationException(); 145 | } 146 | 147 | @Override 148 | public CDATASection createCDATASection(String data) throws DOMException { 149 | throw new UnsupportedOperationException(); 150 | } 151 | 152 | @Override 153 | public ProcessingInstruction createProcessingInstruction(String target, String data) throws DOMException { 154 | throw new UnsupportedOperationException(); 155 | } 156 | 157 | @Override 158 | public Attr createAttribute(String name) throws DOMException { 159 | throw new UnsupportedOperationException(); 160 | } 161 | 162 | @Override 163 | public EntityReference createEntityReference(String name) throws DOMException { 164 | throw new UnsupportedOperationException(); 165 | } 166 | 167 | @Override 168 | public Node importNode(Node importedNode, boolean deep) throws DOMException { 169 | throw new UnsupportedOperationException(); 170 | } 171 | 172 | @Override 173 | public Element createElementNS(String namespaceURI, String qualifiedName) throws DOMException { 174 | throw new UnsupportedOperationException(); 175 | } 176 | 177 | @Override 178 | public Attr createAttributeNS(String namespaceURI, String qualifiedName) throws DOMException { 179 | throw new UnsupportedOperationException(); 180 | } 
181 | } 182 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/w3c/DummyTypeInfo.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.w3c; 2 | 3 | import org.w3c.dom.TypeInfo; 4 | 5 | /** 6 | * @author code4crafer@gmail.com 7 | */ 8 | public class DummyTypeInfo implements TypeInfo { 9 | 10 | private static final DummyTypeInfo INSTANCE = new DummyTypeInfo(); 11 | 12 | public static DummyTypeInfo getInstance() { 13 | return INSTANCE; 14 | } 15 | 16 | @Override 17 | public String getTypeName() { 18 | return null; 19 | } 20 | 21 | @Override 22 | public String getTypeNamespace() { 23 | return null; 24 | } 25 | 26 | @Override 27 | public boolean isDerivedFrom(String typeNamespaceArg, String typeNameArg, int derivationMethod) { 28 | return false; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/w3c/ElementAdaptor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.w3c; 2 | 3 | import org.jsoup.nodes.Attribute; 4 | import org.w3c.dom.Attr; 5 | import org.w3c.dom.DOMException; 6 | import org.w3c.dom.Document; 7 | import org.w3c.dom.Element; 8 | import org.w3c.dom.NamedNodeMap; 9 | import org.w3c.dom.Node; 10 | import org.w3c.dom.NodeList; 11 | import org.w3c.dom.TypeInfo; 12 | 13 | /** 14 | * @author code4crafer@gmail.com 15 | */ 16 | public class ElementAdaptor extends NodeAdaptor implements Element { 17 | 18 | private org.jsoup.nodes.Element element; 19 | 20 | public ElementAdaptor(org.jsoup.nodes.Element element) { 21 | this.element = element; 22 | } 23 | 24 | @Override 25 | public String getTagName() { 26 | return element.tagName(); 27 | } 28 | 29 | @Override 30 | public String getAttribute(String name) { 31 | return element.attr(name); 32 | } 33 | 34 | @Override 35 | public Attr 
getAttributeNode(String name) { 36 | if (element.attr(name) == null) { 37 | return null; 38 | } 39 | return NodeAdaptors.getAttr(new Attribute(name, element.attr(name)), element); 40 | } 41 | 42 | @Override 43 | public NodeList getElementsByTagName(String name) { 44 | return NodeAdaptors.getNodeList(element.getElementsByTag(name)); 45 | } 46 | 47 | @Override 48 | public boolean hasAttribute(String name) { 49 | return element.hasAttr(name); 50 | } 51 | 52 | @Override 53 | public TypeInfo getSchemaTypeInfo() { 54 | return DummyTypeInfo.getInstance(); 55 | } 56 | 57 | @Override 58 | public String getNodeName() { 59 | return element.nodeName(); 60 | } 61 | 62 | @Override 63 | public String getNodeValue() throws DOMException { 64 | return element.outerHtml(); 65 | } 66 | 67 | @Override 68 | public short getNodeType() { 69 | return ELEMENT_NODE; 70 | } 71 | 72 | @Override 73 | public Node getParentNode() { 74 | return NodeAdaptors.getElement(element.parent()); 75 | } 76 | 77 | @Override 78 | public NodeList getChildNodes() { 79 | return NodeAdaptors.getNodeList(element.childNodes()); 80 | } 81 | 82 | @Override 83 | public Node getFirstChild() { 84 | if (element.children().isEmpty()) { 85 | return null; 86 | } 87 | return NodeAdaptors.getNode(element.child(0)); 88 | } 89 | 90 | @Override 91 | public Node getLastChild() { 92 | if (element.children().isEmpty()) { 93 | return null; 94 | } 95 | return NodeAdaptors.getNode(element.child(element.childNodeSize())); 96 | } 97 | 98 | @Override 99 | public Node getPreviousSibling() { 100 | return NodeAdaptors.getNode(element.previousSibling()); 101 | } 102 | 103 | @Override 104 | public Node getNextSibling() { 105 | return NodeAdaptors.getNode(element.nextSibling()); 106 | } 107 | 108 | @Override 109 | public NamedNodeMap getAttributes() { 110 | return NodeAdaptors.getNamedNodeMap(NodeAdaptors.getAttributes(element.attributes(), element)); 111 | } 112 | 113 | @Override 114 | public String getTextContent() throws DOMException { 115 
| return element.text(); 116 | } 117 | 118 | @Override 119 | public Document getOwnerDocument() { 120 | return NodeAdaptors.getDocument(element.ownerDocument()); 121 | } 122 | 123 | @Override 124 | public boolean hasChildNodes() { 125 | return !element.children().isEmpty(); 126 | } 127 | 128 | @Override 129 | public Node cloneNode(boolean deep) { 130 | return null; 131 | } 132 | 133 | @Override 134 | public boolean hasAttributes() { 135 | return true; 136 | } 137 | 138 | @Override 139 | public short compareDocumentPosition(Node other) throws DOMException { 140 | return 0; 141 | } 142 | 143 | @Override 144 | public boolean isSameNode(Node other) { 145 | return false; 146 | } 147 | 148 | @Override 149 | public boolean isEqualNode(Node arg) { 150 | return false; 151 | } 152 | 153 | @Override 154 | public Object getUserData(String key) { 155 | return null; 156 | } 157 | 158 | /*----------------------------- update - not support-------------*/ 159 | @Override 160 | public void setAttribute(String name, String value) throws DOMException { 161 | throw new UnsupportedOperationException(); 162 | } 163 | 164 | @Override 165 | public void removeAttribute(String name) throws DOMException { 166 | throw new UnsupportedOperationException(); 167 | } 168 | 169 | @Override 170 | public Attr setAttributeNode(Attr newAttr) throws DOMException { 171 | throw new UnsupportedOperationException(); 172 | } 173 | 174 | @Override 175 | public Attr removeAttributeNode(Attr oldAttr) throws DOMException { 176 | throw new UnsupportedOperationException(); 177 | } 178 | 179 | @Override 180 | public void setAttributeNS(String namespaceURI, String qualifiedName, String value) throws DOMException { 181 | throw new UnsupportedOperationException(); 182 | } 183 | 184 | @Override 185 | public void removeAttributeNS(String namespaceURI, String localName) throws DOMException { 186 | throw new UnsupportedOperationException(); 187 | } 188 | 189 | @Override 190 | public Attr setAttributeNodeNS(Attr newAttr) 
throws DOMException { 191 | throw new UnsupportedOperationException(); 192 | } 193 | 194 | @Override 195 | public void setIdAttribute(String name, boolean isId) throws DOMException { 196 | throw new UnsupportedOperationException(); 197 | } 198 | 199 | @Override 200 | public void setIdAttributeNS(String namespaceURI, String localName, boolean isId) throws DOMException { 201 | throw new UnsupportedOperationException(); 202 | } 203 | 204 | @Override 205 | public void setIdAttributeNode(Attr idAttr, boolean isId) throws DOMException { 206 | throw new UnsupportedOperationException(); 207 | } 208 | 209 | /*--------------------- NS not supported ----------------*/ 210 | 211 | @Override 212 | public String getAttributeNS(String namespaceURI, String localName) throws DOMException { 213 | throw new UnsupportedOperationException(); 214 | } 215 | 216 | @Override 217 | public Attr getAttributeNodeNS(String namespaceURI, String localName) throws DOMException { 218 | throw new UnsupportedOperationException(); 219 | } 220 | 221 | @Override 222 | public NodeList getElementsByTagNameNS(String namespaceURI, String localName) throws DOMException { 223 | throw new UnsupportedOperationException(); 224 | } 225 | 226 | @Override 227 | public boolean hasAttributeNS(String namespaceURI, String localName) throws DOMException { 228 | throw new UnsupportedOperationException(); 229 | } 230 | } 231 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/w3c/HtmlDocumentType.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.w3c; 2 | 3 | import org.w3c.dom.DOMException; 4 | import org.w3c.dom.Document; 5 | import org.w3c.dom.DocumentType; 6 | import org.w3c.dom.NamedNodeMap; 7 | import org.w3c.dom.Node; 8 | import org.w3c.dom.NodeList; 9 | 10 | /** 11 | * @author code4crafer@gmail.com 12 | */ 13 | public class HtmlDocumentType extends NodeAdaptor implements DocumentType 
{ 14 | 15 | private org.jsoup.nodes.Document document; 16 | 17 | public HtmlDocumentType(org.jsoup.nodes.Document document) { 18 | this.document = document; 19 | } 20 | 21 | @Override 22 | public String getNodeName() { 23 | return "html"; 24 | } 25 | 26 | @Override 27 | public String getNodeValue() throws DOMException { 28 | return null; 29 | } 30 | 31 | @Override 32 | public short getNodeType() { 33 | return DOCUMENT_TYPE_NODE; 34 | } 35 | 36 | @Override 37 | public Node getParentNode() { 38 | return null; 39 | } 40 | 41 | @Override 42 | public NodeList getChildNodes() { 43 | return null; 44 | } 45 | 46 | @Override 47 | public Node getFirstChild() { 48 | return null; 49 | } 50 | 51 | @Override 52 | public Node getLastChild() { 53 | return null; 54 | } 55 | 56 | @Override 57 | public Node getPreviousSibling() { 58 | return null; 59 | } 60 | 61 | @Override 62 | public Node getNextSibling() { 63 | return null; 64 | } 65 | 66 | @Override 67 | public NamedNodeMap getAttributes() { 68 | return null; 69 | } 70 | 71 | @Override 72 | public Document getOwnerDocument() { 73 | return NodeAdaptors.getDocument(document); 74 | } 75 | 76 | @Override 77 | public boolean hasChildNodes() { 78 | return false; 79 | } 80 | 81 | @Override 82 | public Node cloneNode(boolean deep) { 83 | return null; 84 | } 85 | 86 | @Override 87 | public boolean hasAttributes() { 88 | return false; 89 | } 90 | 91 | @Override 92 | public short compareDocumentPosition(Node other) throws DOMException { 93 | return 0; 94 | } 95 | 96 | @Override 97 | public String getTextContent() throws DOMException { 98 | return document.text(); 99 | } 100 | 101 | @Override 102 | public boolean isSameNode(Node other) { 103 | return false; 104 | } 105 | 106 | @Override 107 | public boolean isEqualNode(Node arg) { 108 | return false; 109 | } 110 | 111 | @Override 112 | public Object getUserData(String key) { 113 | return null; 114 | } 115 | 116 | @Override 117 | public String getName() { 118 | return "html"; 119 | } 120 | 
/**
 * Read-only {@link NamedNodeMap} backed by a list of already-adapted W3C nodes.
 *
 * <p>Positional access ({@link #item(int)}) follows the order of the supplied list; lookup by
 * name uses an index built once in the constructor from each node's {@link Node#getNodeName()}.
 * If several nodes share a name, the last one in the list wins for name lookup. Mutating and
 * namespace-aware operations throw {@link UnsupportedOperationException}.
 *
 * @author code4crafer@gmail.com
 */
public class NamedNodeMapAdaptor implements NamedNodeMap {

    /** Nodes in positional order. */
    private final List<? extends Node> nodeList;

    /** Index from node name to node. */
    private final Map<String, Node> nodeMap;

    /**
     * @param nodeList the nodes to expose; must not be {@code null}
     */
    public NamedNodeMapAdaptor(List<? extends Node> nodeList) {
        this.nodeList = nodeList;
        this.nodeMap = new HashMap<String, Node>(nodeList.size());
        for (Node node : nodeList) {
            nodeMap.put(node.getNodeName(), node);
        }
    }

    /** @return the node with the given name, or {@code null} if there is none */
    @Override
    public Node getNamedItem(String name) {
        return nodeMap.get(name);
    }

    @Override
    public Node setNamedItem(Node arg) throws DOMException {
        throw new UnsupportedOperationException();
    }

    @Override
    public Node removeNamedItem(String name) throws DOMException {
        throw new UnsupportedOperationException();
    }

    /**
     * @return the node at {@code index}, or {@code null} when {@code index} is negative or
     *         not less than {@link #getLength()}
     */
    @Override
    public Node item(int index) {
        // The DOM contract for NamedNodeMap.item is null for an invalid index, not an exception.
        if (index < 0 || index >= nodeList.size()) {
            return null;
        }
        return nodeList.get(index);
    }

    @Override
    public int getLength() {
        return nodeList.size();
    }

    @Override
    public Node getNamedItemNS(String namespaceURI, String localName) throws DOMException {
        throw new UnsupportedOperationException();
    }

    @Override
    public Node setNamedItemNS(Node arg) throws DOMException {
        throw new UnsupportedOperationException();
    }

    @Override
    public Node removeNamedItemNS(String namespaceURI, String localName) throws DOMException {
        throw new UnsupportedOperationException();
    }
}
59 | throw new UnsupportedOperationException(); 60 | } 61 | 62 | @Override 63 | public String getLocalName() { 64 | return null; 65 | } 66 | 67 | @Override 68 | public String getBaseURI() { 69 | return null; 70 | } 71 | 72 | @Override 73 | public void setTextContent(String textContent) throws DOMException { 74 | throw new UnsupportedOperationException(); 75 | } 76 | 77 | @Override 78 | public String lookupPrefix(String namespaceURI) { 79 | return null; 80 | } 81 | 82 | @Override 83 | public boolean isDefaultNamespace(String namespaceURI) { 84 | return false; 85 | } 86 | 87 | @Override 88 | public String lookupNamespaceURI(String prefix) { 89 | return null; 90 | } 91 | 92 | @Override 93 | public Object getFeature(String feature, String version) { 94 | return null; 95 | } 96 | 97 | @Override 98 | public Object setUserData(String key, Object data, UserDataHandler handler) { 99 | throw new UnsupportedOperationException(); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/w3c/NodeAdaptors.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.w3c; 2 | 3 | import java.util.List; 4 | import org.jsoup.nodes.Attribute; 5 | import org.jsoup.nodes.Attributes; 6 | import org.jsoup.nodes.Element; 7 | import org.jsoup.select.Elements; 8 | import org.w3c.dom.Attr; 9 | import org.w3c.dom.Document; 10 | import org.w3c.dom.NamedNodeMap; 11 | import org.w3c.dom.Node; 12 | import org.w3c.dom.NodeList; 13 | 14 | /** 15 | * @author code4crafer@gmail.com 16 | */ 17 | public class NodeAdaptors { 18 | 19 | public static Node getNode(org.jsoup.nodes.Node node) { 20 | if (node == null) { 21 | return null; 22 | } 23 | if (node instanceof Element) { 24 | return new ElementAdaptor((Element) node); 25 | } 26 | return null; 27 | } 28 | 29 | public static org.w3c.dom.Element getElement(Element element) { 30 | if (element == null) { 31 | return null; 
32 | } 33 | return new ElementAdaptor(element); 34 | } 35 | 36 | public static Document getDocument(org.jsoup.nodes.Document document) { 37 | if (document == null) { 38 | return null; 39 | } 40 | return new DocumentAdaptor(document); 41 | } 42 | 43 | public static NodeList getNodeList(Elements elements) { 44 | if (elements == null || elements.size() == 0) { 45 | return null; 46 | } 47 | return new NodeListAdaptor(elements); 48 | } 49 | 50 | public static NodeList getNodeList(List elements) { 51 | if (elements == null || elements.size() == 0) { 52 | return null; 53 | } 54 | return new NodeListAdaptor(elements); 55 | } 56 | 57 | public static Attr getAttr(Attribute attr, Element element) { 58 | if (attr == null || element == null) { 59 | return null; 60 | } 61 | return new AttributeAdaptor(attr, element); 62 | } 63 | 64 | public static NamedNodeMap getNamedNodeMap(List nodeList) { 65 | if (nodeList == null || nodeList == null) { 66 | return null; 67 | } 68 | return new NamedNodeMapAdaptor(nodeList); 69 | } 70 | 71 | public static List getAttributes(Attributes attrs, Element element) { 72 | if (attrs == null || element == null) { 73 | return null; 74 | } 75 | return new AttributesAdaptor(attrs, element).get(); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/w3c/NodeListAdaptor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.w3c; 2 | 3 | import java.util.List; 4 | import org.w3c.dom.Node; 5 | import org.w3c.dom.NodeList; 6 | 7 | /** 8 | * @author code4crafer@gmail.com 9 | */ 10 | public class NodeListAdaptor implements NodeList { 11 | 12 | private List nodes; 13 | 14 | public NodeListAdaptor(List nodes) { 15 | this.nodes = nodes; 16 | } 17 | 18 | @Override 19 | public Node item(int index) { 20 | return NodeAdaptors.getNode(nodes.get(index)); 21 | } 22 | 23 | @Override 24 | public int getLength() { 25 | return 
nodes.size(); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/CombingXPathEvaluator.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.xevaluator; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | import org.jsoup.nodes.Element; 7 | import us.codecraft.xsoup.XElements; 8 | import us.codecraft.xsoup.XPathEvaluator; 9 | 10 | /** 11 | * @author code4crafter@gmail.com 12 | */ 13 | public class CombingXPathEvaluator implements XPathEvaluator { 14 | 15 | private List xPathEvaluators; 16 | 17 | public CombingXPathEvaluator(List xPathEvaluators) { 18 | this.xPathEvaluators = xPathEvaluators; 19 | } 20 | 21 | public CombingXPathEvaluator(XPathEvaluator... xPathEvaluators) { 22 | this.xPathEvaluators = Arrays.asList(xPathEvaluators); 23 | } 24 | 25 | @Override 26 | public XElements evaluate(Element element) { 27 | List xElementses = new ArrayList(); 28 | for (XPathEvaluator xPathEvaluator : xPathEvaluators) { 29 | xElementses.add(xPathEvaluator.evaluate(element)); 30 | } 31 | return new CombiningDefaultXElements(xElementses); 32 | } 33 | 34 | @Override 35 | public boolean hasAttribute() { 36 | for (XPathEvaluator xPathEvaluator : xPathEvaluators) { 37 | if (xPathEvaluator.hasAttribute()) { 38 | return true; 39 | } 40 | } 41 | return false; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/CombiningDefaultXElements.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.xevaluator; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | import org.jsoup.select.Elements; 7 | import us.codecraft.xsoup.XElements; 8 | 9 | /** 10 | * @author code4crafter@gmail.com 11 | */ 12 | 
public class CombiningDefaultXElements implements XElements { 13 | 14 | private List elementsList; 15 | 16 | public CombiningDefaultXElements(List elementsList) { 17 | this.elementsList = elementsList; 18 | } 19 | 20 | public CombiningDefaultXElements(XElements... elementsList) { 21 | this.elementsList = Arrays.asList(elementsList); 22 | } 23 | 24 | @Override 25 | public String get() { 26 | for (XElements xElements : elementsList) { 27 | String result = xElements.get(); 28 | if (result != null) { 29 | return result; 30 | } 31 | } 32 | return null; 33 | } 34 | 35 | @Override 36 | public List list() { 37 | List results = new ArrayList(); 38 | for (XElements xElements : elementsList) { 39 | results.addAll(xElements.list()); 40 | } 41 | return results; 42 | } 43 | 44 | public Elements getElements() { 45 | Elements elements = new Elements(); 46 | for (XElements xElements : elementsList) { 47 | elements.addAll(xElements.getElements()); 48 | } 49 | return elements; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/CombiningEvaluator.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.xevaluator; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.Collection; 6 | import java.util.List; 7 | 8 | import org.jsoup.internal.StringUtil; 9 | import org.jsoup.nodes.Element; 10 | import org.jsoup.select.Evaluator; 11 | 12 | /** 13 | * Base combining (and, or) evaluator. 14 | *

15 | * Copy from {@link org.jsoup.select.CombiningEvaluator} because it is package visible 16 | * 17 | * @see org.jsoup.select.CombiningEvaluator 18 | */ 19 | abstract class CombiningEvaluator extends Evaluator { 20 | final List evaluators; 21 | 22 | CombiningEvaluator() { 23 | super(); 24 | evaluators = new ArrayList(); 25 | } 26 | 27 | CombiningEvaluator(Collection evaluators) { 28 | this(); 29 | this.evaluators.addAll(evaluators); 30 | } 31 | 32 | Evaluator rightMostEvaluator() { 33 | return evaluators.size() > 0 ? evaluators.get(evaluators.size() - 1) : null; 34 | } 35 | 36 | void replaceRightMostEvaluator(Evaluator replacement) { 37 | evaluators.set(evaluators.size() - 1, replacement); 38 | } 39 | 40 | static final class And extends CombiningEvaluator { 41 | And(Collection evaluators) { 42 | super(evaluators); 43 | } 44 | 45 | And(Evaluator... evaluators) { 46 | this(Arrays.asList(evaluators)); 47 | } 48 | 49 | @Override 50 | public boolean matches(Element root, Element node) { 51 | for (int i = 0; i < evaluators.size(); i++) { 52 | Evaluator s = evaluators.get(i); 53 | if (!s.matches(root, node)) return false; 54 | } 55 | return true; 56 | } 57 | 58 | @Override 59 | public String toString() { 60 | return StringUtil.join(evaluators, " "); 61 | } 62 | } 63 | 64 | static final class Or extends CombiningEvaluator { 65 | 66 | Or(Collection evaluators) { 67 | super(); 68 | this.evaluators.addAll(evaluators); 69 | } 70 | 71 | Or(Evaluator... 
evaluators) { 72 | this(Arrays.asList(evaluators)); 73 | } 74 | 75 | Or() { 76 | super(); 77 | } 78 | 79 | public void add(Evaluator e) { 80 | evaluators.add(e); 81 | } 82 | 83 | @Override 84 | public boolean matches(Element root, Element node) { 85 | for (int i = 0; i < evaluators.size(); i++) { 86 | Evaluator s = evaluators.get(i); 87 | if (s.matches(root, node)) return true; 88 | } 89 | return false; 90 | } 91 | 92 | @Override 93 | public String toString() { 94 | return String.format(":or%s", evaluators); 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/DefaultXElement.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.xevaluator; 2 | 3 | import org.jsoup.nodes.Element; 4 | import us.codecraft.xsoup.XElement; 5 | 6 | /** 7 | * XPath result. 8 | * 9 | * @author code4crafter@gmail.com 10 | */ 11 | public class DefaultXElement implements XElement { 12 | 13 | private Element element; 14 | 15 | private ElementOperator elementOperator; 16 | 17 | public DefaultXElement(Element element, ElementOperator elementOperator) { 18 | this.element = element; 19 | this.elementOperator = elementOperator; 20 | } 21 | 22 | @Override 23 | public String get() { 24 | return get(elementOperator); 25 | } 26 | 27 | protected String get(ElementOperator elementOperator) { 28 | if (elementOperator == null) { 29 | return element.toString(); 30 | } 31 | else { 32 | return elementOperator.operate(element); 33 | } 34 | } 35 | 36 | public String toString() { 37 | return get(); 38 | } 39 | 40 | @Override 41 | public Element getElement() { 42 | return element; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/DefaultXElements.java: -------------------------------------------------------------------------------- 1 | package 
us.codecraft.xsoup.xevaluator; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.Elements; 7 | import us.codecraft.xsoup.XElement; 8 | import us.codecraft.xsoup.XElements; 9 | 10 | /** 11 | * XPath results. 12 | * 13 | * @author code4crafter@gmail.com 14 | */ 15 | public class DefaultXElements extends ArrayList implements XElements { 16 | 17 | private Elements elements; 18 | 19 | private ElementOperator elementOperator; 20 | 21 | public DefaultXElements(Elements elements, ElementOperator elementOperator) { 22 | this.elements = elements; 23 | this.elementOperator = elementOperator; 24 | initList(); 25 | } 26 | 27 | private void initList() { 28 | for (Element element : elements) { 29 | this.add(new DefaultXElement(element, elementOperator)); 30 | } 31 | } 32 | 33 | @Override 34 | public String get() { 35 | if (size() < 1) { 36 | return null; 37 | } 38 | else { 39 | return get(0).get(); 40 | } 41 | } 42 | 43 | @Override 44 | public List list() { 45 | List resultStrings = new ArrayList(); 46 | for (XElement xElement : this) { 47 | String text = xElement.get(); 48 | if (text != null) { 49 | resultStrings.add(text); 50 | } 51 | } 52 | return resultStrings; 53 | } 54 | 55 | @Override 56 | public String toString() { 57 | return get(); 58 | } 59 | 60 | @Override 61 | public Elements getElements() { 62 | return elements; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/DefaultXPathEvaluator.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.xevaluator; 2 | 3 | import org.jsoup.nodes.Element; 4 | import org.jsoup.select.Collector; 5 | import org.jsoup.select.Elements; 6 | import org.jsoup.select.Evaluator; 7 | import us.codecraft.xsoup.XElements; 8 | import us.codecraft.xsoup.XPathEvaluator; 9 | 10 | /** 11 | * @author 
code4crafter@gmail.com 12 | */ 13 | public class DefaultXPathEvaluator implements XPathEvaluator { 14 | 15 | private Evaluator evaluator; 16 | 17 | private ElementOperator elementOperator; 18 | 19 | public DefaultXPathEvaluator(Evaluator evaluator, ElementOperator elementOperator) { 20 | this.evaluator = evaluator; 21 | this.elementOperator = elementOperator; 22 | } 23 | 24 | @Override 25 | public XElements evaluate(Element element) { 26 | Elements elements; 27 | if (evaluator != null) { 28 | elements = Collector.collect(evaluator, element); 29 | } 30 | else { 31 | elements = new Elements(); 32 | elements.add(element); 33 | } 34 | return new DefaultXElements(elements, elementOperator); 35 | } 36 | 37 | @Override 38 | public boolean hasAttribute() { 39 | return elementOperator != null; 40 | } 41 | 42 | public Evaluator getEvaluator() { 43 | return evaluator; 44 | } 45 | 46 | public String getAttribute() { 47 | if (elementOperator == null) { 48 | return null; 49 | } 50 | return elementOperator.toString(); 51 | } 52 | 53 | public ElementOperator getElementOperator() { 54 | return elementOperator; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/ElementOperator.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.xevaluator; 2 | 3 | import java.util.regex.Matcher; 4 | import java.util.regex.Pattern; 5 | 6 | import org.jsoup.helper.Validate; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.nodes.Node; 9 | import org.jsoup.nodes.TextNode; 10 | 11 | import us.codecraft.xsoup.Xsoup; 12 | 13 | /** 14 | * Operate on element to get XPath result. 
15 | * 16 | * @author code4crafter@gmail.com 17 | */ 18 | public abstract class ElementOperator { 19 | 20 | public abstract String operate(Element element); 21 | 22 | public static class AttributeGetter extends ElementOperator { 23 | 24 | private String attribute; 25 | 26 | public AttributeGetter(String attribute) { 27 | this.attribute = attribute; 28 | } 29 | 30 | @Override 31 | public String operate(Element element) { 32 | return element.attr(attribute); 33 | } 34 | 35 | @Override 36 | public String toString() { 37 | return "@" + attribute; 38 | } 39 | } 40 | 41 | public static class AllText extends ElementOperator { 42 | 43 | @Override 44 | public String operate(Element element) { 45 | return element.text(); 46 | } 47 | 48 | @Override 49 | public String toString() { 50 | return "allText()"; 51 | } 52 | } 53 | 54 | public static class Html extends ElementOperator { 55 | 56 | @Override 57 | public String operate(Element element) { 58 | return element.html(); 59 | } 60 | 61 | @Override 62 | public String toString() { 63 | return "html()"; 64 | } 65 | } 66 | 67 | public static class OuterHtml extends ElementOperator { 68 | 69 | @Override 70 | public String operate(Element element) { 71 | return element.outerHtml(); 72 | } 73 | 74 | @Override 75 | public String toString() { 76 | return "outerHtml()"; 77 | } 78 | } 79 | 80 | public static class TidyText extends ElementOperator { 81 | 82 | @Override 83 | public String operate(Element element) { 84 | //FormattingVisitor formatter = new FormattingVisitor(); 85 | //NodeTraversor.traverse(formatter, element); 86 | //return formatter.toString(); 87 | //return new HtmlToPlainText().getPlainText(element); 88 | return Xsoup.HtmlToPlainText(element); 89 | } 90 | 91 | @Override 92 | public String toString() { 93 | return "tidyText()"; 94 | } 95 | } 96 | 97 | public static class GroupedText extends ElementOperator { 98 | 99 | private int group; 100 | 101 | public GroupedText(int group) { 102 | this.group = group; 103 | } 104 | 
105 | @Override 106 | public String operate(Element element) { 107 | int index = 0; 108 | StringBuilder accum = new StringBuilder(); 109 | for (Node node : element.childNodes()) { 110 | if (node instanceof TextNode) { 111 | TextNode textNode = (TextNode) node; 112 | if (group == 0) { 113 | accum.append(textNode.text()); 114 | } 115 | else if (++index == group) { 116 | return textNode.text(); 117 | } 118 | } 119 | } 120 | return accum.toString(); 121 | } 122 | 123 | @Override 124 | public String toString() { 125 | return String.format("text(%d)", group); 126 | } 127 | } 128 | 129 | /** 130 | * usage: 131 | *
132 | * regex('.*') 133 | * regex(@attr,'.*') 134 | * regex(@attr,'.*',group) 135 | */ 136 | public static class Regex extends ElementOperator { 137 | 138 | private Pattern pattern; 139 | 140 | private String attribute; 141 | 142 | private int group; 143 | 144 | public Regex(String expr) { 145 | this.pattern = Pattern.compile(expr); 146 | } 147 | 148 | public Regex(String expr, String attribute) { 149 | this.attribute = attribute; 150 | this.pattern = Pattern.compile(expr); 151 | } 152 | 153 | public Regex(String expr, String attribute, int group) { 154 | this.attribute = attribute; 155 | this.pattern = Pattern.compile(expr); 156 | this.group = group; 157 | } 158 | 159 | @Override 160 | public String operate(Element element) { 161 | Matcher matcher = pattern.matcher(getSource(element)); 162 | if (matcher.find()) { 163 | return matcher.group(group); 164 | } 165 | return null; 166 | } 167 | 168 | protected String getSource(Element element) { 169 | if (attribute == null) { 170 | return element.outerHtml(); 171 | } 172 | else { 173 | String attr = element.attr(attribute); 174 | Validate.notNull(attr, "Attribute " + attribute + " of " + element + " is not exist!"); 175 | return attr; 176 | } 177 | } 178 | 179 | @Override 180 | public String toString() { 181 | return String.format("regex(%s%s%s)", 182 | attribute != null ? "@" + attribute + "," : "", pattern.toString(), 183 | group != 0 ? 
"," + group : ""); 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/FormattingVisitor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.xevaluator; 2 | 3 | import org.jsoup.internal.StringUtil; 4 | import org.jsoup.nodes.Node; 5 | import org.jsoup.nodes.TextNode; 6 | import org.jsoup.select.NodeVisitor; 7 | 8 | /** 9 | * @author waincent 10 | * @since 2018-04-08 11 | */ 12 | public class FormattingVisitor implements NodeVisitor { 13 | private static final int maxWidth = 80; 14 | private int width = 0; 15 | private StringBuilder accum = new StringBuilder(); // holds the accumulated text 16 | 17 | // hit when the node is first seen 18 | @Override 19 | public void head(Node node, int depth) { 20 | String name = node.nodeName(); 21 | if (node instanceof TextNode) { 22 | append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. 23 | } 24 | else if (name.equals("li")) { 25 | append("\n * "); 26 | } 27 | else if (name.equals("dt")) { 28 | append(" "); 29 | } 30 | else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) append("\n"); 31 | } 32 | 33 | // hit when all of the node's children (if any) have been visited 34 | @Override 35 | public void tail(Node node, int depth) { 36 | String name = node.nodeName(); 37 | if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5")) { 38 | append("\n"); 39 | } 40 | else if (name.equals("a")) append(String.format(" <%s>", node.absUrl("href"))); 41 | } 42 | 43 | // appends text to the string builder with a simple word wrap method 44 | private void append(String text) { 45 | if (text.startsWith("\n")) { 46 | width = 0; // reset counter if starts with a newline. 
only from formats above, not in natural text 47 | } 48 | if (text.equals(" ") && (accum.length() == 0 || StringUtil.in(accum.substring(accum.length() 49 | - 1), " ", "\n"))) { 50 | return; // don't accumulate long runs of empty spaces 51 | } 52 | 53 | if (text.length() + width > maxWidth) { // won't fit, needs to wrap 54 | String words[] = text.split("\\s+"); 55 | for (int i = 0; i < words.length; i++) { 56 | String word = words[i]; 57 | boolean last = i == words.length - 1; 58 | if (!last) // insert a space if not the last word 59 | { 60 | word = word + " "; 61 | } 62 | if (word.length() + width > maxWidth) { // wrap and reset counter 63 | accum.append("\n").append(word); 64 | width = word.length(); 65 | } 66 | else { 67 | accum.append(word); 68 | width += word.length(); 69 | } 70 | } 71 | } 72 | else { // fits as is, without need to wrap text 73 | accum.append(text); 74 | width += text.length(); 75 | } 76 | } 77 | 78 | @Override 79 | public String toString() { 80 | return accum.toString(); 81 | } 82 | } -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/HtmlToPlainText.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.xevaluator; 2 | 3 | import java.io.IOException; 4 | 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.helper.Validate; 7 | import org.jsoup.internal.StringUtil; 8 | import org.jsoup.nodes.Document; 9 | import org.jsoup.nodes.Element; 10 | import org.jsoup.nodes.Node; 11 | import org.jsoup.nodes.TextNode; 12 | import org.jsoup.select.Elements; 13 | import org.jsoup.select.NodeTraversor; 14 | import org.jsoup.select.NodeVisitor; 15 | 16 | /** 17 | * HTML to plain-text. This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted 18 | * plain-text. That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a 19 | * scrape. 20 | *

21 | * Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend. 22 | *

23 | *

24 | * To invoke from the command line, assuming you've downloaded the jsoup jar to your current directory:

25 | *

java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]

26 | * where url is the URL to fetch, and selector is an optional CSS selector. 27 | * 28 | * @author Jonathan Hedley, jonathan@hedley.net 29 | */ 30 | public class HtmlToPlainText { 31 | private static final String userAgent = "Mozilla/5.0 (jsoup)"; 32 | private static final int timeout = 5 * 1000; 33 | 34 | public static void main(String... args) throws IOException { 35 | Validate.isTrue(args.length == 1 || args.length == 2, "usage: java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]"); 36 | final String url = args[0]; 37 | final String selector = args.length == 2 ? args[1] : null; 38 | 39 | // fetch the specified URL and parse to a HTML DOM 40 | Document doc = Jsoup.connect(url).userAgent(userAgent).timeout(timeout).get(); 41 | 42 | HtmlToPlainText formatter = new HtmlToPlainText(); 43 | 44 | if (selector != null) { 45 | Elements elements = doc.select(selector); // get each element that matches the CSS selector 46 | for (Element element : elements) { 47 | String plainText = formatter.getPlainText(element); // format that element to plain text 48 | System.out.println(plainText); 49 | } 50 | } else { // format the whole doc 51 | String plainText = formatter.getPlainText(doc); 52 | System.out.println(plainText); 53 | } 54 | } 55 | 56 | /** 57 | * Format an Element to plain-text 58 | * @param element the root element to format 59 | * @return formatted text 60 | */ 61 | public String getPlainText(Element element) { 62 | FormattingVisitor formatter = new FormattingVisitor(); 63 | NodeTraversor.traverse(formatter, element); // walk the DOM, and call .head() and .tail() for each node 64 | 65 | return formatter.toString(); 66 | } 67 | 68 | // the formatting rules, implemented in a breadth-first DOM traverse 69 | private static class FormattingVisitor implements NodeVisitor { 70 | private static final int maxWidth = 80; 71 | private int width = 0; 72 | private StringBuilder accum = new StringBuilder(); // holds the accumulated text 73 | 74 | // hit when 
the node is first seen 75 | @Override 76 | public void head(Node node, int depth) { 77 | String name = node.nodeName(); 78 | if (node instanceof TextNode) 79 | append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. 80 | else if (name.equals("li")) 81 | append("\n * "); 82 | else if (name.equals("dt")) 83 | append(" "); 84 | else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) 85 | append("\n"); 86 | } 87 | 88 | // hit when all of the node's children (if any) have been visited 89 | @Override 90 | public void tail(Node node, int depth) { 91 | String name = node.nodeName(); 92 | if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5")) 93 | append("\n"); 94 | else if (name.equals("a")) 95 | append(String.format(" <%s>", node.absUrl("href"))); 96 | } 97 | 98 | // appends text to the string builder with a simple word wrap method 99 | private void append(String text) { 100 | if (text.startsWith("\n")) 101 | width = 0; // reset counter if starts with a newline. 
only from formats above, not in natural text 102 | if (text.equals(" ") && 103 | (accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n"))) 104 | return; // don't accumulate long runs of empty spaces 105 | 106 | if (text.length() + width > maxWidth) { // won't fit, needs to wrap 107 | String[] words = text.split("\\s+"); 108 | for (int i = 0; i < words.length; i++) { 109 | String word = words[i]; 110 | boolean last = i == words.length - 1; 111 | if (!last) // insert a space if not the last word 112 | word = word + " "; 113 | if (word.length() + width > maxWidth) { // wrap and reset counter 114 | accum.append("\n").append(word); 115 | width = word.length(); 116 | } else { 117 | accum.append(word); 118 | width += word.length(); 119 | } 120 | } 121 | } else { // fits as is, without need to wrap text 122 | accum.append(text); 123 | width += text.length(); 124 | } 125 | } 126 | 127 | @Override 128 | public String toString() { 129 | return accum.toString(); 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/StructuralEvaluator.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.xevaluator; 2 | 3 | import org.jsoup.nodes.Element; 4 | import org.jsoup.select.Evaluator; 5 | 6 | /** 7 | * Base structural evaluator. 
8 | * Copy from {@link org.jsoup.select.StructuralEvaluator} because it is package visible 9 | * 10 | * @see org.jsoup.select.StructuralEvaluator 11 | */ 12 | abstract class StructuralEvaluator extends Evaluator { 13 | Evaluator evaluator; 14 | 15 | static class Root extends Evaluator { 16 | public boolean matches(Element root, Element element) { 17 | return root == element; 18 | } 19 | 20 | public String toString() { 21 | return ":root"; 22 | } 23 | } 24 | 25 | static class Has extends StructuralEvaluator { 26 | public Has(Evaluator evaluator) { 27 | this.evaluator = evaluator; 28 | } 29 | 30 | public boolean matches(Element root, Element element) { 31 | for (Element e : element.getAllElements()) { 32 | if (e != element && evaluator.matches(root, e)) return true; 33 | } 34 | return false; 35 | } 36 | 37 | public String toString() { 38 | return String.format(":has(%s)", evaluator); 39 | } 40 | } 41 | 42 | static class Not extends StructuralEvaluator { 43 | public Not(Evaluator evaluator) { 44 | this.evaluator = evaluator; 45 | } 46 | 47 | public boolean matches(Element root, Element node) { 48 | return !evaluator.matches(root, node); 49 | } 50 | 51 | public String toString() { 52 | return String.format(":not%s", evaluator); 53 | } 54 | } 55 | 56 | static class Parent extends StructuralEvaluator { 57 | public Parent(Evaluator evaluator) { 58 | this.evaluator = evaluator; 59 | } 60 | 61 | public boolean matches(Element root, Element element) { 62 | Element parent = element.parent(); 63 | while (parent != null) { 64 | if (evaluator.matches(root, parent)) return true; 65 | parent = parent.parent(); 66 | } 67 | return false; 68 | } 69 | 70 | public String toString() { 71 | return String.format(":parent%s", evaluator); 72 | } 73 | } 74 | 75 | static class ImmediateParent extends StructuralEvaluator { 76 | public ImmediateParent(Evaluator evaluator) { 77 | this.evaluator = evaluator; 78 | } 79 | 80 | public boolean matches(Element root, Element element) { 81 | Element 
parent = element.parent(); 82 | return parent != null && evaluator.matches(root, parent); 83 | } 84 | 85 | public String toString() { 86 | return String.format(":ImmediateParent%s", evaluator); 87 | } 88 | } 89 | 90 | static class PreviousSibling extends StructuralEvaluator { 91 | public PreviousSibling(Evaluator evaluator) { 92 | this.evaluator = evaluator; 93 | } 94 | 95 | public boolean matches(Element root, Element element) { 96 | if (root == element) return false; 97 | 98 | Element prev = element.previousElementSibling(); 99 | 100 | while (prev != null) { 101 | if (evaluator.matches(root, prev)) return true; 102 | 103 | prev = prev.previousElementSibling(); 104 | } 105 | return false; 106 | } 107 | 108 | public String toString() { 109 | return String.format(":prev*%s", evaluator); 110 | } 111 | } 112 | 113 | static class ImmediatePreviousSibling extends StructuralEvaluator { 114 | public ImmediatePreviousSibling(Evaluator evaluator) { 115 | this.evaluator = evaluator; 116 | } 117 | 118 | public boolean matches(Element root, Element element) { 119 | if (root == element) return false; 120 | 121 | Element prev = element.previousElementSibling(); 122 | return prev != null && evaluator.matches(root, prev); 123 | } 124 | 125 | public String toString() { 126 | return String.format(":prev%s", evaluator); 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/XEvaluators.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.xevaluator; 2 | 3 | import org.jsoup.nodes.Element; 4 | import org.jsoup.select.Elements; 5 | import org.jsoup.select.Evaluator; 6 | 7 | /** 8 | * Evaluators in Xsoup. 
9 | * 10 | * @author code4crafter@gmail.com 11 | */ 12 | public abstract class XEvaluators { 13 | 14 | public static class HasAnyAttribute extends Evaluator { 15 | 16 | @Override 17 | public boolean matches(Element root, Element element) { 18 | return element.attributes().size() > 0; 19 | } 20 | } 21 | 22 | public static class IsNthOfType extends Evaluator.CssNthEvaluator { 23 | public IsNthOfType(int a, int b) { 24 | super(a, b); 25 | } 26 | 27 | protected int calculatePosition(Element root, Element element) { 28 | int pos = 0; 29 | Elements family = element.parent().children(); 30 | for (int i = 0; i < family.size(); i++) { 31 | if (family.get(i).tag().equals(element.tag())) pos++; 32 | if (family.get(i) == element) break; 33 | } 34 | return pos; 35 | } 36 | 37 | @Override 38 | protected String getPseudoClass() { 39 | return "nth-of-type"; 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/us/codecraft/xsoup/xevaluator/XPathParser.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.xevaluator; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.Stack; 8 | import java.util.regex.Matcher; 9 | import java.util.regex.Pattern; 10 | import org.jsoup.helper.Validate; 11 | import org.jsoup.select.Evaluator; 12 | import org.jsoup.select.Selector; 13 | import us.codecraft.xsoup.XPathEvaluator; 14 | import us.codecraft.xsoup.XTokenQueue; 15 | 16 | /** 17 | * Parser of XPath. 
18 | * 19 | * @author code4crafter@gmail.com 20 | */ 21 | public class XPathParser { 22 | 23 | private static final String[] COMBINATORS = new String[] {"//", "/", "|"}; 24 | 25 | private static final String[] ESCAPED_QUOTES = new String[] {"\\\"", "\\'"}; 26 | 27 | private static final String[] QUOTES = new String[] {"\"", "'"}; 28 | 29 | private static final String[] HIERARCHY_COMBINATORS = new String[] {"//", "/", "|"}; 30 | 31 | private static final Map FUNCTION_MAPPING = new HashMap(); 32 | private static final String OR_COMBINATOR = "|"; 33 | 34 | static { 35 | FUNCTION_MAPPING.put("contains", new FunctionEvaluator() { 36 | @Override 37 | public Evaluator call(String... param) { 38 | Validate.isTrue(param.length == 2, String.format("Error argument of %s", "contains")); 39 | return new Evaluator.AttributeWithValueContaining(param[0], param[1]); 40 | } 41 | }); 42 | FUNCTION_MAPPING.put("starts-with", new FunctionEvaluator() { 43 | @Override 44 | public Evaluator call(String... param) { 45 | Validate.isTrue(param.length == 2, String.format("Error argument of %s", "starts-with")); 46 | return new Evaluator.AttributeWithValueStarting(param[0], param[1]); 47 | } 48 | }); 49 | FUNCTION_MAPPING.put("ends-with", new FunctionEvaluator() { 50 | @Override 51 | public Evaluator call(String... 
param) { 52 | Validate.isTrue(param.length == 2, String.format("Error argument of %s", "ends-with")); 53 | return new Evaluator.AttributeWithValueEnding(param[0], param[1]); 54 | } 55 | }); 56 | } 57 | 58 | private XTokenQueue tq; 59 | private String query; 60 | private List evals = new ArrayList(); 61 | private ElementOperator elementOperator; 62 | private boolean noEvalAllow = false; 63 | private Pattern patternForText = Pattern.compile("text\\((\\d*)\\)"); 64 | 65 | public XPathParser(String xpathStr) { 66 | this.query = xpathStr; 67 | this.tq = new XTokenQueue(xpathStr); 68 | } 69 | 70 | public static XPathEvaluator parse(String xpathStr) { 71 | XPathParser xPathParser = new XPathParser(xpathStr); 72 | return xPathParser.parse(); 73 | } 74 | 75 | public XPathEvaluator parse() { 76 | 77 | while (!tq.isEmpty()) { 78 | Validate.isFalse(noEvalAllow, "XPath error! No operator allowed after attribute or function!" + tq); 79 | if (tq.matchChomp(OR_COMBINATOR)) { 80 | tq.consumeWhitespace(); 81 | return combineXPathEvaluator(tq.remainder()); 82 | } 83 | else if (tq.matchesAny(HIERARCHY_COMBINATORS)) { 84 | combinator(tq.consumeAny(HIERARCHY_COMBINATORS)); 85 | } 86 | else { 87 | findElements(); 88 | } 89 | tq.consumeWhitespace(); 90 | } 91 | return collectXPathEvaluator(); 92 | } 93 | 94 | private XPathEvaluator combineXPathEvaluator(String subQuery) { 95 | XPathEvaluator xPathEvaluator = collectXPathEvaluator(); 96 | return new CombingXPathEvaluator(xPathEvaluator, parse(subQuery)); 97 | } 98 | 99 | private XPathEvaluator collectXPathEvaluator() { 100 | if (noEvalAllow) { 101 | return new DefaultXPathEvaluator(null, elementOperator); 102 | } 103 | 104 | if (evals.size() == 1) return new DefaultXPathEvaluator(evals.get(0), elementOperator); 105 | 106 | return new DefaultXPathEvaluator(new CombiningEvaluator.And(evals), elementOperator); 107 | } 108 | 109 | private void combinator(String combinator) { 110 | Evaluator currentEval; 111 | if (evals.size() == 0) { 112 | 
currentEval = new StructuralEvaluator.Root(); 113 | } 114 | else if (evals.size() == 1) { 115 | currentEval = evals.get(0); 116 | } 117 | else { 118 | currentEval = new CombiningEvaluator.And(evals); 119 | } 120 | evals.clear(); 121 | String subQuery = consumeSubQuery(); 122 | XPathEvaluator tmpEval = parse(subQuery); 123 | if (!(tmpEval instanceof DefaultXPathEvaluator)) { 124 | throw new IllegalArgumentException(String.format("Error XPath in %s", subQuery)); 125 | } 126 | DefaultXPathEvaluator newEval = (DefaultXPathEvaluator) tmpEval; 127 | if (newEval.getElementOperator() != null) { 128 | elementOperator = newEval.getElementOperator(); 129 | } 130 | // attribute expr does not return Evaluator 131 | if (newEval.getEvaluator() != null) { 132 | if (combinator.equals("//")) { 133 | currentEval = 134 | new CombiningEvaluator.And(newEval.getEvaluator(), new StructuralEvaluator.Parent(currentEval)); 135 | } 136 | else if (combinator.equals("/")) { 137 | currentEval = 138 | new CombiningEvaluator.And(newEval.getEvaluator(), new StructuralEvaluator.ImmediateParent(currentEval)); 139 | } 140 | } 141 | evals.add(currentEval); 142 | } 143 | 144 | private String consumeSubQuery() { 145 | StringBuilder sq = new StringBuilder(); 146 | while (!tq.isEmpty()) { 147 | tq.consumeWhitespace(); 148 | if (tq.matches("(")) { 149 | sq.append("(").append(tq.chompBalanced('(', ')')).append(")"); 150 | } 151 | else if (tq.matches("[")) { 152 | sq.append("[").append(tq.chompBalanced('[', ']')).append("]"); 153 | } 154 | else if (tq.matchesAny(ESCAPED_QUOTES)) { 155 | sq.append(tq.consumeAny(ESCAPED_QUOTES)); 156 | } 157 | else if (tq.matchesAny(QUOTES)) { 158 | sq.append(tq.chompBalancedQuotes()); 159 | } 160 | else if (tq.matchesAny(COMBINATORS)) { 161 | break; 162 | } 163 | else if (!tq.isEmpty()) { 164 | sq.append(tq.consume()); 165 | } 166 | } 167 | return sq.toString(); 168 | } 169 | 170 | private void findElements() { 171 | if (tq.matches("@")) { 172 | consumeAttribute(); 173 | } 174 
| else if (tq.matches("*")) { 175 | allElements(); 176 | } 177 | else if (tq.matchesRegex("\\w+\\(.*\\).*")) { 178 | consumeOperatorFunction(); 179 | } 180 | else if (tq.matchesWord()) { 181 | byTag(); 182 | } 183 | else if (tq.matchesRegex("\\[\\d+\\]")) { 184 | byNth(); 185 | } 186 | else if (tq.matches("[")) { 187 | evals.add(consumePredicates(tq.chompBalanced('[', ']'))); 188 | } 189 | else { 190 | // unhandled 191 | throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); 192 | } 193 | } 194 | 195 | private Evaluator consumePredicates(String queue) { 196 | XTokenQueue predicatesQueue = new XTokenQueue(queue); 197 | EvaluatorStack evaluatorStack = new EvaluatorStack(); 198 | Operation currentOperation = null; 199 | predicatesQueue.consumeWhitespace(); 200 | while (!predicatesQueue.isEmpty()) { 201 | if (predicatesQueue.matchChomp("and")) { 202 | currentOperation = Operation.AND; 203 | } 204 | else if (predicatesQueue.matchChomp("or")) { 205 | currentOperation = Operation.OR; 206 | } 207 | else { 208 | if (currentOperation == null && evaluatorStack.size() > 0) { 209 | throw new IllegalArgumentException(String.format("Need AND/OR between two predicate! 
%s", predicatesQueue.remainder())); 210 | } 211 | Evaluator evaluator; 212 | if (predicatesQueue.matches("(")) { 213 | evaluator = consumePredicates(predicatesQueue.chompBalanced('(', ')')); 214 | } 215 | else if (predicatesQueue.matches("@")) { 216 | evaluator = byAttribute(predicatesQueue); 217 | } 218 | else if (predicatesQueue.matchesRegex("\\w+.*")) { 219 | evaluator = byFunction(predicatesQueue); 220 | } 221 | else { 222 | throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, predicatesQueue.remainder()); 223 | } 224 | evaluatorStack.calc(evaluator, currentOperation); 225 | //consume operator 226 | currentOperation = null; 227 | } 228 | predicatesQueue.consumeWhitespace(); 229 | } 230 | evaluatorStack.mergeOr(); 231 | return evaluatorStack.peek(); 232 | } 233 | 234 | private Evaluator byFunction(XTokenQueue predicatesQueue) { 235 | for (Map.Entry entry : FUNCTION_MAPPING.entrySet()) { 236 | if (predicatesQueue.matchChomp(entry.getKey())) { 237 | String paramString = predicatesQueue.chompBalanced('(', ')'); 238 | List params = XTokenQueue.trimQuotes(XTokenQueue.parseFuncionParams(paramString)); 239 | 240 | if (params.get(0).startsWith("@")) { 241 | params.set(0, params.get(0).substring(1)); 242 | return entry.getValue().call(params.toArray(new String[0])); 243 | } 244 | else { 245 | return null; 246 | } 247 | } 248 | } 249 | 250 | throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, predicatesQueue.remainder()); 251 | } 252 | 253 | private void allElements() { 254 | tq.consume(); 255 | evals.add(new Evaluator.AllElements()); 256 | } 257 | 258 | private void byNth() { 259 | String nth = tq.chompBalanced('[', ']'); 260 | evals.add(new XEvaluators.IsNthOfType(0, Integer.parseInt(nth))); 261 | } 262 | 263 | private void consumeAttribute() { 264 | tq.consume("@"); 265 | elementOperator = new ElementOperator.AttributeGetter(tq.remainder()); 266 | noEvalAllow = 
true; 267 | } 268 | 269 | private void consumeOperatorFunction() { 270 | String remainder = consumeSubQuery(); 271 | if (remainder.startsWith("text(")) { 272 | functionText(remainder); 273 | } 274 | else if (remainder.startsWith("regex(")) { 275 | functionRegex(remainder); 276 | } 277 | else if (remainder.equals("allText()")) { 278 | elementOperator = new ElementOperator.AllText(); 279 | } 280 | else if (remainder.equals("tidyText()")) { 281 | elementOperator = new ElementOperator.TidyText(); 282 | } 283 | else if (remainder.equals("html()")) { 284 | elementOperator = new ElementOperator.Html(); 285 | } 286 | else if (remainder.equals("outerHtml()")) { 287 | elementOperator = new ElementOperator.OuterHtml(); 288 | } 289 | else { 290 | throw new IllegalArgumentException("Unsupported function " + remainder); 291 | } 292 | if (elementOperator != null) { 293 | noEvalAllow = true; 294 | } 295 | } 296 | 297 | private void functionRegex(String remainder) { 298 | Validate.isTrue(remainder.endsWith(")"), "Unclosed bracket for function! 
" + remainder); 299 | List params = 300 | XTokenQueue.trimQuotes(XTokenQueue.parseFuncionParams(remainder.substring("regex(".length(), remainder.length() 301 | - 1))); 302 | if (params.size() == 1) { 303 | elementOperator = new ElementOperator.Regex(params.get(0)); 304 | } 305 | else if (params.size() == 2) { 306 | if (params.get(0).startsWith("@")) { 307 | elementOperator = new ElementOperator.Regex(params.get(1), params.get(0).substring(1)); 308 | } 309 | else { 310 | elementOperator = new ElementOperator.Regex(params.get(0), null, Integer.parseInt(params.get(1))); 311 | } 312 | } 313 | else if (params.size() == 3) { 314 | elementOperator = 315 | new ElementOperator.Regex(params.get(1), params.get(0).substring(1), Integer.parseInt(params.get(2))); 316 | } 317 | else { 318 | throw new Selector.SelectorParseException("Unknown usage for regex()" + remainder); 319 | } 320 | } 321 | 322 | private void functionText(String remainder) { 323 | Matcher matcher = patternForText.matcher(remainder); 324 | if (matcher.matches()) { 325 | int attributeGroup; 326 | String group = matcher.group(1); 327 | if (group.equals("")) { 328 | attributeGroup = 0; 329 | } 330 | else { 331 | attributeGroup = Integer.parseInt(group); 332 | } 333 | elementOperator = new ElementOperator.GroupedText(attributeGroup); 334 | } 335 | } 336 | 337 | private void byTag() { 338 | String tagName = tq.consumeElementSelector(); 339 | Validate.notEmpty(tagName); 340 | 341 | // namespaces: if element name is "abc:def", selector must be "abc|def", so flip: 342 | if (tagName.contains("|")) tagName = tagName.replace("|", ":"); 343 | 344 | evals.add(new Evaluator.Tag(tagName.trim().toLowerCase())); 345 | } 346 | 347 | private Evaluator byAttribute(XTokenQueue cq) { 348 | cq.matchChomp("@"); 349 | String key = 350 | cq.consumeToAny("=", "!=", "^=", "$=", "*=", "~="); // eq, not, start, end, contain, match, (no val) 351 | Validate.notEmpty(key); 352 | cq.consumeWhitespace(); 353 | Evaluator evaluator; 354 | if 
(cq.isEmpty()) { 355 | if ("*".equals(key)) { 356 | evaluator = new XEvaluators.HasAnyAttribute(); 357 | } 358 | else { 359 | evaluator = new Evaluator.Attribute(key); 360 | } 361 | } 362 | else { 363 | if (cq.matchChomp("=")) { 364 | String value = chompEqualValue(cq); 365 | //to support select one class out of all 366 | if (key.equals("class")) { 367 | String className = XTokenQueue.trimQuotes(value); 368 | if (!className.contains(" ")) { 369 | evaluator = new Evaluator.Class(className); 370 | } 371 | else { 372 | evaluator = new Evaluator.AttributeWithValue(key, className); 373 | } 374 | } 375 | else { 376 | evaluator = new Evaluator.AttributeWithValue(key, XTokenQueue.trimQuotes(value)); 377 | } 378 | } 379 | else if (cq.matchChomp("!=")) { 380 | evaluator = new Evaluator.AttributeWithValueNot(key, XTokenQueue.trimQuotes(chompEqualValue(cq))); 381 | } 382 | 383 | else if (cq.matchChomp("^=")) { 384 | evaluator = new Evaluator.AttributeWithValueStarting(key, XTokenQueue.trimQuotes(chompEqualValue(cq))); 385 | } 386 | 387 | else if (cq.matchChomp("$=")) { 388 | evaluator = new Evaluator.AttributeWithValueEnding(key, XTokenQueue.trimQuotes(chompEqualValue(cq))); 389 | } 390 | 391 | else if (cq.matchChomp("*=")) { 392 | evaluator = 393 | new Evaluator.AttributeWithValueContaining(key, XTokenQueue.trimQuotes(chompEqualValue(cq))); 394 | } 395 | 396 | else if (cq.matchChomp("~=")) { 397 | evaluator = 398 | new Evaluator.AttributeWithValueMatching(key, Pattern.compile(XTokenQueue.trimQuotes(chompEqualValue(cq)))); 399 | } 400 | else { 401 | throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, chompEqualValue(cq)); 402 | } 403 | } 404 | return evaluator; 405 | } 406 | 407 | private String chompEqualValue(XTokenQueue cq) { 408 | String value; 409 | if (cq.matchChomp("'")) { 410 | value = cq.chompTo("'"); 411 | } 412 | else if (cq.matchChomp("\"")) { 413 | value = cq.chompTo("\""); 414 | } 415 | else if 
(cq.containsAny(" ")) { 416 | value = cq.chompTo(" "); 417 | } 418 | else { 419 | value = cq.remainder(); 420 | } 421 | return value; 422 | } 423 | 424 | enum Operation { 425 | AND, 426 | OR 427 | } 428 | 429 | interface FunctionEvaluator { 430 | Evaluator call(String... param); 431 | } 432 | 433 | /** 434 | * EvaluatorStack for logic calculate. 435 | * Priority: AND > OR, Regardless of bracket. 436 | *
437 | * Calculate AND immediately. 438 | * Store evaluator with OR, until there are two evaluator in stack, then calculate it. 439 | */ 440 | static class EvaluatorStack extends Stack { 441 | 442 | public void calc(Evaluator evaluator, Operation operation) { 443 | if (size() == 0) { 444 | push(evaluator); 445 | } 446 | else { 447 | if (operation == Operation.AND) { 448 | evaluator = new CombiningEvaluator.And(pop(), evaluator); 449 | } 450 | else { 451 | mergeOr(); 452 | } 453 | push(evaluator); 454 | } 455 | } 456 | 457 | public void mergeOr() { 458 | if (size() >= 2) { 459 | Evaluator pop1 = pop(); 460 | Evaluator pop2 = pop(); 461 | Evaluator tempEvaluator = new CombiningEvaluator.Or(pop2, pop1); 462 | push(tempEvaluator); 463 | } 464 | } 465 | } 466 | } 467 | -------------------------------------------------------------------------------- /src/test/java/us/codecraft/xsoup/XTokenQueueTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.List; 6 | 7 | import static org.assertj.core.api.Assertions.assertThat; 8 | 9 | /** 10 | * @author code4crafter@gmail.com 11 | */ 12 | public class XTokenQueueTest { 13 | 14 | @Test 15 | public void testParseFunctionParams(){ 16 | List list = XTokenQueue.parseFuncionParams("a,b,c"); 17 | assertThat(list).hasSize(3); 18 | 19 | list = XTokenQueue.parseFuncionParams("'a,b',c"); 20 | assertThat(list).hasSize(2); 21 | 22 | list = XTokenQueue.parseFuncionParams("'a,\\'b',c"); 23 | assertThat(list).hasSize(2); 24 | 25 | list = XTokenQueue.parseFuncionParams("@a,1,c"); 26 | assertThat(list).hasSize(3); 27 | 28 | } 29 | 30 | @Test 31 | public void testChompBalancedQuotes() throws Exception { 32 | XTokenQueue xTokenQueue = new XTokenQueue("\"aaaaa\""); 33 | String chomp = xTokenQueue.chompBalancedQuotes(); 34 | assertThat(chomp).isEqualTo("\"aaaaa\""); 35 | 36 | xTokenQueue = new XTokenQueue("\"aaaaa\"aabb"); 37 | 
chomp = xTokenQueue.chompBalancedQuotes(); 38 | assertThat(chomp).isEqualTo("\"aaaaa\""); 39 | 40 | xTokenQueue = new XTokenQueue("a\"aaaaa\"aabb"); 41 | chomp = xTokenQueue.chompBalancedQuotes(); 42 | assertThat(chomp).isEqualTo(""); 43 | 44 | } 45 | 46 | @Test 47 | public void testChompBalancedInQuotes() throws Exception { 48 | XTokenQueue xTokenQueue = new XTokenQueue("(\")\")"); 49 | String chomp = xTokenQueue.chompBalancedNotInQuotes('(',')'); 50 | assertThat(chomp).isEqualTo("\")\""); 51 | 52 | xTokenQueue = new XTokenQueue("(\"')\")"); 53 | chomp = xTokenQueue.chompBalancedNotInQuotes('(',')'); 54 | assertThat(chomp).isEqualTo("\"')\""); 55 | 56 | xTokenQueue = new XTokenQueue("(''')')"); 57 | chomp = xTokenQueue.chompBalancedNotInQuotes('(',')'); 58 | assertThat(chomp).isEqualTo("''')'"); 59 | 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/test/java/us/codecraft/xsoup/XsoupTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.junit.Test; 7 | import us.codecraft.xsoup.xevaluator.XPathParser; 8 | 9 | import java.util.List; 10 | 11 | import static org.assertj.core.api.Assertions.assertThat; 12 | 13 | /** 14 | * @author code4crafter@gmail.com 15 | */ 16 | public class XsoupTest { 17 | 18 | private String html = ""; 19 | 20 | private String htmlClass = "
b
"; 21 | 22 | @Test 23 | public void testSelect() { 24 | 25 | String html = "" + 26 | "
ab
"; 27 | 28 | Document document = Jsoup.parse(html); 29 | 30 | String result = Xsoup.compile("//a/@href").evaluate(document).get(); 31 | assertThat(result).isEqualTo("https://github.com"); 32 | 33 | XPathEvaluator xPathEvaluator = Xsoup.compile("//tr/td/text()"); 34 | List list = xPathEvaluator.evaluate(document).list(); 35 | assertThat(list).contains("a","b"); 36 | assertThat(xPathEvaluator.hasAttribute()).isTrue(); 37 | 38 | } 39 | 40 | @Test 41 | public void testParent() { 42 | 43 | Document document = Jsoup.parse(html); 44 | 45 | String result = Xsoup.select(document, "/html/body/div/div/a").get(); 46 | assertThat(result).isEqualTo("github.com"); 47 | 48 | result = Xsoup.select(document, "/html//div/div/a").get(); 49 | assertThat(result).isEqualTo("github.com"); 50 | 51 | result = Xsoup.select(document, "/html/div/div/a").get(); 52 | assertThat(result).isNull(); 53 | 54 | } 55 | 56 | @Test 57 | public void testByAttribute() { 58 | 59 | Document document = Jsoup.parse(html); 60 | 61 | XPathEvaluator xPathEvaluator = XPathParser.parse("//a[@href]"); 62 | assertThat(xPathEvaluator.hasAttribute()).isFalse(); 63 | XElements select = xPathEvaluator.evaluate(document); 64 | assertThat(select.get()).isEqualTo("github.com"); 65 | 66 | String result = Xsoup.select(document, "//a[@id]").get(); 67 | assertThat(result).isNull(); 68 | 69 | result = Xsoup.select(document, "//div[@id=test]").get(); 70 | String expectedDiv = "
\n" + 71 | " aaa\n" + 72 | "
\n" + 73 | " github.com\n" + 74 | "
\n" + 75 | "
"; 76 | assertThat(result).isEqualTo(expectedDiv); 77 | 78 | result = Xsoup.select(document, "//div[@id='test']").get(); 79 | assertThat(result).isEqualTo(expectedDiv); 80 | result = Xsoup.select(document, "//div[@id=\"test\"]").get(); 81 | assertThat(result).isEqualTo(expectedDiv); 82 | } 83 | 84 | @Test 85 | public void testClass() { 86 | 87 | Document document = Jsoup.parse(htmlClass); 88 | 89 | String result = Xsoup.select(document, "//div[@class=a]").get(); 90 | assertThat(result).isEqualTo("
\n" + 91 | "
\n" + 92 | " github.com\n" + 93 | "
\n" + 94 | "
"); 95 | 96 | result = Xsoup.select(document, "//div[@class=d]").get(); 97 | assertThat(result).isNull(); 98 | 99 | } 100 | 101 | @Test 102 | public void testNth() { 103 | 104 | Document document = Jsoup.parse(htmlClass); 105 | 106 | String result = Xsoup.select(document, "//body/div[1]").get(); 107 | assertThat(result).isEqualTo("
\n" + 108 | "
\n" + 109 | " github.com\n" + 110 | "
\n" + 111 | "
"); 112 | 113 | result = Xsoup.select(document, "//body/div[2]").get(); 114 | assertThat(result).isEqualTo("
\n" + 115 | " b\n" + 116 | "
"); 117 | 118 | String htmlSVG = "
12
"; 119 | result = Xsoup.select(htmlSVG, "//div/svg[1]/text()").get(); 120 | assertThat(result).isEqualTo("1"); 121 | result = Xsoup.select(htmlSVG, "//div/svg[2]/text()").get(); 122 | assertThat(result).isEqualTo("2"); 123 | } 124 | 125 | @Test 126 | public void testAttribute() { 127 | 128 | Document document = Jsoup.parse(htmlClass); 129 | 130 | String result = Xsoup.select(document, "//a/@href").get(); 131 | assertThat(result).isEqualTo("https://github.com"); 132 | 133 | result = Xsoup.select(document, "//a/text()").get(); 134 | assertThat(result).isEqualTo("github.com"); 135 | 136 | result = Xsoup.select(document, "//div[@class=a]/html()").get(); 137 | assertThat(result).isEqualTo("
\n" + 138 | " github.com\n" + 139 | "
"); 140 | 141 | } 142 | 143 | @Test 144 | public void testWildcard() { 145 | 146 | Document document = Jsoup.parse(htmlClass); 147 | 148 | String result = Xsoup.select(document, "//*[@href]/@href").get(); 149 | assertThat(result).isEqualTo("https://github.com"); 150 | 151 | result = Xsoup.select(document, "//*[@class=a]/html()").get(); 152 | assertThat(result).isEqualTo("
\n" + 153 | " github.com\n" + 154 | "
"); 155 | 156 | List list = Xsoup.select(document, "//*[@*]/html()").list(); 157 | assertThat(list.get(0)).isEqualTo("
\n" + 158 | " github.com\n" + 159 | "
"); 160 | assertThat(list.get(1)).isEqualTo("github.com"); 161 | } 162 | 163 | @Test 164 | public void testFuzzyValueMatch() { 165 | 166 | Document document = Jsoup.parse(html); 167 | 168 | String result = Xsoup.select(document, "//*[@id~=te]/text()").get(); 169 | assertThat(result).isEqualTo("aaa"); 170 | result = Xsoup.select(document, "//*[@id$=st]/text()").get(); 171 | assertThat(result).isEqualTo("aaa"); 172 | result = Xsoup.select(document, "//*[@id*=es]/text()").get(); 173 | assertThat(result).isEqualTo("aaa"); 174 | result = Xsoup.select(document, "//*[@id~='tes[t]+']/text()").get(); 175 | assertThat(result).isEqualTo("aaa"); 176 | 177 | result = Xsoup.select(document, "//*[@id~=te]/allText()").get(); 178 | assertThat(result).isEqualTo("aaa github.com"); 179 | } 180 | 181 | @Test 182 | public void testLogicOperation() { 183 | 184 | Document document = Jsoup.parse(html); 185 | 186 | String result = Xsoup.select(document, "//*[@id=te or @id=test]/text()").get(); 187 | assertThat(result).isEqualTo("aaa"); 188 | 189 | result = Xsoup.select(document, "//*[@id=test or @id=te]/text()").get(); 190 | assertThat(result).isEqualTo("aaa"); 191 | 192 | result = Xsoup.select(document, "//*[@id=te and @id=test]/text()").get(); 193 | assertThat(result).isNull(); 194 | 195 | result = Xsoup.select(document, "//*[(@id=te or @id=test) and @id=test]/text()").get(); 196 | assertThat(result).isEqualTo("aaa"); 197 | 198 | result = Xsoup.select(document, "//*[@id=te or (@id=test and @id=id)]/text()").get(); 199 | assertThat(result).isNull(); 200 | } 201 | 202 | @Test 203 | public void testRegex() { 204 | 205 | Document document = Jsoup.parse(html); 206 | 207 | String result = Xsoup.select(document, "//*[@id~=te]/regex('gi\\w+ub')").get(); 208 | assertThat(result).isEqualTo("github"); 209 | 210 | result = Xsoup.select(document, "//a/regex('@href','.*gi\\w+ub.*')").get(); 211 | assertThat(result).isEqualTo("https://github.com"); 212 | 213 | result = Xsoup.select(document, 
"//a/regex('@href','.*(gi\\w+ub).*',1").get(); 214 | assertThat(result).isEqualTo("github"); 215 | } 216 | 217 | @Test 218 | public void testContains() { 219 | 220 | Document document = Jsoup.parse(html); 221 | 222 | String result = Xsoup.select(document, "//div[contains(@id,'te')]").get(); 223 | assertThat(result).isEqualTo("
\n" + 224 | " aaa\n" + 225 | "
\n" + 226 | " github.com\n" + 227 | "
\n" + 228 | "
"); 229 | 230 | } 231 | 232 | @Test 233 | public void testCombingXPath() { 234 | 235 | String html2 = ""; 236 | 237 | Document document = Jsoup.parse(html); 238 | 239 | XPathEvaluator xPathEvaluator = XPathParser.parse("//div[@id='test']/text() | //div[@id='test2']/text()"); 240 | assertThat(xPathEvaluator.hasAttribute()).isTrue(); 241 | XElements select = xPathEvaluator.evaluate(document); 242 | assertThat(select.get()).isEqualTo("aaa"); 243 | 244 | 245 | select = Xsoup.select(html2, "//div[@id='test']/text() | //div[@id='test2']/text()"); 246 | assertThat(select.get()).isEqualTo("aa"); 247 | 248 | select = Xsoup.select(html + html2, "//div[@id='test']/text() | //div[@id='test2']/text()"); 249 | assertThat(select.list()).contains("aaa", "aa"); 250 | 251 | xPathEvaluator = XPathParser.parse("//div[@id='test'] | //div[@id='test2']"); 252 | assertThat(xPathEvaluator.hasAttribute()).isFalse(); 253 | } 254 | 255 | @Test 256 | public void testSeparatorInQuotes() { 257 | 258 | String html2 = "
/list/12345github.com
"; 259 | 260 | Document document = Jsoup.parse(html2); 261 | 262 | String result = Xsoup.select(document, "//div[@id='test2']/regex(\"/list/(\\d+)\",1)").get(); 263 | assertThat(result).isEqualTo("12345"); 264 | 265 | } 266 | 267 | @Test 268 | public void testEmptyElementEvaluator() { 269 | 270 | String html2 = "github.com"; 271 | 272 | Element element = Jsoup.parse(html2).getElementsByTag("a").get(0); 273 | 274 | String result = Xsoup.select(element, "@href").get(); 275 | assertThat(result).isEqualTo("https://github.com"); 276 | 277 | } 278 | 279 | } 280 | -------------------------------------------------------------------------------- /src/test/java/us/codecraft/xsoup/w3c/DocumentAdaptorTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.w3c; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.junit.Test; 5 | import org.w3c.dom.Document; 6 | 7 | import javax.xml.xpath.XPath; 8 | import javax.xml.xpath.XPathExpression; 9 | import javax.xml.xpath.XPathFactory; 10 | 11 | import static org.assertj.core.api.Assertions.assertThat; 12 | 13 | /** 14 | * @author code4crafer@gmail.com 15 | */ 16 | public class DocumentAdaptorTest { 17 | 18 | private String html = ""; 19 | 20 | @Test 21 | public void testDocumentAdaptor() throws Exception { 22 | Document document = new DocumentAdaptor(Jsoup.parse(html)); 23 | XPathFactory xPathfactory = XPathFactory.newInstance(); 24 | XPath target = xPathfactory.newXPath(); 25 | XPathExpression xPathExpression = target.compile("//div/a/@href"); 26 | String result = xPathExpression.evaluate(document); 27 | assertThat(result).isEqualTo("https://github.com"); 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/test/java/us/codecraft/xsoup/w3c/W3cEvaluatorTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup.w3c; 2 | 3 | import static 
org.assertj.core.api.Assertions.assertThat; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import javax.xml.xpath.XPath; 9 | import javax.xml.xpath.XPathConstants; 10 | import javax.xml.xpath.XPathExpression; 11 | import javax.xml.xpath.XPathExpressionException; 12 | import javax.xml.xpath.XPathFactory; 13 | 14 | import org.jsoup.Jsoup; 15 | import org.junit.Test; 16 | import org.w3c.dom.Node; 17 | import org.w3c.dom.NodeList; 18 | 19 | import us.codecraft.xsoup.Xsoup; 20 | 21 | /** 22 | * @author code4crafer@gmail.com 23 | */ 24 | public class W3cEvaluatorTest { 25 | 26 | private String html = ""; 27 | 28 | private String htmlClass = "
b
"; 29 | 30 | @Test 31 | public void testSelect() throws XPathExpressionException { 32 | 33 | String html = "" + 34 | "
ab
"; 35 | 36 | org.w3c.dom.Document document = Xsoup.convertDocument(Jsoup.parse(html)); 37 | 38 | assertThat(getStringValue(document, "//div/a/@href")).isEqualTo("https://github.com"); 39 | 40 | List nodeListValue = getNodeListValue(document, "//tr/td"); 41 | assertThat(nodeListValue.get(0)).isEqualTo("a"); 42 | assertThat(nodeListValue.get(1)).isEqualTo("b"); 43 | } 44 | 45 | private String getStringValue(org.w3c.dom.Document document, String expression) throws XPathExpressionException { 46 | XPathExpression xPathExpression = newXPathExpression(expression); 47 | return xPathExpression.evaluate(document); 48 | } 49 | 50 | private XPathExpression newXPathExpression(String expression) throws XPathExpressionException { 51 | XPathExpression xPathExpression; 52 | XPathFactory xPathfactory = XPathFactory.newInstance(); 53 | XPath target = xPathfactory.newXPath(); 54 | xPathExpression = target.compile(expression); 55 | return xPathExpression; 56 | } 57 | 58 | private String getNodeValue(org.w3c.dom.Document document, String expression) throws XPathExpressionException { 59 | XPathExpression xPathExpression = newXPathExpression(expression); 60 | Object evaluate = xPathExpression.evaluate(document, XPathConstants.NODE); 61 | if (evaluate == null) { 62 | return null; 63 | } 64 | Node node = (Node) evaluate; 65 | return node.getNodeValue(); 66 | } 67 | 68 | private List getNodeListValue(org.w3c.dom.Document document, String expression) throws XPathExpressionException { 69 | XPathExpression xPathExpression = newXPathExpression(expression); 70 | Object evaluate = xPathExpression.evaluate(document, XPathConstants.NODESET); 71 | if (evaluate == null) { 72 | return null; 73 | } 74 | NodeList nodeList = (NodeList) evaluate; 75 | List nodeStrings = new ArrayList(nodeList.getLength()); 76 | for (int i = 0; i < nodeList.getLength(); i++) { 77 | nodeStrings.add(nodeList.item(i).getNodeValue()); 78 | } 79 | return nodeStrings; 80 | } 81 | 82 | @Test 83 | public void testParent() throws 
XPathExpressionException { 84 | 85 | org.w3c.dom.Document document = Xsoup.convertDocument(Jsoup.parse(html)); 86 | 87 | assertThat(getNodeValue(document, "/html/body/div/div/a")).isEqualTo("github.com"); 88 | assertThat(getNodeValue(document, "/html//div/div/a")).isEqualTo("github.com"); 89 | 90 | assertThat(getNodeValue(document, "/html/div/div/a")).isNull(); 91 | 92 | } 93 | 94 | @Test 95 | public void testByAttribute() throws XPathExpressionException { 96 | 97 | org.w3c.dom.Document document = Xsoup.convertDocument(Jsoup.parse(html)); 98 | 99 | assertThat(getNodeValue(document, "//a[@href]")).isEqualTo("github.com"); 100 | 101 | assertThat(getNodeValue(document, "//a[@id]")).isNull(); 102 | 103 | String expectedDiv = "
\n" + 104 | " aaa\n" + 105 | "
\n" + 106 | " github.com\n" + 107 | "
\n" + 108 | "
"; 109 | 110 | 111 | //TODO: illegal 112 | //assertThat(getNodeValue(document,"//div[@id=test]")).isEqualTo(expectedDiv); 113 | 114 | assertThat(getNodeValue(document, "//div[@id='test']")).isEqualTo(expectedDiv); 115 | assertThat(getNodeValue(document, "//div[@id=\"test\"]")).isEqualTo(expectedDiv); 116 | } 117 | 118 | @Test 119 | public void testClass() throws XPathExpressionException { 120 | 121 | org.w3c.dom.Document document = Xsoup.convertDocument(Jsoup.parse(htmlClass)); 122 | 123 | 124 | assertThat(getNodeListValue(document,"//div[@class='a b c']").get(0)).isEqualTo("
\n" + 125 | "
\n" + 126 | " github.com\n" + 127 | "
\n" + 128 | "
"); 129 | 130 | assertThat(getNodeListValue(document, "//div[@class='b']")).isNullOrEmpty(); 131 | 132 | assertThat(getNodeListValue(document, "//div[@class='d']")).isNullOrEmpty(); 133 | 134 | 135 | } 136 | 137 | @Test 138 | public void testNth() throws XPathExpressionException { 139 | 140 | org.w3c.dom.Document document = Xsoup.convertDocument(Jsoup.parse(htmlClass)); 141 | 142 | assertThat(getNodeValue(document, "//body/div[1]")).isEqualTo("
\n" + 143 | "
\n" + 144 | " github.com\n" + 145 | "
\n" + 146 | "
"); 147 | 148 | assertThat(getNodeValue(document, "//body/div[2]")).isEqualTo("
\n" + 149 | " b\n" + 150 | "
"); 151 | 152 | String htmlSVG = "
12
"; 153 | 154 | document = Xsoup.convertDocument(Jsoup.parse(htmlSVG)); 155 | assertThat(getNodeValue(document, "//div/svg[1]")).isEqualTo("\n" + 156 | " 1\n" + 157 | ""); 158 | assertThat(getNodeValue(document, "//div/svg[2]")).isEqualTo("\n" + 159 | " 2\n" + 160 | ""); 161 | } 162 | 163 | @Test 164 | public void testAttribute() throws XPathExpressionException { 165 | 166 | org.w3c.dom.Document document = Xsoup.convertDocument(Jsoup.parse(htmlClass)); 167 | 168 | assertThat(getStringValue(document,"//a/@href")).isEqualTo("https://github.com"); 169 | 170 | //TODO: not support 171 | //assertThat(getStringValue(document,"//a/text()")).isEqualTo("github.com"); 172 | 173 | //TODO: not support 174 | //assertThat(getStringValue(document,"//div[@class=a]/html()")).isEqualTo("
\n" + 175 | // " github.com\n" + 176 | // "
"); 177 | 178 | } 179 | 180 | @Test 181 | public void testLogicOperation() throws XPathExpressionException { 182 | 183 | org.w3c.dom.Document document = Xsoup.convertDocument(Jsoup.parse(html)); 184 | 185 | String expectedDiv = "
\n" + 186 | " aaa\n" + 187 | "
\n" + 188 | " github.com\n" + 189 | "
\n" + 190 | "
"; 191 | 192 | assertThat(getNodeValue(document, "//*[@id='te' or @id='test']")).isEqualTo(expectedDiv); 193 | 194 | assertThat(getNodeValue(document, "//*[@id='te' and @id='test']")).isNullOrEmpty(); 195 | 196 | assertThat(getNodeValue(document, "//*[@id='te' and @id='test']")).isNullOrEmpty(); 197 | 198 | assertThat(getNodeValue(document,"//*[(@id='te' or @id='test') and @id='test']")).isEqualTo(expectedDiv); 199 | 200 | assertThat(getNodeValue(document,"//*[@id='te' or (@id='test' and @id='id')]")).isNull(); 201 | } 202 | 203 | @Test 204 | public void testContains() throws XPathExpressionException { 205 | 206 | org.w3c.dom.Document document = Xsoup.convertDocument(Jsoup.parse(html)); 207 | 208 | assertThat(getNodeValue(document,"//div[contains(@id,'te')]")).isEqualTo("
\n" + 209 | " aaa\n" + 210 | "
\n" + 211 | " github.com\n" + 212 | "
\n" + 213 | "
"); 214 | 215 | } 216 | 217 | @Test 218 | public void testCombingXPath() throws XPathExpressionException { 219 | 220 | String html2 = ""; 221 | 222 | String expectedDiv1 = "
\n" + 223 | " aaa\n" + 224 | "
\n" + 225 | " github.com\n" + 226 | "
\n" + 227 | "
"; 228 | 229 | String expectedDiv2 = "
\n" + 230 | " aa" + 231 | "github.com\n" + 232 | "
"; 233 | 234 | org.w3c.dom.Document document = Xsoup.convertDocument(Jsoup.parse(html)); 235 | 236 | assertThat(getNodeValue(document, "//div[@id='test'] | //div[@id='test2']")).isEqualTo(expectedDiv1); 237 | 238 | document = Xsoup.convertDocument(Jsoup.parse(html2)); 239 | 240 | assertThat(getNodeValue(document, "//div[@id='test'] | //div[@id='test2']")).isEqualTo(expectedDiv2); 241 | 242 | document = Xsoup.convertDocument(Jsoup.parse(html+html2)); 243 | 244 | assertThat(getNodeListValue(document, "//div[@id='test'] | //div[@id='test2']")).contains(expectedDiv1,expectedDiv2); 245 | 246 | } 247 | 248 | } 249 | --------------------------------------------------------------------------------