├── .gitignore ├── .classpath ├── .project ├── README ├── .settings └── org.eclipse.jdt.core.prefs └── src └── org └── htmlcleaner ├── HtmlNode.java ├── TagNodeVisitor.java ├── BaseToken.java ├── FastHtmlSerializer.java ├── ITagInfoProvider.java ├── TagToken.java ├── XPatherException.java ├── HtmlCleanerException.java ├── EndTagToken.java ├── CommentNode.java ├── ContentNode.java ├── CleanerTransformations.java ├── SimpleHtmlSerializer.java ├── SimpleXmlSerializer.java ├── CompactXmlSerializer.java ├── DoctypeToken.java ├── BrowserCompactXmlSerializer.java ├── CompactHtmlSerializer.java ├── Html5TagProvider.java ├── TagTransformation.java ├── DomSerializer.java ├── XmlSerializer.java ├── PrettyHtmlSerializer.java ├── CleanerProperties.java ├── HtmlSerializer.java ├── Serializer.java ├── TagInfo.java ├── SpecialEntity.java ├── Utils.java └── XPather.java /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | release/ 3 | -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | HtmlCleaner 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | HtmlCleaner is a project originally developed by Vladimir Nikic (http://htmlcleaner.sourceforge.net/). 2 | 3 | This version is modified by Zheng Sun. 
4 | 5 | Briefly speaking, the modifications are 6 | 7 | * Added the *final* keyword to variables where possible, to avoid memory leaks 8 | * Changed some methods for better performance 9 | * Added a new class *FastHtmlSerializer* that outputs the HTML tree non-recursively, to avoid stack overflow (especially on Android) 10 | * Other minor changes 11 | 12 | HtmlCleaner is used as the HTML parser in EasyRSS (http://easyrss.pursuer.me/). 13 | Author: Zheng Sun (http://pursuer.me). 14 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.6 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.6 12 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/HtmlNode.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | package org.htmlcleaner; 18 | 19 | /** 20 | * Marker interface denoting nodes of the document tree 21 | */ 22 | public interface HtmlNode { 23 | } 24 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/TagNodeVisitor.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | ******************************************************************************/ 16 | 17 | package org.htmlcleaner; 18 | 19 | /** 20 | * Defines action to be performed on TagNodes 21 | */ 22 | public interface TagNodeVisitor { 23 | 24 | /** 25 | * Action to be performed on single node in the tree 26 | * 27 | * @param parentNode 28 | * Parent of tagNode 29 | * @param htmlNode 30 | * node visited 31 | * @return True if tree traversal should be continued, false if it has to 32 | * stop. 33 | */ 34 | boolean visit(TagNode parentNode, HtmlNode htmlNode); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/BaseToken.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 
27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.IOException; 57 | import java.io.Writer; 58 | 59 | /** 60 | *

61 | * Base token interface. Tokens are the individual entities recognized by the HTML 62 | * parser. 63 | *

64 | */ 65 | public interface BaseToken { 66 | void serialize(Serializer serializer, Writer writer) throws IOException; 67 | } 68 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/FastHtmlSerializer.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | ******************************************************************************/ 16 | 17 | package org.htmlcleaner; 18 | 19 | import java.io.IOException; 20 | import java.io.Writer; 21 | import java.util.ArrayList; 22 | import java.util.List; 23 | import java.util.Stack; 24 | 25 | public class FastHtmlSerializer extends HtmlSerializer { 26 | public FastHtmlSerializer(final CleanerProperties props) { 27 | super(props); 28 | } 29 | 30 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException { 31 | final Stack tagStack = new Stack(); 32 | final Stack> childStack = new Stack>(); 33 | serializeOpenTag(tagNode, writer, false); 34 | if (!isMinimizedTagSyntax(tagNode)) { 35 | tagStack.push(tagNode); 36 | childStack.push(new ArrayList(tagNode.getChildren())); 37 | while (!tagStack.isEmpty()) { 38 | final TagNode tag = tagStack.peek(); 39 | final List children = childStack.peek(); 40 | if (children.isEmpty()) { 41 | tagStack.pop(); 42 | childStack.pop(); 43 | if (!isMinimizedTagSyntax(tag)) { 44 | serializeEndTag(tag, writer, false); 45 | } 46 | } else { 47 | final Object item = children.get(0); 48 | children.remove(0); 49 | if (item instanceof ContentNode) { 50 | final String content = item.toString(); 51 | writer.write(dontEscape(tag) ? 
content : escapeText(content)); 52 | } else if (item instanceof TagNode) { 53 | final TagNode currentTag = (TagNode) item; 54 | serializeOpenTag(currentTag, writer, false); 55 | tagStack.push(currentTag); 56 | childStack.push(new ArrayList(currentTag.getChildren())); 57 | } else if (item instanceof BaseToken) { 58 | ((BaseToken) item).serialize(this, writer); 59 | } 60 | } 61 | } 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/ITagInfoProvider.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 
27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | /** 57 | *

58 | * Provides a set of TagInfo instances. An instance of this interface is used as 59 | * a collection of tag definitions in the cleanup process. By implementing this 60 | * interface, the desired behaviour of the cleaner can be achieved.
61 | * In most cases the implementation will be, or will contain, a kind of Map. 62 | *

63 | */ 64 | public interface ITagInfoProvider { 65 | TagInfo getTagInfo(String tagName); 66 | } 67 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/TagToken.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 
36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | /** 57 | *

58 | * HTML tag token - descendants are start (TagNode) and end token (EndTagToken). 59 | *

60 | */ 61 | public abstract class TagToken implements BaseToken { 62 | protected String name; 63 | 64 | public TagToken() { 65 | // TODO empty method 66 | } 67 | 68 | public TagToken(final String name) { 69 | this.name = name; 70 | } 71 | 72 | public String getName() { 73 | return name; 74 | } 75 | 76 | public String toString() { 77 | return name; 78 | } 79 | 80 | abstract public void setAttribute(String attName, String attValue); 81 | } 82 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/XPatherException.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 
27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | /** 57 | *

58 | * Exception that could occur during XPather evaluation. 59 | *

60 | */ 61 | public class XPatherException extends Exception { 62 | private static final long serialVersionUID = 1L; 63 | 64 | public XPatherException() { 65 | this("Error in evaluating XPath expression!"); 66 | } 67 | 68 | public XPatherException(final String message) { 69 | super(message); 70 | } 71 | 72 | public XPatherException(final String message, final Throwable cause) { 73 | super(message, cause); 74 | } 75 | 76 | public XPatherException(final Throwable cause) { 77 | super(cause); 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/HtmlCleanerException.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 
27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | /** 57 | *

58 | * General HtmlCleaner runtime exception. 59 | *

60 | */ 61 | public class HtmlCleanerException extends RuntimeException { 62 | private static final long serialVersionUID = 1L; 63 | 64 | public HtmlCleanerException() { 65 | this("HtmlCleaner expression occureed!"); 66 | } 67 | 68 | public HtmlCleanerException(final String message) { 69 | super(message); 70 | } 71 | 72 | public HtmlCleanerException(final String message, final Throwable cause) { 73 | super(message, cause); 74 | } 75 | 76 | public HtmlCleanerException(final Throwable cause) { 77 | super(cause); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/EndTagToken.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 
27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.Writer; 57 | 58 | /** 59 | *

60 | * HTML tag end token. 61 | *

62 | */ 63 | public class EndTagToken extends TagToken { 64 | public EndTagToken() { 65 | super(); 66 | } 67 | 68 | public EndTagToken(final String name) { 69 | super(name == null ? null : name.toLowerCase()); 70 | } 71 | 72 | public void setAttribute(final String attName, final String attValue) { 73 | // do nothing - simply ignore attributes in closing tag 74 | } 75 | 76 | public void serialize(final Serializer serializer, final Writer writer) { 77 | // do nothing - simply ignore serialization 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/CommentNode.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 
27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.IOException; 57 | import java.io.Writer; 58 | 59 | /** 60 | *

61 | * HTML comment token. 62 | *

63 | */ 64 | public class CommentNode implements BaseToken, HtmlNode { 65 | final private StringBuilder content; 66 | 67 | public CommentNode(final String content) { 68 | this.content = new StringBuilder(content); 69 | } 70 | 71 | public String getCommentedContent() { 72 | return ""; 73 | } 74 | 75 | public StringBuilder getContent() { 76 | return content; 77 | } 78 | 79 | public void serialize(final Serializer serializer, final Writer writer) throws IOException { 80 | writer.write(getCommentedContent()); 81 | } 82 | 83 | public String toString() { 84 | return getCommentedContent(); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/ContentNode.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 
19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.IOException; 57 | import java.io.Writer; 58 | 59 | /** 60 | *

61 | * HTML text token. 62 | *

63 | */ 64 | public class ContentNode implements BaseToken, HtmlNode { 65 | final private StringBuilder content; 66 | 67 | public ContentNode(final char content[], final int len) { 68 | this.content = new StringBuilder(len + 16); 69 | this.content.append(content, 0, len); 70 | } 71 | 72 | public ContentNode(final String content) { 73 | this.content = new StringBuilder(content); 74 | } 75 | 76 | public StringBuilder getContent() { 77 | return content; 78 | } 79 | 80 | public void serialize(final Serializer serializer, final Writer writer) throws IOException { 81 | writer.write(content.toString()); 82 | } 83 | 84 | public String toString() { 85 | return content.toString(); 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/CleanerTransformations.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 
19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.util.HashMap; 57 | import java.util.Map; 58 | 59 | /** 60 | * Contains transformation collection. 
61 | */ 62 | public class CleanerTransformations { 63 | 64 | final private Map mappings = new HashMap(); 65 | 66 | /** 67 | * Adds specified tag transformation to the collection. 68 | * 69 | * @param tagTransformation 70 | */ 71 | public void addTransformation(final TagTransformation tagTransformation) { 72 | if (tagTransformation != null) { 73 | mappings.put(tagTransformation.getSourceTag(), tagTransformation); 74 | } 75 | } 76 | 77 | public TagTransformation getTransformation(final String tagName) { 78 | return tagName == null ? null : mappings.get(tagName.toLowerCase()); 79 | } 80 | 81 | public boolean hasTransformationForTag(final String tagName) { 82 | return tagName != null && mappings.containsKey(tagName.toLowerCase()); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/SimpleHtmlSerializer.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 
19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.IOException; 57 | import java.io.Writer; 58 | 59 | /** 60 | *

61 | * Simple HTML serializer - creates resulting HTML without indenting and/or 62 | * compacting. 63 | *

64 | */ 65 | public class SimpleHtmlSerializer extends HtmlSerializer { 66 | public SimpleHtmlSerializer(final CleanerProperties props) { 67 | super(props); 68 | } 69 | 70 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException { 71 | serializeOpenTag(tagNode, writer, false); 72 | 73 | if (!isMinimizedTagSyntax(tagNode)) { 74 | for (final Object item : tagNode.getChildren()) { 75 | if (item instanceof ContentNode) { 76 | final String content = item.toString(); 77 | writer.write(dontEscape(tagNode) ? content : escapeText(content)); 78 | } else if (item instanceof BaseToken) { 79 | ((BaseToken) item).serialize(this, writer); 80 | } 81 | } 82 | 83 | serializeEndTag(tagNode, writer, false); 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/SimpleXmlSerializer.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 
19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.IOException; 57 | import java.io.Writer; 58 | 59 | /** 60 | *

61 | * Simple XML serializer - creates resulting XML without indenting lines. 62 | *

63 | */ 64 | public class SimpleXmlSerializer extends XmlSerializer { 65 | 66 | public SimpleXmlSerializer(final CleanerProperties props) { 67 | super(props); 68 | } 69 | 70 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException { 71 | serializeOpenTag(tagNode, writer, false); 72 | 73 | if (!isMinimizedTagSyntax(tagNode)) { 74 | for (final Object item : tagNode.getChildren()) { 75 | if (item instanceof ContentNode) { 76 | final String content = item.toString(); 77 | writer.write(dontEscape(tagNode) ? content.replaceAll("]]>", "]]>") : escapeXml(content)); 78 | } else if (item instanceof BaseToken) { 79 | ((BaseToken) item).serialize(this, writer); 80 | } 81 | } 82 | 83 | serializeEndTag(tagNode, writer, false); 84 | } 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/CompactXmlSerializer.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 
19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.IOException; 57 | import java.io.Writer; 58 | import java.util.*; 59 | 60 | /** 61 | *

62 | * Compact XML serializer - creates resulting XML by stripping whitespaces. 63 | *

64 | */ 65 | public class CompactXmlSerializer extends XmlSerializer { 66 | public CompactXmlSerializer(final CleanerProperties props) { 67 | super(props); 68 | } 69 | 70 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException { 71 | serializeOpenTag(tagNode, writer, false); 72 | 73 | final List tagChildren = tagNode.getChildren(); 74 | if (!isMinimizedTagSyntax(tagNode)) { 75 | final ListIterator childrenIt = tagChildren.listIterator(); 76 | while (childrenIt.hasNext()) { 77 | final Object item = childrenIt.next(); 78 | if (item instanceof ContentNode) { 79 | final String content = item.toString().trim(); 80 | writer.write(dontEscape(tagNode) ? content.replaceAll("]]>", "]]>") : escapeXml(content)); 81 | 82 | if (childrenIt.hasNext()) { 83 | if (!Utils.isWhitespaceString(childrenIt.next())) { 84 | writer.write("\n"); 85 | } 86 | childrenIt.previous(); 87 | } 88 | } else if (item instanceof CommentNode) { 89 | final String content = ((CommentNode) item).getCommentedContent().trim(); 90 | writer.write(content); 91 | } else if (item instanceof BaseToken) { 92 | ((BaseToken) item).serialize(this, writer); 93 | } 94 | } 95 | 96 | serializeEndTag(tagNode, writer, false); 97 | } 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/DoctypeToken.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.IOException; 57 | import java.io.Writer; 58 | 59 | /** 60 | *

61 | * HTML doctype token. 62 | *

63 | */ 64 | public class DoctypeToken implements BaseToken { 65 | private static String clean(String s) { 66 | if (s != null) { 67 | s = s.replace('>', ' '); 68 | s = s.replace('<', ' '); 69 | s = s.replace('&', ' '); 70 | s = s.replace('\'', ' '); 71 | s = s.replace('\"', ' '); 72 | } 73 | 74 | return s; 75 | } 76 | 77 | final private String part1; 78 | final private String part2; 79 | final private String part3; 80 | final private String part4; 81 | 82 | public DoctypeToken(final String part1, final String part2, final String part3, final String part4) { 83 | this.part1 = part1 != null ? part1.toUpperCase() : part1; 84 | this.part2 = part2 != null ? part2.toUpperCase() : part2; 85 | this.part3 = clean(part3); 86 | this.part4 = clean(part4); 87 | } 88 | 89 | public String getContent() { 90 | String result = ""; 96 | return result; 97 | } 98 | 99 | public String getName() { 100 | return ""; 101 | } 102 | 103 | public String getPart1() { 104 | return part1; 105 | } 106 | 107 | public String getPart2() { 108 | return part2; 109 | } 110 | 111 | public String getPart3() { 112 | return part3; 113 | } 114 | 115 | public String getPart4() { 116 | return part4; 117 | } 118 | 119 | public boolean isValid() { 120 | if (part1 == null || "".equals(part1)) { 121 | return false; 122 | } 123 | 124 | if (!"public".equalsIgnoreCase(part2) && !"system".equalsIgnoreCase(part2)) { 125 | return false; 126 | } 127 | 128 | if ("system".equalsIgnoreCase(part2) && part4 != null && !"".equals(part4)) { 129 | return false; 130 | } 131 | 132 | if ("public".equalsIgnoreCase(part2) && (part4 == null || "".equals(part4))) { 133 | return false; 134 | } 135 | 136 | return true; 137 | } 138 | 139 | public void serialize(final Serializer serializer, final Writer writer) throws IOException { 140 | writer.write(getContent() + "\n"); 141 | } 142 | 143 | public String toString() { 144 | return getContent(); 145 | } 146 | } 147 | 
-------------------------------------------------------------------------------- /src/org/htmlcleaner/BrowserCompactXmlSerializer.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 
36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.Writer; 57 | import java.io.IOException; 58 | import java.util.List; 59 | import java.util.ListIterator; 60 | 61 | /** 62 | *

63 | * Browser compact XML serializer - creates resulting XML by stripping 64 | * whitespaces wherever possible, but preserving single whitespace where at 65 | * least one exists. This behaviour is well suited for web-browsers, which 66 | * usually treat multiple whitespaces as a single one, but make a difference between 67 | * single whitespace and empty text. 68 | *

*/
public class BrowserCompactXmlSerializer extends XmlSerializer {
    /**
     * @param props
     *            cleaner properties handed to the superclass constructor
     */
    public BrowserCompactXmlSerializer(final CleanerProperties props) {
        super(props);
    }

    /**
     * Serializes a tag and its children. Each text child is trimmed, and a
     * single space is written wherever the original text started or ended
     * with whitespace.
     *
     * @param tagNode
     *            node to serialize
     * @param writer
     *            destination for the markup
     * @throws IOException
     *             if the writer fails
     */
    protected void serialize(final TagNode tagNode, final Writer writer) throws IOException {
        serializeOpenTag(tagNode, writer, false);

        final List<?> tagChildren = tagNode.getChildren();
        if (!isMinimizedTagSyntax(tagNode)) {
            final ListIterator<?> childrenIt = tagChildren.listIterator();
            while (childrenIt.hasNext()) {
                final Object item = childrenIt.next();
                if (item instanceof ContentNode) {
                    final String raw = item.toString();
                    final boolean startsWithSpace = raw.length() > 0 && Character.isWhitespace(raw.charAt(0));
                    // length() > 1: a one-character string is already
                    // covered by startsWithSpace.
                    final boolean endsWithSpace = raw.length() > 1
                            && Character.isWhitespace(raw.charAt(raw.length() - 1));
                    // Unescaped text must not contain the CDATA terminator
                    // "]]>", so its '>' is written as an entity. The previous
                    // code replaced "]]>" with itself, which did nothing.
                    final String content = dontEscape(tagNode) ? raw.trim().replace("]]>", "]]&gt;")
                            : escapeXml(raw.trim());

                    if (startsWithSpace) {
                        writer.write(' ');
                    }

                    if (content.length() != 0) {
                        writer.write(content);
                        if (endsWithSpace) {
                            writer.write(' ');
                        }
                    }

                    if (childrenIt.hasNext()) {
                        // Peek at the following sibling: a line break is
                        // written unless Utils classifies that sibling as a
                        // whitespace string. previous() undoes the peek.
                        if (!Utils.isWhitespaceString(childrenIt.next())) {
                            writer.write("\n");
                        }
                        childrenIt.previous();
                    }
                } else if (item instanceof CommentNode) {
                    final String content = ((CommentNode) item).getCommentedContent().trim();
                    writer.write(content);
                } else if (item instanceof BaseToken) {
                    // Every other token serializes itself.
                    ((BaseToken) item).serialize(this, writer);
                }
            }

            serializeEndTag(tagNode, writer, false);
        }
    }

}

--------------------------------------------------------------------------------
/src/org/htmlcleaner/CompactHtmlSerializer.java:
--------------------------------------------------------------------------------
/******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.IOException; 57 | import java.io.Writer; 58 | import java.util.List; 59 | import java.util.ListIterator; 60 | 61 | /** 62 | *

63 | * Compact HTML serializer - creates resulting HTML by stripping whitespaces 64 | * wherever possible. 65 | *

66 | */ 67 | public class CompactHtmlSerializer extends HtmlSerializer { 68 | private int openPreTags = 0; 69 | 70 | public CompactHtmlSerializer(final CleanerProperties props) { 71 | super(props); 72 | } 73 | 74 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException { 75 | final boolean isPreTag = "pre".equalsIgnoreCase(tagNode.getName()); 76 | if (isPreTag) { 77 | openPreTags++; 78 | } 79 | 80 | serializeOpenTag(tagNode, writer, false); 81 | 82 | final List tagChildren = tagNode.getChildren(); 83 | if (!isMinimizedTagSyntax(tagNode)) { 84 | final ListIterator childrenIt = tagChildren.listIterator(); 85 | while (childrenIt.hasNext()) { 86 | final Object item = childrenIt.next(); 87 | if (item instanceof ContentNode) { 88 | String content = item.toString(); 89 | if (openPreTags > 0) { 90 | writer.write(content); 91 | } else { 92 | final boolean startsWithSpace = content.length() > 0 93 | && Character.isWhitespace(content.charAt(0)); 94 | final boolean endsWithSpace = content.length() > 1 95 | && Character.isWhitespace(content.charAt(content.length() - 1)); 96 | content = dontEscape(tagNode) ? 
content.trim() : escapeText(content.trim()); 97 | 98 | if (startsWithSpace) { 99 | writer.write(' '); 100 | } 101 | 102 | if (content.length() != 0) { 103 | writer.write(content); 104 | if (endsWithSpace) { 105 | writer.write(' '); 106 | } 107 | } 108 | 109 | if (childrenIt.hasNext()) { 110 | if (!Utils.isWhitespaceString(childrenIt.next())) { 111 | writer.write("\n"); 112 | } 113 | childrenIt.previous(); 114 | } 115 | } 116 | } else if (item instanceof CommentNode) { 117 | final String content = ((CommentNode) item).getCommentedContent().trim(); 118 | writer.write(content); 119 | } else if (item instanceof BaseToken) { 120 | ((BaseToken) item).serialize(this, writer); 121 | } 122 | } 123 | 124 | serializeEndTag(tagNode, writer, false); 125 | if (isPreTag) { 126 | openPreTags--; 127 | } 128 | } 129 | } 130 | 131 | } 132 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/Html5TagProvider.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 
19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */

package org.htmlcleaner;

/**
 * Tag provider that registers the HTML5 elements {@code time},
 * {@code article}, {@code section}, {@code header}, {@code footer},
 * {@code aside}, {@code video}, {@code audio} and {@code source} in addition
 * to whatever the {@link DefaultTagProvider} constructor sets up.
 */
public class Html5TagProvider extends DefaultTagProvider {
    private static final long serialVersionUID = 1L;

    /** Tags passed to {@code defineCloseBeforeCopyInsideTags} for {@code time}. */
    private static final String COPY_INSIDE_TAGS = "bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font";

    /** Tags passed to {@code defineCloseBeforeTags} for {@code time}. */
    private static final String CLOSE_BEFORE_TAGS = "address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";

    /** Lazily created shared instance; guarded by the class lock in {@link #getInstance()}. */
    private static Html5TagProvider instance;

    /**
     * @return Singleton instance of this class.
     */
    public static synchronized Html5TagProvider getInstance() {
        if (instance != null) {
            return instance;
        }
        instance = new Html5TagProvider();
        return instance;
    }

    protected Html5TagProvider() {
        super();

        final TagInfo timeInfo = new TagInfo("time", TagInfo.CONTENT_TEXT, TagInfo.BODY, false, false, false);
        timeInfo.defineCloseBeforeCopyInsideTags(COPY_INSIDE_TAGS);
        timeInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
        this.put("time", timeInfo);

        registerSectioningTag("article");
        registerSectioningTag("section");
        registerSectioningTag("header");
        registerSectioningTag("footer");
        registerSectioningTag("aside");

        registerTagClosedBefore("video", "object");
        registerTagClosedBefore("audio", "object");
        registerTagClosedBefore("source", "source");
    }

    /**
     * Registers a body-level tag accepting all content that uses the same
     * close-before lists as {@code time}, extended by {@code a} and {@code p}
     * respectively.
     */
    private void registerSectioningTag(final String name) {
        final TagInfo info = new TagInfo(name, TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false);
        info.defineCloseBeforeCopyInsideTags("a," + COPY_INSIDE_TAGS);
        info.defineCloseBeforeTags("p," + CLOSE_BEFORE_TAGS);
        this.put(name, info);
    }

    /**
     * Registers a body-level tag accepting all content whose only rule is the
     * given close-before tag list.
     */
    private void registerTagClosedBefore(final String name, final String closeBeforeTags) {
        final TagInfo info = new TagInfo(name, TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false);
        info.defineCloseBeforeTags(closeBeforeTags);
        this.put(name, info);
    }
}

-------------------------------------------------------------------------------- /src/org/htmlcleaner/TagTransformation.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 
19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 
52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.util.Map; 57 | import java.util.LinkedHashMap; 58 | 59 | /** 60 | * Describes how specified tag is transformed to another one, or is ignored 61 | * during parsing 62 | */ 63 | public class TagTransformation { 64 | 65 | private String sourceTag; 66 | private String destTag; 67 | private boolean preserveSourceAttributes; 68 | private Map attributeTransformations; 69 | 70 | /** 71 | * Creates new tag transformation from source tag to target tag specifying 72 | * whether source tag attributes are preserved. 73 | * 74 | * @param sourceTag 75 | * Name of the tag to be transformed. 76 | * @param destTag 77 | * Name of tag to which source tag is to be transformed. 78 | * @param preserveSourceAttributes 79 | * Tells whether source tag attributes are preserved in 80 | * transformation. 81 | */ 82 | public TagTransformation(final String sourceTag, final String destTag, final boolean preserveSourceAttributes) { 83 | this.sourceTag = sourceTag.toLowerCase(); 84 | if (destTag == null) { 85 | this.destTag = null; 86 | } else { 87 | this.destTag = Utils.isValidXmlIdentifier(destTag) ? destTag.toLowerCase() : sourceTag; 88 | } 89 | this.preserveSourceAttributes = preserveSourceAttributes; 90 | } 91 | 92 | /** 93 | * Creates new tag transformation from source tag to target tag preserving 94 | * all source tag attributes. 95 | * 96 | * @param sourceTag 97 | * Name of the tag to be transformed. 98 | * @param destTag 99 | * Name of tag to which source tag is to be transformed. 100 | */ 101 | public TagTransformation(final String sourceTag, final String destTag) { 102 | this(sourceTag, destTag, true); 103 | } 104 | 105 | /** 106 | * Creates new tag transformation in which specified tag will be skipped 107 | * (ignored) during parsing process. 
108 | * 109 | * @param sourceTag 110 | */ 111 | public TagTransformation(final String sourceTag) { 112 | this(sourceTag, null); 113 | } 114 | 115 | /** 116 | * Adds new attribute transformation to this tag transformation. It tells 117 | * how destination attribute will look like. Small templating mechanism is 118 | * used to describe attribute value: all names between ${ and } inside the 119 | * template are evaluated against source tag attributes. That way one can 120 | * make attribute values consist of mix of source tag attributes. 121 | * 122 | * @param targetAttName 123 | * Name of the destination attribute 124 | * @param transformationDesc 125 | * Template describing attribute value. 126 | */ 127 | public void addAttributeTransformation(final String targetAttName, final String transformationDesc) { 128 | if (attributeTransformations == null) { 129 | attributeTransformations = new LinkedHashMap(); 130 | } 131 | attributeTransformations.put(targetAttName.toLowerCase(), transformationDesc); 132 | } 133 | 134 | /** 135 | * Adds new attribute transformation in which destination attrbute will not 136 | * exists (simply removes it from list of attributes). 
137 | * 138 | * @param targetAttName 139 | */ 140 | public void addAttributeTransformation(final String targetAttName) { 141 | addAttributeTransformation(targetAttName, null); 142 | } 143 | 144 | boolean hasAttributeTransformations() { 145 | return attributeTransformations != null; 146 | } 147 | 148 | String getSourceTag() { 149 | return sourceTag; 150 | } 151 | 152 | String getDestTag() { 153 | return destTag; 154 | } 155 | 156 | boolean isPreserveSourceAttributes() { 157 | return preserveSourceAttributes; 158 | } 159 | 160 | Map getAttributeTransformations() { 161 | return attributeTransformations; 162 | } 163 | 164 | } 165 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/DomSerializer.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | ******************************************************************************/ 16 | 17 | package org.htmlcleaner; 18 | 19 | import org.w3c.dom.Comment; 20 | import org.w3c.dom.Document; 21 | import org.w3c.dom.Element; 22 | 23 | import javax.xml.parsers.DocumentBuilderFactory; 24 | import javax.xml.parsers.ParserConfigurationException; 25 | import java.util.Iterator; 26 | import java.util.List; 27 | import java.util.Map; 28 | 29 | /** 30 | *

31 | * DOM serializer - creates xml DOM. 32 | *

33 | */ 34 | public class DomSerializer { 35 | protected CleanerProperties props; 36 | protected boolean escapeXml = true; 37 | 38 | public DomSerializer(final CleanerProperties props, final boolean escapeXml) { 39 | this.props = props; 40 | this.escapeXml = escapeXml; 41 | } 42 | 43 | public DomSerializer(final CleanerProperties props) { 44 | this(props, true); 45 | } 46 | 47 | public Document createDOM(final TagNode rootNode) throws ParserConfigurationException { 48 | final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 49 | final Document document = factory.newDocumentBuilder().newDocument(); 50 | final Element rootElement = createElement(rootNode, document); 51 | document.appendChild(rootElement); 52 | setAttributes(rootNode, rootElement); 53 | createSubnodes(document, rootElement, rootNode.getChildren()); 54 | return document; 55 | } 56 | 57 | private Element createElement(final TagNode node, final Document document) { 58 | String name = node.getName(); 59 | final boolean nsAware = props.isNamespacesAware(); 60 | final String prefix = Utils.getXmlNSPrefix(name); 61 | final Map nsDeclarations = node.getNamespaceDeclarations(); 62 | String nsURI = null; 63 | if (prefix != null) { 64 | if (nsAware) { 65 | if (nsDeclarations != null) { 66 | nsURI = nsDeclarations.get(prefix); 67 | } 68 | if (nsURI == null) { 69 | nsURI = node.getNamespaceURIOnPath(prefix); 70 | } 71 | if (nsURI == null) { 72 | nsURI = prefix; 73 | } 74 | } else { 75 | name = Utils.getXmlName(name); 76 | } 77 | } else { 78 | if (nsAware) { 79 | if (nsDeclarations != null) { 80 | nsURI = nsDeclarations.get(""); 81 | } 82 | if (nsURI == null) { 83 | nsURI = node.getNamespaceURIOnPath(prefix); 84 | } 85 | } 86 | } 87 | 88 | if (nsAware && nsURI != null) { 89 | return document.createElementNS(nsURI, name); 90 | } else { 91 | return document.createElement(name); 92 | } 93 | } 94 | 95 | private void setAttributes(final TagNode node, final Element element) { 96 | for (final 
Map.Entry entry : node.getAttributes().entrySet()) { 97 | final String attrName = entry.getKey(); 98 | String attrValue = entry.getValue(); 99 | if (escapeXml) { 100 | attrValue = Utils.escapeXml(attrValue, props, true); 101 | } 102 | 103 | final String attPrefix = Utils.getXmlNSPrefix(attrName); 104 | if (attPrefix != null) { 105 | if (props.isNamespacesAware()) { 106 | String nsURI = node.getNamespaceURIOnPath(attPrefix); 107 | if (nsURI == null) { 108 | nsURI = attPrefix; 109 | } 110 | element.setAttributeNS(nsURI, attrName, attrValue); 111 | } else { 112 | element.setAttribute(Utils.getXmlName(attrName), attrValue); 113 | } 114 | } else { 115 | element.setAttribute(attrName, attrValue); 116 | } 117 | } 118 | } 119 | 120 | private void createSubnodes(final Document document, final Element element, final List tagChildren) { 121 | if (tagChildren != null) { 122 | final Iterator it = tagChildren.iterator(); 123 | while (it.hasNext()) { 124 | final Object item = it.next(); 125 | if (item instanceof CommentNode) { 126 | final CommentNode commentNode = (CommentNode) item; 127 | final Comment comment = document.createComment(commentNode.getContent().toString()); 128 | element.appendChild(comment); 129 | } else if (item instanceof ContentNode) { 130 | final String nodeName = element.getNodeName(); 131 | String content = item.toString(); 132 | final boolean specialCase = props.isUseCdataForScriptAndStyle() 133 | && ("script".equalsIgnoreCase(nodeName) || "style".equalsIgnoreCase(nodeName)); 134 | if (escapeXml && !specialCase) { 135 | content = Utils.escapeXml(content, props, true); 136 | } 137 | element.appendChild(specialCase ? 
document.createCDATASection(content) : document 138 | .createTextNode(content)); 139 | } else if (item instanceof TagNode) { 140 | final TagNode subTagNode = (TagNode) item; 141 | final Element subelement = createElement(subTagNode, document); 142 | 143 | setAttributes(subTagNode, subelement); 144 | 145 | // recursively create subnodes 146 | createSubnodes(document, subelement, subTagNode.getChildren()); 147 | 148 | element.appendChild(subelement); 149 | } else if (item instanceof List) { 150 | final List sublist = (List) item; 151 | createSubnodes(document, element, sublist); 152 | } 153 | } 154 | } 155 | } 156 | 157 | } 158 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/XmlSerializer.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 
19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.*; 57 | import java.util.*; 58 | 59 | /** 60 | *

61 | * Abstract XML serializer - contains common logic for descendants. 62 | *

63 | */ 64 | public abstract class XmlSerializer extends Serializer { 65 | 66 | protected XmlSerializer(final CleanerProperties props) { 67 | super(props); 68 | } 69 | 70 | /** 71 | * @deprecated Use writeToStream() instead. 72 | */ 73 | @Deprecated 74 | public void writeXmlToStream(final TagNode tagNode, final OutputStream out, final String charset) 75 | throws IOException { 76 | super.writeToStream(tagNode, out, charset); 77 | } 78 | 79 | /** 80 | * @deprecated Use writeToStream() instead. 81 | */ 82 | @Deprecated 83 | public void writeXmlToStream(final TagNode tagNode, final OutputStream out) throws IOException { 84 | super.writeToStream(tagNode, out); 85 | } 86 | 87 | /** 88 | * @deprecated Use writeToFile() instead. 89 | */ 90 | @Deprecated 91 | public void writeXmlToFile(final TagNode tagNode, final String fileName, final String charset) throws IOException { 92 | super.writeToFile(tagNode, fileName, charset); 93 | } 94 | 95 | /** 96 | * @deprecated Use writeToFile() instead. 97 | */ 98 | @Deprecated 99 | public void writeXmlToFile(final TagNode tagNode, final String fileName) throws IOException { 100 | super.writeToFile(tagNode, fileName); 101 | } 102 | 103 | /** 104 | * @deprecated Use getAsString() instead. 105 | */ 106 | @Deprecated 107 | public String getXmlAsString(final TagNode tagNode, final String charset) throws IOException { 108 | return super.getAsString(tagNode, charset); 109 | } 110 | 111 | /** 112 | * @deprecated Use getAsString() instead. 113 | */ 114 | @Deprecated 115 | public String getXmlAsString(final TagNode tagNode) throws IOException { 116 | return super.getAsString(tagNode); 117 | } 118 | 119 | /** 120 | * @deprecated Use write() instead. 
121 | */ 122 | @Deprecated 123 | public void writeXml(final TagNode tagNode, final Writer writer, final String charset) throws IOException { 124 | super.write(tagNode, writer, charset); 125 | } 126 | 127 | protected String escapeXml(final String xmlContent) { 128 | return Utils.escapeXml(xmlContent, props, false); 129 | } 130 | 131 | protected boolean dontEscape(final TagNode tagNode) { 132 | return props.isUseCdataForScriptAndStyle() && isScriptOrStyle(tagNode); 133 | } 134 | 135 | protected boolean isMinimizedTagSyntax(final TagNode tagNode) { 136 | final TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName()); 137 | return tagNode.getChildren().size() == 0 138 | && (props.isUseEmptyElementTags() || (tagInfo != null && tagInfo.isEmptyTag())); 139 | } 140 | 141 | protected void serializeOpenTag(final TagNode tagNode, final Writer writer, final boolean newLine) 142 | throws IOException { 143 | String tagName = tagNode.getName(); 144 | 145 | if (Utils.isEmptyString(tagName)) { 146 | return; 147 | } 148 | 149 | final boolean nsAware = props.isNamespacesAware(); 150 | 151 | Set definedNSPrefixes = null; 152 | Set additionalNSDeclNeeded = null; 153 | 154 | final String tagPrefix = Utils.getXmlNSPrefix(tagName); 155 | if (tagPrefix != null) { 156 | if (nsAware) { 157 | definedNSPrefixes = new HashSet(); 158 | tagNode.collectNamespacePrefixesOnPath(definedNSPrefixes); 159 | if (!definedNSPrefixes.contains(tagPrefix)) { 160 | additionalNSDeclNeeded = new TreeSet(); 161 | additionalNSDeclNeeded.add(tagPrefix); 162 | } 163 | } else { 164 | tagName = Utils.getXmlName(tagName); 165 | } 166 | } 167 | 168 | writer.write("<" + tagName); 169 | 170 | // write attributes 171 | for (Map.Entry entry : tagNode.getAttributes().entrySet()) { 172 | String attName = entry.getKey(); 173 | final String attPrefix = Utils.getXmlNSPrefix(attName); 174 | if (attPrefix != null) { 175 | if (nsAware) { 176 | // collect used namespace prefixes in attributes in order to 177 | // 
explicitly define 178 | // ns declaration if needed; otherwise it would be 179 | // ill-formed xml 180 | if (definedNSPrefixes == null) { 181 | definedNSPrefixes = new HashSet(); 182 | tagNode.collectNamespacePrefixesOnPath(definedNSPrefixes); 183 | } 184 | if (!definedNSPrefixes.contains(attPrefix)) { 185 | if (additionalNSDeclNeeded == null) { 186 | additionalNSDeclNeeded = new TreeSet(); 187 | } 188 | additionalNSDeclNeeded.add(attPrefix); 189 | } 190 | } else { 191 | attName = Utils.getXmlName(attName); 192 | } 193 | } 194 | writer.write(" " + attName + "=\"" + escapeXml(entry.getValue()) + "\""); 195 | } 196 | 197 | // write namespace declarations 198 | if (nsAware) { 199 | final Map nsDeclarations = tagNode.getNamespaceDeclarations(); 200 | if (nsDeclarations != null) { 201 | for (Map.Entry entry : nsDeclarations.entrySet()) { 202 | final String prefix = entry.getKey(); 203 | String att = "xmlns"; 204 | if (prefix.length() > 0) { 205 | att += ":" + prefix; 206 | } 207 | writer.write(" " + att + "=\"" + escapeXml(entry.getValue()) + "\""); 208 | } 209 | } 210 | } 211 | 212 | // write additional namespace declarations needed for this tag in order 213 | // xml to be well-formed 214 | if (additionalNSDeclNeeded != null) { 215 | for (String prefix : additionalNSDeclNeeded) { 216 | writer.write(" xmlns:" + prefix + "=\"" + prefix + "\""); 217 | } 218 | } 219 | 220 | if (isMinimizedTagSyntax(tagNode)) { 221 | writer.write(" />"); 222 | if (newLine) { 223 | writer.write("\n"); 224 | } 225 | } else if (dontEscape(tagNode)) { 226 | writer.write(">"); 229 | } 230 | } 231 | 232 | protected void serializeEndTag(final TagNode tagNode, final Writer writer, final boolean newLine) 233 | throws IOException { 234 | String tagName = tagNode.getName(); 235 | 236 | if (Utils.isEmptyString(tagName)) { 237 | return; 238 | } 239 | 240 | if (dontEscape(tagNode)) { 241 | writer.write("]]>"); 242 | } 243 | 244 | if (Utils.getXmlNSPrefix(tagName) != null && !props.isNamespacesAware()) { 
245 | tagName = Utils.getXmlName(tagName); 246 | } 247 | writer.write(""); 248 | 249 | if (newLine) { 250 | writer.write("\n"); 251 | } 252 | } 253 | 254 | } 255 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/PrettyHtmlSerializer.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 
36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.*; 57 | import java.util.*; 58 | 59 | /** 60 | *

61 | * Pretty HTML serializer - creates resulting HTML with indenting lines. 62 | *

63 | */ 64 | public class PrettyHtmlSerializer extends HtmlSerializer { 65 | private static final String DEFAULT_INDENTATION_STRING = "\t"; 66 | 67 | private String indentString = DEFAULT_INDENTATION_STRING; 68 | final private List indents = new ArrayList(); 69 | 70 | public PrettyHtmlSerializer(final CleanerProperties props) { 71 | this(props, DEFAULT_INDENTATION_STRING); 72 | } 73 | 74 | public PrettyHtmlSerializer(final CleanerProperties props, final String indentString) { 75 | super(props); 76 | this.indentString = indentString; 77 | } 78 | 79 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException { 80 | serializePrettyHtml(tagNode, writer, 0, false, true); 81 | } 82 | 83 | /** 84 | * @param level 85 | * @return Appropriate indentation for the specified depth. 86 | */ 87 | private synchronized String getIndent(final int level) { 88 | final int size = indents.size(); 89 | if (size <= level) { 90 | String prevIndent = size == 0 ? null : indents.get(size - 1); 91 | for (int i = size; i <= level; i++) { 92 | final String currIndent = prevIndent == null ? 
"" : prevIndent + indentString; 93 | indents.add(currIndent); 94 | prevIndent = currIndent; 95 | } 96 | } 97 | 98 | return indents.get(level); 99 | } 100 | 101 | private String getIndentedText(final String content, final int level) { 102 | final String indent = getIndent(level); 103 | final StringBuilder result = new StringBuilder(content.length()); 104 | final StringTokenizer tokenizer = new StringTokenizer(content, "\n\r"); 105 | 106 | while (tokenizer.hasMoreTokens()) { 107 | final String line = tokenizer.nextToken().trim(); 108 | if (!"".equals(line)) { 109 | result.append(indent).append(line).append('\n'); 110 | } 111 | } 112 | 113 | return result.toString(); 114 | } 115 | 116 | private String getSingleLineOfChildren(final List children) { 117 | final StringBuilder result = new StringBuilder(); 118 | final Iterator childrenIt = children.iterator(); 119 | boolean isFirst = true; 120 | 121 | while (childrenIt.hasNext()) { 122 | final Object child = childrenIt.next(); 123 | 124 | if (!(child instanceof ContentNode)) { 125 | return null; 126 | } else { 127 | String content = child.toString(); 128 | 129 | // if first item trims it from left 130 | if (isFirst) { 131 | content = Utils.ltrim(content); 132 | } 133 | 134 | // if last item trims it from right 135 | if (!childrenIt.hasNext()) { 136 | content = Utils.rtrim(content); 137 | } 138 | 139 | if (content.indexOf('\n') >= 0 || content.indexOf('\r') >= 0) { 140 | return null; 141 | } 142 | result.append(content); 143 | } 144 | 145 | isFirst = false; 146 | } 147 | 148 | return result.toString(); 149 | } 150 | 151 | protected void serializePrettyHtml(final TagNode tagNode, final Writer writer, final int level, 152 | final boolean isPreserveWhitespaces, final boolean isLastNewLine) throws IOException { 153 | final List tagChildren = tagNode.getChildren(); 154 | final String tagName = tagNode.getName(); 155 | final boolean isHeadlessNode = Utils.isEmptyString(tagName); 156 | final String indent = isHeadlessNode ? 
"" : getIndent(level); 157 | 158 | if (!isPreserveWhitespaces) { 159 | if (!isLastNewLine) { 160 | writer.write("\n"); 161 | } 162 | writer.write(indent); 163 | } 164 | serializeOpenTag(tagNode, writer, true); 165 | 166 | final boolean preserveWhitespaces = isPreserveWhitespaces || "pre".equalsIgnoreCase(tagName); 167 | 168 | boolean lastWasNewLine = false; 169 | 170 | if (!isMinimizedTagSyntax(tagNode)) { 171 | final String singleLine = getSingleLineOfChildren(tagChildren); 172 | final boolean dontEscape = dontEscape(tagNode); 173 | if (!preserveWhitespaces && singleLine != null) { 174 | writer.write(!dontEscape(tagNode) ? escapeText(singleLine) : singleLine); 175 | } else { 176 | final Iterator childIterator = tagChildren.iterator(); 177 | while (childIterator.hasNext()) { 178 | final Object child = childIterator.next(); 179 | if (child instanceof TagNode) { 180 | serializePrettyHtml((TagNode) child, writer, isHeadlessNode ? level : level + 1, 181 | preserveWhitespaces, lastWasNewLine); 182 | lastWasNewLine = false; 183 | } else if (child instanceof ContentNode) { 184 | final String content = dontEscape ? child.toString() : escapeText(child.toString()); 185 | if (content.length() > 0) { 186 | if (dontEscape || preserveWhitespaces) { 187 | writer.write(content); 188 | } else if (Character.isWhitespace(content.charAt(0))) { 189 | if (!lastWasNewLine) { 190 | writer.write("\n"); 191 | lastWasNewLine = false; 192 | } 193 | if (content.trim().length() > 0) { 194 | writer.write(getIndentedText(Utils.rtrim(content), isHeadlessNode ? 
level 195 | : level + 1)); 196 | } else { 197 | lastWasNewLine = true; 198 | } 199 | } else { 200 | if (content.trim().length() > 0) { 201 | writer.write(Utils.rtrim(content)); 202 | } 203 | if (!childIterator.hasNext()) { 204 | writer.write("\n"); 205 | lastWasNewLine = true; 206 | } 207 | } 208 | } 209 | } else if (child instanceof CommentNode) { 210 | if (!lastWasNewLine && !preserveWhitespaces) { 211 | writer.write("\n"); 212 | lastWasNewLine = false; 213 | } 214 | final CommentNode commentNode = (CommentNode) child; 215 | final String content = commentNode.getCommentedContent(); 216 | writer.write(dontEscape ? content 217 | : getIndentedText(content, isHeadlessNode ? level : level + 1)); 218 | } 219 | } 220 | } 221 | 222 | if (singleLine == null && !preserveWhitespaces) { 223 | if (!lastWasNewLine) { 224 | writer.write("\n"); 225 | } 226 | writer.write(indent); 227 | } 228 | 229 | serializeEndTag(tagNode, writer, false); 230 | } 231 | } 232 | 233 | } 234 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/CleanerProperties.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 
52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | /** 57 | * Properties defining cleaner's behaviour 58 | */ 59 | public class CleanerProperties { 60 | public static final String BOOL_ATT_SELF = "self"; 61 | public static final String BOOL_ATT_EMPTY = "empty"; 62 | public static final String BOOL_ATT_TRUE = "true"; 63 | 64 | private ITagInfoProvider tagInfoProvider = null; 65 | private boolean advancedXmlEscape = true; 66 | private boolean transResCharsToNCR = false; 67 | private boolean useCdataForScriptAndStyle = true; 68 | private boolean translateSpecialEntities = true; 69 | private boolean transSpecialEntitiesToNCR = false; 70 | private boolean recognizeUnicodeChars = true; 71 | private boolean omitUnknownTags = false; 72 | private boolean treatUnknownTagsAsContent = false; 73 | private boolean omitDeprecatedTags = false; 74 | private boolean treatDeprecatedTagsAsContent = false; 75 | private boolean omitComments = false; 76 | private boolean omitXmlDeclaration = false; 77 | private boolean omitDoctypeDeclaration = true; 78 | private boolean omitHtmlEnvelope = false; 79 | private boolean useEmptyElementTags = true; 80 | private boolean allowMultiWordAttributes = true; 81 | private boolean allowHtmlInsideAttributes = false; 82 | private boolean ignoreQuestAndExclam = true; 83 | private boolean namespacesAware = true; 84 | private String hyphenReplacementInComment = "="; 85 | private String booleanAttributeValues = BOOL_ATT_SELF; 86 | private String pruneTags = null; 87 | 88 | public String getBooleanAttributeValues() { 89 | return booleanAttributeValues; 90 | } 91 | 92 | public String getHyphenReplacementInComment() { 93 | return hyphenReplacementInComment; 94 | } 95 | 96 | public String getPruneTags() { 97 | return pruneTags; 98 | } 99 | 100 | public ITagInfoProvider getTagInfoProvider() { 101 | return tagInfoProvider; 102 | } 103 | 104 | public boolean isAdvancedXmlEscape() { 105 | return advancedXmlEscape; 106 | } 107 | 108 | public boolean 
isAllowHtmlInsideAttributes() { 109 | return allowHtmlInsideAttributes; 110 | } 111 | 112 | public boolean isAllowMultiWordAttributes() { 113 | return allowMultiWordAttributes; 114 | } 115 | 116 | public boolean isIgnoreQuestAndExclam() { 117 | return ignoreQuestAndExclam; 118 | } 119 | 120 | public boolean isNamespacesAware() { 121 | return namespacesAware; 122 | } 123 | 124 | public boolean isOmitComments() { 125 | return omitComments; 126 | } 127 | 128 | public boolean isOmitDeprecatedTags() { 129 | return omitDeprecatedTags; 130 | } 131 | 132 | public boolean isOmitDoctypeDeclaration() { 133 | return omitDoctypeDeclaration; 134 | } 135 | 136 | public boolean isOmitHtmlEnvelope() { 137 | return omitHtmlEnvelope; 138 | } 139 | 140 | public boolean isOmitUnknownTags() { 141 | return omitUnknownTags; 142 | } 143 | 144 | public boolean isOmitXmlDeclaration() { 145 | return omitXmlDeclaration; 146 | } 147 | 148 | public boolean isRecognizeUnicodeChars() { 149 | return recognizeUnicodeChars; 150 | } 151 | 152 | public boolean isTranslateSpecialEntities() { 153 | return translateSpecialEntities; 154 | } 155 | 156 | public boolean isTransResCharsToNCR() { 157 | return transResCharsToNCR; 158 | } 159 | 160 | public boolean isTransSpecialEntitiesToNCR() { 161 | return transSpecialEntitiesToNCR; 162 | } 163 | 164 | public boolean isTreatDeprecatedTagsAsContent() { 165 | return treatDeprecatedTagsAsContent; 166 | } 167 | 168 | public boolean isTreatUnknownTagsAsContent() { 169 | return treatUnknownTagsAsContent; 170 | } 171 | 172 | public boolean isUseCdataForScriptAndStyle() { 173 | return useCdataForScriptAndStyle; 174 | } 175 | 176 | public boolean isUseEmptyElementTags() { 177 | return useEmptyElementTags; 178 | } 179 | 180 | public void setAdvancedXmlEscape(final boolean advancedXmlEscape) { 181 | this.advancedXmlEscape = advancedXmlEscape; 182 | } 183 | 184 | public void setAllowHtmlInsideAttributes(final boolean allowHtmlInsideAttributes) { 185 | 
this.allowHtmlInsideAttributes = allowHtmlInsideAttributes; 186 | } 187 | 188 | public void setAllowMultiWordAttributes(final boolean allowMultiWordAttributes) { 189 | this.allowMultiWordAttributes = allowMultiWordAttributes; 190 | } 191 | 192 | public void setBooleanAttributeValues(final String booleanAttributeValues) { 193 | if (BOOL_ATT_SELF.equalsIgnoreCase(booleanAttributeValues) 194 | || BOOL_ATT_EMPTY.equalsIgnoreCase(booleanAttributeValues) 195 | || BOOL_ATT_TRUE.equalsIgnoreCase(booleanAttributeValues)) { 196 | this.booleanAttributeValues = booleanAttributeValues.toLowerCase(); 197 | } else { 198 | this.booleanAttributeValues = BOOL_ATT_SELF; 199 | } 200 | } 201 | 202 | public void setHyphenReplacementInComment(final String hyphenReplacementInComment) { 203 | this.hyphenReplacementInComment = hyphenReplacementInComment; 204 | } 205 | 206 | public void setIgnoreQuestAndExclam(final boolean ignoreQuestAndExclam) { 207 | this.ignoreQuestAndExclam = ignoreQuestAndExclam; 208 | } 209 | 210 | public void setNamespacesAware(final boolean namespacesAware) { 211 | this.namespacesAware = namespacesAware; 212 | } 213 | 214 | public void setOmitComments(final boolean omitComments) { 215 | this.omitComments = omitComments; 216 | } 217 | 218 | public void setOmitDeprecatedTags(final boolean omitDeprecatedTags) { 219 | this.omitDeprecatedTags = omitDeprecatedTags; 220 | } 221 | 222 | public void setOmitDoctypeDeclaration(final boolean omitDoctypeDeclaration) { 223 | this.omitDoctypeDeclaration = omitDoctypeDeclaration; 224 | } 225 | 226 | public void setOmitHtmlEnvelope(final boolean omitHtmlEnvelope) { 227 | this.omitHtmlEnvelope = omitHtmlEnvelope; 228 | } 229 | 230 | public void setOmitUnknownTags(final boolean omitUnknownTags) { 231 | this.omitUnknownTags = omitUnknownTags; 232 | } 233 | 234 | public void setOmitXmlDeclaration(final boolean omitXmlDeclaration) { 235 | this.omitXmlDeclaration = omitXmlDeclaration; 236 | } 237 | 238 | public void setPruneTags(final 
String pruneTags) { 239 | this.pruneTags = pruneTags; 240 | } 241 | 242 | public void setRecognizeUnicodeChars(final boolean recognizeUnicodeChars) { 243 | this.recognizeUnicodeChars = recognizeUnicodeChars; 244 | } 245 | 246 | public void setTagInfoProvider(final ITagInfoProvider tagInfoProvider) { 247 | this.tagInfoProvider = tagInfoProvider; 248 | } 249 | 250 | public void setTranslateSpecialEntities(final boolean translateSpecialEntities) { 251 | this.translateSpecialEntities = translateSpecialEntities; 252 | } 253 | 254 | public void setTransResCharsToNCR(final boolean transResCharsToNCR) { 255 | this.transResCharsToNCR = transResCharsToNCR; 256 | } 257 | 258 | public void setTransSpecialEntitiesToNCR(final boolean transSpecialEntitiesToNCR) { 259 | this.transSpecialEntitiesToNCR = transSpecialEntitiesToNCR; 260 | } 261 | 262 | public void setTreatDeprecatedTagsAsContent(final boolean treatDeprecatedTagsAsContent) { 263 | this.treatDeprecatedTagsAsContent = treatDeprecatedTagsAsContent; 264 | } 265 | 266 | public void setTreatUnknownTagsAsContent(final boolean treatUnknownTagsAsContent) { 267 | this.treatUnknownTagsAsContent = treatUnknownTagsAsContent; 268 | } 269 | 270 | public void setUseCdataForScriptAndStyle(final boolean useCdataForScriptAndStyle) { 271 | this.useCdataForScriptAndStyle = useCdataForScriptAndStyle; 272 | } 273 | 274 | public void setUseEmptyElementTags(final boolean useEmptyElementTags) { 275 | this.useEmptyElementTags = useEmptyElementTags; 276 | } 277 | } 278 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/HtmlSerializer.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.IOException; 57 | import java.io.Writer; 58 | import java.util.Map; 59 | 60 | /** 61 | *

62 | * Abstract HTML serializer - contains common logic for descendants. 63 | *

64 | */ 65 | public abstract class HtmlSerializer extends Serializer { 66 | protected HtmlSerializer(final CleanerProperties props) { 67 | super(props); 68 | } 69 | 70 | protected boolean isMinimizedTagSyntax(final TagNode tagNode) { 71 | final TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName()); 72 | return tagInfo != null && !tagNode.hasChildren() && tagInfo.isEmptyTag(); 73 | } 74 | 75 | protected boolean dontEscape(TagNode tagNode) { 76 | return isScriptOrStyle(tagNode); 77 | } 78 | 79 | protected String escapeText(String s) { 80 | final boolean recognizeUnicodeChars = props.isRecognizeUnicodeChars(); 81 | final boolean translateSpecialEntities = props.isTranslateSpecialEntities(); 82 | 83 | if (s != null) { 84 | final int len = s.length(); 85 | final StringBuilder result = new StringBuilder(len); 86 | 87 | for (int i = 0; i < len; i++) { 88 | char ch = s.charAt(i); 89 | 90 | if (ch == '&') { 91 | if (i < len - 2 && s.charAt(i + 1) == '#') { 92 | boolean isHex = Character.toLowerCase(s.charAt(i + 2)) == 'x'; 93 | int charIndex = i + (isHex ? 3 : 2); 94 | int radix = isHex ? 16 : 10; 95 | String unicode = ""; 96 | while (charIndex < len) { 97 | char currCh = s.charAt(charIndex); 98 | if (currCh == ';') { 99 | break; 100 | } else if (Utils.isValidInt(unicode + currCh, radix)) { 101 | unicode += currCh; 102 | charIndex++; 103 | } else { 104 | charIndex--; 105 | break; 106 | } 107 | } 108 | 109 | if (Utils.isValidInt(unicode, radix)) { 110 | char unicodeChar = (char) Integer.parseInt(unicode, radix); 111 | if (!Utils.isValidXmlChar(unicodeChar)) { 112 | i = charIndex; 113 | } else if (!Utils.isReservedXmlChar(unicodeChar)) { 114 | result.append(recognizeUnicodeChars ? String.valueOf(unicodeChar) : "&#" + unicode 115 | + ";"); 116 | i = charIndex; 117 | } else { 118 | i = charIndex; 119 | result.append("&#" + unicode + ";"); 120 | } 121 | } else { 122 | result.append(props.isTransResCharsToNCR() ? 
"&#" + (int) '&' + ";" : "&"); 123 | } 124 | } else { 125 | // get minimal following sequence required to recognize 126 | // some special entitiy 127 | String seq = s.substring(i, i + Math.min(SpecialEntity.getMaxEntityLength() + 2, len - i)); 128 | int semiIndex = seq.indexOf(';'); 129 | if (semiIndex > 0) { 130 | String entityKey = seq.substring(1, semiIndex); 131 | SpecialEntity entity = SpecialEntity.getEntity(entityKey); 132 | if (entity != null) { 133 | if (translateSpecialEntities) { 134 | result.append(props.isTransSpecialEntitiesToNCR() ? entity.getDecimalNCR() : entity 135 | .getCharacter()); 136 | } else { 137 | result.append(entity.getEscapedValue()); 138 | } 139 | 140 | i += entityKey.length() + 1; 141 | continue; 142 | } 143 | } 144 | 145 | String sub = s.substring(i); 146 | boolean isReservedSeq = false; 147 | for (int j = 0; j < Utils.RESERVED_XML_CHARS_LIST.length; j++) { 148 | final char currentChar = Utils.RESERVED_XML_CHARS_LIST[j]; 149 | seq = Utils.RESERVED_XML_CHARS[currentChar]; 150 | if (sub.startsWith(seq)) { 151 | result.append(props.isTransResCharsToNCR() ? "&#" + (int) currentChar + ";" : seq); 152 | i += seq.length() - 1; 153 | isReservedSeq = true; 154 | break; 155 | } 156 | } 157 | if (!isReservedSeq) { 158 | result.append(props.isTransResCharsToNCR() ? "&#" + (int) '&' + ";" : "&"); 159 | } 160 | } 161 | } else if (Utils.isReservedXmlChar(ch)) { 162 | result.append(props.isTransResCharsToNCR() ? 
"&#" + (int) ch + ";" : ch); 163 | } else { 164 | result.append(ch); 165 | } 166 | } 167 | 168 | return result.toString(); 169 | } 170 | 171 | return null; 172 | } 173 | 174 | protected void serializeOpenTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException { 175 | String tagName = tagNode.getName(); 176 | 177 | if (Utils.isEmptyString(tagName)) { 178 | return; 179 | } 180 | 181 | boolean nsAware = props.isNamespacesAware(); 182 | 183 | if (!nsAware && Utils.getXmlNSPrefix(tagName) != null) { 184 | tagName = Utils.getXmlName(tagName); 185 | } 186 | 187 | writer.write("<" + tagName); 188 | for (Map.Entry entry : tagNode.getAttributes().entrySet()) { 189 | String attName = entry.getKey(); 190 | if (!nsAware && Utils.getXmlNSPrefix(attName) != null) { 191 | attName = Utils.getXmlName(attName); 192 | } 193 | writer.write(" " + attName + "=\"" + escapeText(entry.getValue()) + "\""); 194 | } 195 | 196 | if (nsAware) { 197 | final Map nsDeclarations = tagNode.getNamespaceDeclarations(); 198 | if (nsDeclarations != null) { 199 | for (Map.Entry entry : nsDeclarations.entrySet()) { 200 | String prefix = entry.getKey(); 201 | String att = "xmlns"; 202 | if (prefix.length() > 0) { 203 | att += ":" + prefix; 204 | } 205 | writer.write(" " + att + "=\"" + escapeText(entry.getValue()) + "\""); 206 | } 207 | } 208 | } 209 | 210 | if (isMinimizedTagSyntax(tagNode)) { 211 | writer.write(" />"); 212 | if (newLine) { 213 | writer.write("\n"); 214 | } 215 | } else { 216 | writer.write(">"); 217 | } 218 | } 219 | 220 | protected void serializeEndTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException { 221 | String tagName = tagNode.getName(); 222 | 223 | if (Utils.isEmptyString(tagName)) { 224 | return; 225 | } 226 | 227 | if (Utils.getXmlNSPrefix(tagName) != null && !props.isNamespacesAware()) { 228 | tagName = Utils.getXmlName(tagName); 229 | } 230 | 231 | writer.write(""); 232 | if (newLine) { 233 | writer.write("\n"); 234 | } 235 | } 236 | 237 | } 
238 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/Serializer.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 
36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.*; 57 | import java.util.*; 58 | 59 | /** 60 | *

61 | * Basic abstract serializer - contains common logic for descendants (methods 62 | * writeXXX()). 63 | *

64 | */
public abstract class Serializer {

    /**
     * Used to implement serialization with missing envelope - omitting open
     * and close tags, just serialize children. It is a copy of the wrapped
     * node (attributes, children, doctype, namespace declarations) with an
     * empty name.
     */
    private static final class HeadlessTagNode extends TagNode {
        private HeadlessTagNode(final TagNode wrappedNode) {
            super("");
            getAttributes().putAll(wrappedNode.getAttributes());
            getChildren().addAll(wrappedNode.getChildren());
            setDocType(wrappedNode.getDocType());
            final Map<String, String> nsDecls = getNamespaceDeclarations();
            if (nsDecls != null) {
                final Map<String, String> wrappedNSDecls = wrappedNode.getNamespaceDeclarations();
                if (wrappedNSDecls != null) {
                    nsDecls.putAll(wrappedNSDecls);
                }
            }
        }
    }

    protected CleanerProperties props;

    protected Serializer(final CleanerProperties props) {
        this.props = props;
    }

    /**
     * Serializes the node to a string using
     * {@code HtmlCleaner.DEFAULT_CHARSET}, keeping the node's own tags.
     *
     * @param tagNode
     *            Node to serialize to string
     * @return Output as string
     * @throws IOException
     */
    public String getAsString(final TagNode tagNode) throws IOException {
        return getAsString(tagNode, false);
    }

    /**
     * Serializes the node to a string using
     * {@code HtmlCleaner.DEFAULT_CHARSET}.
     *
     * @param tagNode
     *            Node to serialize to string
     * @param omitEnvelope
     *            Tells whether to skip open and close tag of the node.
     * @return Output as string
     * @throws IOException
     */
    public String getAsString(final TagNode tagNode, final boolean omitEnvelope) throws IOException {
        return getAsString(tagNode, HtmlCleaner.DEFAULT_CHARSET, omitEnvelope);
    }

    /**
     * @param tagNode
     *            Node to serialize to string
     * @param charset
     *            Charset of the output - stands in xml declaration part
     * @return Output as string
     * @throws IOException
     */
    public String getAsString(final TagNode tagNode, final String charset) throws IOException {
        return getAsString(tagNode, charset, false);
    }

    /**
     * @param tagNode
     *            Node to serialize to string
     * @param charset
     *            Charset of the output - stands in xml declaration part
     * @param omitEnvelope
     *            Tells whether to skip open and close tag of the node.
     * @return Output as string
     * @throws IOException
     */
    public String getAsString(final TagNode tagNode, final String charset, final boolean omitEnvelope)
            throws IOException {
        final StringWriter writer = new StringWriter();
        write(tagNode, writer, charset, omitEnvelope);
        return writer.getBuffer().toString();
    }

    /**
     * @return {@code true} if the node's name is {@code script} or
     *         {@code style}, ignoring case
     */
    protected boolean isScriptOrStyle(final TagNode tagNode) {
        final String tagName = tagNode.getName();
        return "script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName);
    }

    /**
     * Writes the node itself; the XML and doctype declarations are handled by
     * {@link #write(TagNode, Writer, String, boolean)} before this is called.
     */
    protected abstract void serialize(TagNode tagNode, Writer writer) throws IOException;

    /**
     * Writes specified node using specified writer.
     *
     * @param tagNode
     *            Node to serialize.
     * @param writer
     *            Writer instance
     * @param charset
     *            Charset of the output
     * @throws IOException
     */
    public void write(final TagNode tagNode, final Writer writer, final String charset) throws IOException {
        write(tagNode, writer, charset, false);
    }

    /**
     * Writes specified node using specified writer. The output consists of
     * the XML declaration and the doctype (unless the properties omit them)
     * followed by the serialized node. The writer is wrapped in a
     * {@link BufferedWriter}, which is flushed and closed once writing
     * completes, so the supplied writer is closed as well.
     *
     * @param tagNode
     *            Node to serialize.
     * @param writer
     *            Writer instance
     * @param charset
     *            Charset of the output; written as the encoding of the XML
     *            declaration when not {@code null}
     * @param omitEnvelope
     *            Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void write(TagNode tagNode, Writer writer, final String charset, final boolean omitEnvelope)
            throws IOException {
        final TagNode node = omitEnvelope ? new HeadlessTagNode(tagNode) : tagNode;
        final Writer out = new BufferedWriter(writer);

        if (!props.isOmitXmlDeclaration()) {
            String declaration = "<?xml version=\"1.0\"";
            if (charset != null) {
                declaration += " encoding=\"" + charset + "\"";
            }
            declaration += "?>";
            out.write(declaration + "\n");
        }

        if (!props.isOmitDoctypeDeclaration()) {
            final DoctypeToken doctypeToken = node.getDocType();
            if (doctypeToken != null) {
                doctypeToken.serialize(this, out);
            }
        }

        serialize(node, out);

        out.flush();
        out.close();
    }

    /**
     * Writes specified TagNode to the file, using
     * {@code HtmlCleaner.DEFAULT_CHARSET}.
     *
     * @param tagNode
     *            Node to be written
     * @param fileName
     *            Output file name
     * @throws IOException
     */
    public void writeToFile(final TagNode tagNode, final String fileName) throws IOException {
        writeToFile(tagNode, fileName, false);
    }

    /**
     * Writes specified TagNode to the file, using
     * {@code HtmlCleaner.DEFAULT_CHARSET} and optionally omits node envelope
     * (skips open and close tags of the node).
     *
     * @param tagNode
     *            Node to be written
     * @param fileName
     *            Output file name
     * @param omitEnvelope
     *            Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToFile(final TagNode tagNode, final String fileName, final boolean omitEnvelope)
            throws IOException {
        writeToFile(tagNode, fileName, HtmlCleaner.DEFAULT_CHARSET, omitEnvelope);
    }

    /**
     * Writes specified TagNode to the file, using specified charset.
     *
     * @param tagNode
     *            Node to be written
     * @param fileName
     *            Output file name
     * @param charset
     *            Charset of the output
     * @throws IOException
     */
    public void writeToFile(final TagNode tagNode, final String fileName, final String charset) throws IOException {
        writeToFile(tagNode, fileName, charset, false);
    }

    /**
     * Writes specified TagNode to the file, using specified charset and
     * optionally omits node envelope (skips open and close tags of the node).
     * The file stream is closed even when the charset is not supported or
     * serialization fails.
     *
     * @param tagNode
     *            Node to be written
     * @param fileName
     *            Output file name
     * @param charset
     *            Charset of the output
     * @param omitEnvelope
     *            Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToFile(final TagNode tagNode, final String fileName, final String charset,
            final boolean omitEnvelope) throws IOException {
        final OutputStream out = new FileOutputStream(fileName);
        try {
            writeToStream(tagNode, out, charset, omitEnvelope);
        } finally {
            // On success the stream has already been closed through the
            // writer; closing it again has no effect.
            out.close();
        }
    }

    /**
     * Writes specified TagNode to the output stream, using
     * {@code HtmlCleaner.DEFAULT_CHARSET}.
     *
     * @param tagNode
     *            Node to be written
     * @param out
     *            Output stream
     * @throws IOException
     */
    public void writeToStream(final TagNode tagNode, final OutputStream out) throws IOException {
        writeToStream(tagNode, out, false);
    }

    /**
     * Writes specified TagNode to the output stream, using
     * {@code HtmlCleaner.DEFAULT_CHARSET} and optionally omits node envelope
     * (skips open and close tags of the node).
     *
     * @param tagNode
     *            Node to be written
     * @param out
     *            Output stream
     * @param omitEnvelope
     *            Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToStream(final TagNode tagNode, final OutputStream out, final boolean omitEnvelope)
            throws IOException {
        writeToStream(tagNode, out, HtmlCleaner.DEFAULT_CHARSET, omitEnvelope);
    }

    /**
     * Writes specified TagNode to the output stream, using specified charset.
     *
     * @param tagNode
     *            Node to be written
     * @param out
     *            Output stream
     * @param charset
     *            Charset of the output
     * @throws IOException
     */
    public void writeToStream(final TagNode tagNode, final OutputStream out, final String charset) throws IOException {
        writeToStream(tagNode, out, charset, false);
    }

    /**
     * Writes specified TagNode to the output stream, using specified charset
     * and optionally omits node envelope (skips open and close tags of the
     * node).
     *
     * @param tagNode
     *            Node to be written
     * @param out
     *            Output stream
     * @param charset
     *            Charset of the output
     * @param omitEnvelope
     *            Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToStream(final TagNode tagNode, final OutputStream out, final String charset,
            final boolean omitEnvelope) throws IOException {
        write(tagNode, new OutputStreamWriter(out, charset), charset, omitEnvelope);
    }
}
339 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/TagInfo.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011-2013 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | package org.htmlcleaner; 18 | 19 | import java.util.HashSet; 20 | import java.util.Set; 21 | import java.util.StringTokenizer; 22 | 23 | /** 24 | *

25 | * Class contains information about single HTML tag.
26 | * It also contains rules for tag balancing. For each tag, a list of dependent 27 | * tags may be defined. There are several kinds of dependencies used to reorder 28 | * tags: 29 | *

    30 | *
  • 31 | * fatal tags - required outer tag - the tag will be ignored during parsing 32 | * (will be skipped) if this fatal tag is missing. For example, most web 33 | * browsers ignore elements TD, TR, TBODY if they are not in the context of 34 | * TABLE tag.
  • 35 | *
  • 36 | * required enclosing tags - if no such tag is open, it is implicitly created. For 37 | * example, if TD is outside of TR, an open TR is created before it.
  • 38 | *
  • 39 | * forbidden tags - tags inside which this tag is not allowed to occur - for example FORM cannot 40 | * be inside another FORM and it will be ignored during cleanup.
  • 41 | *
  • 42 | * allowed children tags - for example TR allows TD and TH. If there are some 43 | * dependent allowed tags defined then the cleaner ignores other tags, treating them 44 | * as not allowed, unless they are in some other relationship with this tag.
  • 45 | *
  • 46 | * higher level tags - for example for TR higher tags are THEAD, TBODY, TFOOT.
  • 47 | *
  • 48 | * tags that must be closed and copied - for example, in 49 | * <a href="#"><div>.... tag A must be closed before 50 | * DIV but copied again inside DIV.
  • 51 | *
  • 52 | * tags that must be closed before closing this tag and copied again after - for 53 | * example, in <i><b>at</i> first</b> text 54 | * tag B must be closed before closing I, but it must be copied again after 55 | * resulting finally in sequence: 56 | * <i><b>at</b></i><b> first</b> text 57 | * .
  • 58 | *
59 | *

60 | * 61 | *

62 | * Tag TR for instance (table row) may define the following dependencies: 63 | *

    64 | *
  • fatal tag is table
  • 65 | *
  • required enclosing tag is tbody
  • 66 | *
  • allowed children tags are td,th
  • 67 | *
  • higher level tags are thead,tfoot
  • 68 | *
  • tags that must be closed before are 69 | * tr,td,th,caption,colgroup
  • 70 | *
71 | * meaning the following:
72 | *
    73 | *
  • tr must be in context of table, otherwise it 74 | * will be ignored,
  • 75 | *
  • tr may be directly inside tbody, 76 | * tfoot and thead, otherwise tbody will 77 | * be implicitly created in front of it.
  • 78 | *
  • tr can contain td and th, all 79 | * other tags and content will be pushed out of current limiting context, in the 80 | * case of html tables, in front of enclosing table tag.
  • 81 | *
  • if the previous open tag is one of tr, caption or 82 | * colgroup, it will be implicitly closed.
  • 83 | *
84 | *

85 | */ 86 | public class TagInfo { 87 | 88 | protected static final int BODY = 2; 89 | protected static final int CONTENT_ALL = 0; 90 | protected static final int CONTENT_NONE = 1; 91 | 92 | protected static final int CONTENT_TEXT = 2; 93 | protected static final int HEAD = 1; 94 | protected static final int HEAD_AND_BODY = 0; 95 | 96 | private int belongsTo = BODY; 97 | private Set childTags = new HashSet(); 98 | final private int contentType; 99 | private Set continueAfterTags = new HashSet(); 100 | private Set copyTags = new HashSet(); 101 | private boolean deprecated = false; 102 | private String fatalTag = null; 103 | private Set higherTags = new HashSet(); 104 | private boolean ignorePermitted = false; 105 | private Set mustCloseTags = new HashSet(); 106 | private String name; 107 | private Set permittedTags = new HashSet(); 108 | private String requiredParent = null; 109 | private boolean unique = false; 110 | 111 | public TagInfo(final String name, final int contentType, final int belongsTo, final boolean depricated, 112 | final boolean unique, final boolean ignorePermitted) { 113 | this.name = name; 114 | this.contentType = contentType; 115 | this.belongsTo = belongsTo; 116 | this.deprecated = depricated; 117 | this.unique = unique; 118 | this.ignorePermitted = ignorePermitted; 119 | } 120 | 121 | public boolean allowsAnything() { 122 | return CONTENT_ALL == contentType && childTags.isEmpty(); 123 | } 124 | 125 | public boolean allowsBody() { 126 | return CONTENT_NONE != contentType; 127 | } 128 | 129 | public boolean allowsItem(final BaseToken token) { 130 | if (contentType != CONTENT_NONE && token instanceof TagToken) { 131 | final TagToken tagToken = (TagToken) token; 132 | final String tagName = tagToken.getName(); 133 | if ("script".equals(tagName)) { 134 | return true; 135 | } 136 | } 137 | 138 | if (CONTENT_ALL == contentType) { 139 | if (!childTags.isEmpty()) { 140 | return token instanceof TagToken ? 
childTags.contains(((TagToken) token).getName()) : false; 141 | } else if (!permittedTags.isEmpty()) { 142 | return token instanceof TagToken ? !permittedTags.contains(((TagToken) token).getName()) : true; 143 | } 144 | return true; 145 | } else if (CONTENT_TEXT == contentType) { 146 | return !(token instanceof TagToken); 147 | } 148 | 149 | return false; 150 | } 151 | 152 | public void defineAllowedChildrenTags(final String commaSeparatedListOfTags) { 153 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); 154 | while (tokenizer.hasMoreTokens()) { 155 | final String currTag = tokenizer.nextToken(); 156 | this.childTags.add(currTag); 157 | } 158 | } 159 | 160 | public void defineCloseBeforeCopyInsideTags(final String commaSeparatedListOfTags) { 161 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); 162 | while (tokenizer.hasMoreTokens()) { 163 | final String currTag = tokenizer.nextToken(); 164 | this.copyTags.add(currTag); 165 | this.mustCloseTags.add(currTag); 166 | } 167 | } 168 | 169 | public void defineCloseBeforeTags(final String commaSeparatedListOfTags) { 170 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); 171 | while (tokenizer.hasMoreTokens()) { 172 | final String currTag = tokenizer.nextToken(); 173 | this.mustCloseTags.add(currTag); 174 | } 175 | } 176 | 177 | public void defineCloseInsideCopyAfterTags(final String commaSeparatedListOfTags) { 178 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); 179 | while (tokenizer.hasMoreTokens()) { 180 | final String currTag = tokenizer.nextToken(); 181 | this.continueAfterTags.add(currTag); 182 | } 183 | } 184 | 185 | public void defineFatalTags(final String commaSeparatedListOfTags) { 186 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); 187 | while 
(tokenizer.hasMoreTokens()) { 188 | final String currTag = tokenizer.nextToken(); 189 | this.fatalTag = currTag; 190 | this.higherTags.add(currTag); 191 | } 192 | } 193 | 194 | public void defineForbiddenTags(final String commaSeparatedListOfTags) { 195 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); 196 | while (tokenizer.hasMoreTokens()) { 197 | final String currTag = tokenizer.nextToken(); 198 | this.permittedTags.add(currTag); 199 | } 200 | } 201 | 202 | public void defineHigherLevelTags(final String commaSeparatedListOfTags) { 203 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); 204 | while (tokenizer.hasMoreTokens()) { 205 | final String currTag = tokenizer.nextToken(); 206 | this.higherTags.add(currTag); 207 | } 208 | } 209 | 210 | public void defineRequiredEnclosingTags(final String commaSeparatedListOfTags) { 211 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); 212 | while (tokenizer.hasMoreTokens()) { 213 | final String currTag = tokenizer.nextToken(); 214 | this.requiredParent = currTag; 215 | this.higherTags.add(currTag); 216 | } 217 | } 218 | 219 | public int getBelongsTo() { 220 | return belongsTo; 221 | } 222 | 223 | public Set getChildTags() { 224 | return childTags; 225 | } 226 | 227 | public int getContentType() { 228 | return contentType; 229 | } 230 | 231 | public Set getContinueAfterTags() { 232 | return continueAfterTags; 233 | } 234 | 235 | public Set getCopyTags() { 236 | return copyTags; 237 | } 238 | 239 | public String getFatalTag() { 240 | return fatalTag; 241 | } 242 | 243 | public Set getHigherTags() { 244 | return higherTags; 245 | } 246 | 247 | public Set getMustCloseTags() { 248 | return mustCloseTags; 249 | } 250 | 251 | public String getName() { 252 | return name; 253 | } 254 | 255 | public Set getPermittedTags() { 256 | return permittedTags; 257 | } 258 | 259 | public 
String getRequiredParent() { 260 | return requiredParent; 261 | } 262 | 263 | public boolean hasCopyTags() { 264 | return !copyTags.isEmpty(); 265 | } 266 | 267 | public boolean hasPermittedTags() { 268 | return !permittedTags.isEmpty(); 269 | } 270 | 271 | public boolean isContinueAfter(final String tagName) { 272 | return continueAfterTags.contains(tagName); 273 | } 274 | 275 | public boolean isCopy(final String tagName) { 276 | return copyTags.contains(tagName); 277 | } 278 | 279 | public boolean isDeprecated() { 280 | return deprecated; 281 | } 282 | 283 | public boolean isEmptyTag() { 284 | return CONTENT_NONE == contentType; 285 | } 286 | 287 | public boolean isHeadAndBodyTag() { 288 | return belongsTo == HEAD || belongsTo == HEAD_AND_BODY; 289 | } 290 | 291 | public boolean isHeadTag() { 292 | return belongsTo == HEAD; 293 | } 294 | 295 | public boolean isHigher(final String tagName) { 296 | return higherTags.contains(tagName); 297 | } 298 | 299 | public boolean isIgnorePermitted() { 300 | return ignorePermitted; 301 | } 302 | 303 | public boolean isMustCloseTag(final TagInfo tagInfo) { 304 | if (tagInfo != null) { 305 | return mustCloseTags.contains(tagInfo.getName()) || tagInfo.contentType == CONTENT_TEXT; 306 | } 307 | 308 | return false; 309 | } 310 | 311 | public boolean isUnique() { 312 | return unique; 313 | } 314 | 315 | public void setBelongsTo(final int belongsTo) { 316 | this.belongsTo = belongsTo; 317 | } 318 | 319 | public void setChildTags(final Set childTags) { 320 | this.childTags = childTags; 321 | } 322 | 323 | // other functionality 324 | 325 | public void setContinueAfterTags(final Set continueAfterTags) { 326 | this.continueAfterTags = continueAfterTags; 327 | } 328 | 329 | public void setCopyTags(final Set copyTags) { 330 | this.copyTags = copyTags; 331 | } 332 | 333 | public void setDeprecated(final boolean deprecated) { 334 | this.deprecated = deprecated; 335 | } 336 | 337 | public void setFatalTag(final String fatalTag) { 338 | 
this.fatalTag = fatalTag; 339 | } 340 | 341 | public void setHigherTags(final Set higherTags) { 342 | this.higherTags = higherTags; 343 | } 344 | 345 | public void setIgnorePermitted(final boolean ignorePermitted) { 346 | this.ignorePermitted = ignorePermitted; 347 | } 348 | 349 | public void setMustCloseTags(final Set mustCloseTags) { 350 | this.mustCloseTags = mustCloseTags; 351 | } 352 | 353 | public void setName(final String name) { 354 | this.name = name; 355 | } 356 | 357 | public void setPermittedTags(final Set permittedTags) { 358 | this.permittedTags = permittedTags; 359 | } 360 | 361 | public void setRequiredParent(final String requiredParent) { 362 | this.requiredParent = requiredParent; 363 | } 364 | 365 | public void setUnique(final boolean unique) { 366 | this.unique = unique; 367 | } 368 | } 369 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/SpecialEntity.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 
19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.util.HashMap; 57 | import java.util.Map; 58 | 59 | /** 60 | *

61 | * This class contains a map of the special entities used in HTML and their 62 | * Unicode code points. 63 | *

64 | */ 65 | final public class SpecialEntity { 66 | 67 | private static Map entities = new HashMap(); 68 | 69 | private static int maxEntityLength = 0; 70 | 71 | static { 72 | addEntity("nbsp", 160); 73 | addEntity("iexcl", 161); 74 | addEntity("cent", 162); 75 | addEntity("pound", 163); 76 | addEntity("curren", 164); 77 | addEntity("yen", 165); 78 | addEntity("brvbar", 166); 79 | addEntity("sect", 167); 80 | addEntity("uml", 168); 81 | addEntity("copy", 169); 82 | addEntity("ordf", 170); 83 | addEntity("laquo", 171); 84 | addEntity("not", 172); 85 | addEntity("shy", 173); 86 | addEntity("reg", 174); 87 | addEntity("macr", 175); 88 | addEntity("deg", 176); 89 | addEntity("plusmn", 177); 90 | addEntity("sup2", 178); 91 | addEntity("sup3", 179); 92 | addEntity("acute", 180); 93 | addEntity("micro", 181); 94 | addEntity("para", 182); 95 | addEntity("middot", 183); 96 | addEntity("cedil", 184); 97 | addEntity("sup1", 185); 98 | addEntity("ordm", 186); 99 | addEntity("raquo", 187); 100 | addEntity("frac14", 188); 101 | addEntity("frac12", 189); 102 | addEntity("frac34", 190); 103 | addEntity("iquest", 191); 104 | addEntity("Agrave", 192); 105 | addEntity("Aacute", 193); 106 | addEntity("Acirc", 194); 107 | addEntity("Atilde", 195); 108 | addEntity("Auml", 196); 109 | addEntity("Aring", 197); 110 | addEntity("AElig", 198); 111 | addEntity("Ccedil", 199); 112 | addEntity("Egrave", 200); 113 | addEntity("Eacute", 201); 114 | addEntity("Ecirc", 202); 115 | addEntity("Euml", 203); 116 | addEntity("Igrave", 204); 117 | addEntity("Iacute", 205); 118 | addEntity("Icirc", 206); 119 | addEntity("Iuml", 207); 120 | addEntity("ETH", 208); 121 | addEntity("Ntilde", 209); 122 | addEntity("Ograve", 210); 123 | addEntity("Oacute", 211); 124 | addEntity("Ocirc", 212); 125 | addEntity("Otilde", 213); 126 | addEntity("Ouml", 214); 127 | addEntity("times", 215); 128 | addEntity("Oslash", 216); 129 | addEntity("Ugrave", 217); 130 | addEntity("Uacute", 218); 131 | addEntity("Ucirc", 219); 
132 | addEntity("Uuml", 220); 133 | addEntity("Yacute", 221); 134 | addEntity("THORN", 222); 135 | addEntity("szlig", 223); 136 | addEntity("agrave", 224); 137 | addEntity("aacute", 225); 138 | addEntity("acirc", 226); 139 | addEntity("atilde", 227); 140 | addEntity("auml", 228); 141 | addEntity("aring", 229); 142 | addEntity("aelig", 230); 143 | addEntity("ccedil", 231); 144 | addEntity("egrave", 232); 145 | addEntity("eacute", 233); 146 | addEntity("ecirc", 234); 147 | addEntity("euml", 235); 148 | addEntity("igrave", 236); 149 | addEntity("iacute", 237); 150 | addEntity("icirc", 238); 151 | addEntity("iuml", 239); 152 | addEntity("eth", 240); 153 | addEntity("ntilde", 241); 154 | addEntity("ograve", 242); 155 | addEntity("oacute", 243); 156 | addEntity("ocirc", 244); 157 | addEntity("otilde", 245); 158 | addEntity("ouml", 246); 159 | addEntity("divide", 247); 160 | addEntity("oslash", 248); 161 | addEntity("ugrave", 249); 162 | addEntity("uacute", 250); 163 | addEntity("ucirc", 251); 164 | addEntity("uuml", 252); 165 | addEntity("yacute", 253); 166 | addEntity("thorn", 254); 167 | addEntity("yuml", 255); 168 | addEntity("OElig", 338); 169 | addEntity("oelig", 339); 170 | addEntity("Scaron", 352); 171 | addEntity("scaron", 353); 172 | addEntity("Yuml", 376); 173 | addEntity("fnof", 402); 174 | addEntity("circ", 710); 175 | addEntity("tilde", 732); 176 | 177 | // Greek letters 178 | addEntity("Alpha", 913); 179 | addEntity("Beta", 914); 180 | addEntity("Gamma", 915); 181 | addEntity("Delta", 916); 182 | addEntity("Epsilon", 917); 183 | addEntity("Zeta", 918); 184 | addEntity("Eta", 919); 185 | addEntity("Theta", 920); 186 | addEntity("Iota", 921); 187 | addEntity("Kappa", 922); 188 | addEntity("Lambda", 923); 189 | addEntity("Mu", 924); 190 | addEntity("Nu", 925); 191 | addEntity("Xi", 926); 192 | addEntity("Omicron", 927); 193 | addEntity("Pi", 928); 194 | addEntity("Rho", 929); 195 | addEntity("Sigma", 931); 196 | addEntity("Tau", 932); 197 | 
addEntity("Upsilon", 933); 198 | addEntity("Phi", 934); 199 | addEntity("Chi", 935); 200 | addEntity("Psi", 936); 201 | addEntity("Omega", 937); 202 | addEntity("alpha", 945); 203 | addEntity("beta", 946); 204 | addEntity("gamma", 947); 205 | addEntity("delta", 948); 206 | addEntity("epsilon", 949); 207 | addEntity("zeta", 950); 208 | addEntity("eta", 951); 209 | addEntity("theta", 952); 210 | addEntity("iota", 953); 211 | addEntity("kappa", 954); 212 | addEntity("lambda", 955); 213 | addEntity("mu", 956); 214 | addEntity("nu", 957); 215 | addEntity("xi", 958); 216 | addEntity("omicron", 959); 217 | addEntity("pi", 960); 218 | addEntity("rho", 961); 219 | addEntity("sigmaf", 962); 220 | addEntity("sigma", 963); 221 | addEntity("tau", 964); 222 | addEntity("upsilon", 965); 223 | addEntity("phi", 966); 224 | addEntity("chi", 967); 225 | addEntity("psi", 968); 226 | addEntity("omega", 969); 227 | addEntity("thetasym", 977); 228 | addEntity("upsih", 978); 229 | addEntity("piv", 982); 230 | 231 | addEntity("ensp", 8194); 232 | addEntity("emsp", 8195); 233 | addEntity("thinsp", 8201); 234 | addEntity("zwnj", 8204); 235 | addEntity("zwj", 8205); 236 | addEntity("lrm", 8206); 237 | addEntity("rlm", 8207); 238 | addEntity("ndash", 8211); 239 | addEntity("mdash", 8212); 240 | addEntity("lsquo", 8216); 241 | addEntity("rsquo", 8217); 242 | addEntity("sbquo", 8218); 243 | addEntity("ldquo", 8220); 244 | addEntity("rdquo", 8221); 245 | addEntity("bdquo", 8222); 246 | addEntity("dagger", 8224); 247 | addEntity("Dagger", 8225); 248 | addEntity("bull", 8226); 249 | 250 | addEntity("hellip", 8230); 251 | addEntity("permil", 8240); 252 | addEntity("prime", 8242); 253 | addEntity("Prime", 8243); 254 | addEntity("lsaquo", 8249); 255 | addEntity("rsaquo", 8250); 256 | addEntity("oline", 8254); 257 | addEntity("frasl", 8260); 258 | addEntity("euro", 8364); 259 | addEntity("image", 8465); 260 | addEntity("weierp", 8472); 261 | addEntity("real", 8476); 262 | addEntity("trade", 8482); 263 
| addEntity("alefsym", 8501); 264 | addEntity("larr", 8592); 265 | addEntity("uarr", 8593); 266 | addEntity("rarr", 8594); 267 | addEntity("darr", 8595); 268 | addEntity("harr", 8596); 269 | addEntity("crarr", 8629); 270 | addEntity("lArr", 8656); 271 | addEntity("uArr", 8657); 272 | addEntity("rArr", 8658); 273 | addEntity("dArr", 8659); 274 | addEntity("hArr", 8660); 275 | 276 | // math symbols 277 | addEntity("forall", 8704); 278 | addEntity("part", 8706); 279 | addEntity("exist", 8707); 280 | addEntity("empty", 8709); 281 | addEntity("nabla", 8711); 282 | addEntity("isin", 8712); 283 | addEntity("notin", 8713); 284 | addEntity("ni", 8715); 285 | addEntity("prod", 8719); 286 | addEntity("sum", 8721); 287 | addEntity("minus", 8722); 288 | addEntity("lowast", 8727); 289 | addEntity("radic", 8730); 290 | addEntity("prop", 8733); 291 | addEntity("infin", 8734); 292 | addEntity("ang", 8736); 293 | addEntity("and", 8743); 294 | addEntity("or", 8744); 295 | addEntity("cap", 8745); 296 | addEntity("cup", 8746); 297 | addEntity("int", 8747); 298 | addEntity("there4", 8756); 299 | addEntity("sim", 8764); 300 | addEntity("cong", 8773); 301 | addEntity("asymp", 8776); 302 | addEntity("ne", 8800); 303 | addEntity("equiv", 8801); 304 | addEntity("le", 8804); 305 | addEntity("ge", 8805); 306 | addEntity("sub", 8834); 307 | addEntity("sup", 8835); 308 | addEntity("nsub", 8836); 309 | addEntity("sube", 8838); 310 | addEntity("supe", 8839); 311 | addEntity("oplus", 8853); 312 | addEntity("otimes", 8855); 313 | addEntity("perp", 8869); 314 | addEntity("sdot", 8901); 315 | addEntity("lceil", 8968); 316 | addEntity("rceil", 8969); 317 | addEntity("lfloor", 8970); 318 | addEntity("rfloor", 8971); 319 | addEntity("lang", 9001); 320 | addEntity("rang", 9002); 321 | addEntity("loz", 9674); 322 | addEntity("spades", 9824); 323 | addEntity("clubs", 9827); 324 | addEntity("hearts", 9829); 325 | addEntity("diams", 9830); 326 | } 327 | 328 | /** 329 | * Add new entity to the set. 
330 | * 331 | * @param entityName 332 | * Entity name, for example "pound" 333 | * @param intCode 334 | * Unicode of the entity, for example 163 335 | * 336 | * @throws org.htmlcleaner.HtmlCleanerException 337 | */ 338 | public static void addEntity(final String entityName, final int intCode) throws HtmlCleanerException { 339 | if (entities.containsKey(entityName)) { 340 | throw new HtmlCleanerException("Entity \"" + entityName + "\" is already defined!"); 341 | } 342 | entities.put(entityName, new SpecialEntity(entityName, intCode)); 343 | final int entityNameLen = entityName.length(); 344 | if (entityNameLen > maxEntityLength) { 345 | maxEntityLength = entityNameLen; 346 | } 347 | } 348 | 349 | public static SpecialEntity getEntity(final String key) { 350 | return entities.get(key); 351 | } 352 | 353 | public static int getMaxEntityLength() { 354 | return maxEntityLength; 355 | } 356 | 357 | final private String key; 358 | final private int intCode; 359 | 360 | private SpecialEntity(final String key, final int intCode) { 361 | this.key = key; 362 | this.intCode = intCode; 363 | } 364 | 365 | public char getCharacter() { 366 | return (char) intCode; 367 | } 368 | 369 | /** 370 | * @return Numeric Character Reference in decimal format 371 | */ 372 | public String getDecimalNCR() { 373 | return "&#" + intCode + ";"; 374 | } 375 | 376 | /** 377 | * @return Escaped value of the entity 378 | */ 379 | public String getEscapedValue() { 380 | return "&" + key + ";"; 381 | } 382 | 383 | /** 384 | * @return Numeric Character Reference in hex format 385 | */ 386 | public String getHexNCR() { 387 | return "&#x" + Integer.toHexString(intCode) + ";"; 388 | } 389 | 390 | public int getIntCode() { 391 | return intCode; 392 | } 393 | 394 | public String getKey() { 395 | return key; 396 | } 397 | 398 | } 399 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/Utils.java: 
-------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.io.*; 57 | import java.net.URL; 58 | import java.nio.charset.Charset; 59 | import java.util.Map; 60 | import java.util.StringTokenizer; 61 | import java.util.regex.Matcher; 62 | import java.util.regex.Pattern; 63 | 64 | /** 65 | *

66 | * Common utilities. 67 | *

68 | */ 69 | final public class Utils { 70 | final public static int RESERVED_XML_CHARS_SIZE = 128; 71 | final public static String VAR_START = "${"; 72 | final public static String VAR_END = "}"; 73 | 74 | public static final String RESERVED_XML_CHARS[] = new String[RESERVED_XML_CHARS_SIZE]; 75 | public static final char RESERVED_XML_CHARS_LIST[] = { '&', '<', '>', '\"', '\'' }; 76 | 77 | static { 78 | RESERVED_XML_CHARS['&'] = "&"; 79 | RESERVED_XML_CHARS['<'] = "<"; 80 | RESERVED_XML_CHARS['>'] = ">"; 81 | RESERVED_XML_CHARS['\"'] = """; 82 | RESERVED_XML_CHARS['\''] = "'"; 83 | } 84 | 85 | /** 86 | * Trims specified string from left. 87 | * 88 | * @param s 89 | */ 90 | public static String ltrim(final String s) { 91 | if (s == null) { 92 | return null; 93 | } 94 | 95 | int index = 0; 96 | final int len = s.length(); 97 | 98 | while (index < len && Character.isWhitespace(s.charAt(index))) { 99 | index++; 100 | } 101 | 102 | return (index >= len) ? "" : s.substring(index); 103 | } 104 | 105 | /** 106 | * Trims specified string from right. 107 | * 108 | * @param s 109 | */ 110 | public static String rtrim(final String s) { 111 | if (s == null) { 112 | return null; 113 | } 114 | 115 | final int len = s.length(); 116 | int index = len; 117 | 118 | while (index > 0 && Character.isWhitespace(s.charAt(index - 1))) { 119 | index--; 120 | } 121 | 122 | return (index <= 0) ? 
"" : s.substring(0, index); 123 | } 124 | 125 | public static String getCharsetFromContentTypeString(final String contentType) { 126 | if (contentType != null) { 127 | final String pattern = "charset=([a-z\\d\\-]*)"; 128 | final Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(contentType); 129 | if (matcher.find()) { 130 | final String charset = matcher.group(1); 131 | if (Charset.isSupported(charset)) { 132 | return charset; 133 | } 134 | } 135 | } 136 | 137 | return null; 138 | } 139 | 140 | public static String getCharsetFromContent(final URL url) throws IOException { 141 | final InputStream stream = url.openStream(); 142 | final byte chunk[] = new byte[2048]; 143 | final int bytesRead = stream.read(chunk); 144 | if (bytesRead > 0) { 145 | final String startContent = new String(chunk); 146 | final String pattern = "\\]"; 147 | final Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(startContent); 148 | if (matcher.find()) { 149 | final String charset = matcher.group(1); 150 | if (Charset.isSupported(charset)) { 151 | return charset; 152 | } 153 | } 154 | } 155 | 156 | return null; 157 | } 158 | 159 | public static boolean isHexadecimalDigit(final char ch) { 160 | return Character.isDigit(ch) || ch == 'A' || ch == 'a' || ch == 'B' || ch == 'b' || ch == 'C' || ch == 'c' 161 | || ch == 'D' || ch == 'd' || ch == 'E' || ch == 'e' || ch == 'F' || ch == 'f'; 162 | } 163 | 164 | public static boolean isValidXmlChar(final char ch) { 165 | return ((ch >= 0x20) && (ch <= 0xD7FF)) || (ch == 0x9) || (ch == 0xA) || (ch == 0xD) 166 | || ((ch >= 0xE000) && (ch <= 0xFFFD)) || ((ch >= 0x10000) && (ch <= 0x10FFFF)); 167 | } 168 | 169 | public static boolean isReservedXmlChar(final char ch) { 170 | return (ch < RESERVED_XML_CHARS_SIZE && RESERVED_XML_CHARS[ch] != null); 171 | } 172 | 173 | public static boolean isValidInt(final String s, final int radix) { 174 | try { 175 | Integer.parseInt(s, radix); 176 | return true; 177 | 
} catch (NumberFormatException e) { 178 | return false; 179 | } 180 | } 181 | 182 | /** 183 | * Escapes XML string. 184 | * 185 | * @param s 186 | * String to be escaped 187 | * @param props 188 | * Cleaner properties gover affect escaping behaviour 189 | * @param isDomCreation 190 | * Tells if escaped content will be part of the DOM 191 | */ 192 | public static String escapeXml(final String s, final CleanerProperties props, final boolean isDomCreation) { 193 | final boolean advanced = props.isAdvancedXmlEscape(); 194 | final boolean recognizeUnicodeChars = props.isRecognizeUnicodeChars(); 195 | final boolean translateSpecialEntities = props.isTranslateSpecialEntities(); 196 | 197 | if (s != null) { 198 | final int len = s.length(); 199 | final StringBuilder result = new StringBuilder(len); 200 | 201 | for (int i = 0; i < len; i++) { 202 | final char ch = s.charAt(i); 203 | 204 | if (ch == '&') { 205 | if ((advanced || recognizeUnicodeChars) && (i < len - 2) && (s.charAt(i + 1) == '#')) { 206 | final boolean isHex = Character.toLowerCase(s.charAt(i + 2)) == 'x'; 207 | int charIndex = i + (isHex ? 3 : 2); 208 | final int radix = isHex ? 16 : 10; 209 | String unicode = ""; 210 | while (charIndex < len) { 211 | final char currCh = s.charAt(charIndex); 212 | if (currCh == ';') { 213 | break; 214 | } else if (isValidInt(unicode + currCh, radix)) { 215 | unicode += currCh; 216 | charIndex++; 217 | } else { 218 | charIndex--; 219 | break; 220 | } 221 | } 222 | 223 | if (isValidInt(unicode, radix)) { 224 | final char unicodeChar = (char) Integer.parseInt(unicode, radix); 225 | if (!isValidXmlChar(unicodeChar)) { 226 | i = charIndex; 227 | } else if (!isReservedXmlChar(unicodeChar)) { 228 | result.append(recognizeUnicodeChars ? String.valueOf(unicodeChar) : "&#" + unicode 229 | + ";"); 230 | i = charIndex; 231 | } else { 232 | i = charIndex; 233 | result.append("&#" + (isHex ? 
"x" : "") + unicode + ";"); 234 | } 235 | } else { 236 | result.append("&"); 237 | } 238 | } else { 239 | if (translateSpecialEntities) { 240 | // get minimal following sequence required to 241 | // recognize some special entitiy 242 | final String seq = s.substring(i, 243 | i + Math.min(SpecialEntity.getMaxEntityLength() + 2, len - i)); 244 | final int semiIndex = seq.indexOf(';'); 245 | if (semiIndex > 0) { 246 | final String entityKey = seq.substring(1, semiIndex); 247 | final SpecialEntity entity = SpecialEntity.getEntity(entityKey); 248 | if (entity != null) { 249 | result.append(props.isTransSpecialEntitiesToNCR() ? entity.getDecimalNCR() : entity 250 | .getCharacter()); 251 | i += entityKey.length() + 1; 252 | continue; 253 | } 254 | } 255 | } 256 | 257 | if (advanced) { 258 | final String sub = s.substring(i); 259 | boolean isReservedSeq = false; 260 | for (int j = 0; j < RESERVED_XML_CHARS_LIST.length; j++) { 261 | final char currentChar = RESERVED_XML_CHARS_LIST[j]; 262 | final String seq = RESERVED_XML_CHARS[currentChar]; 263 | if (sub.startsWith(seq)) { 264 | result.append(isDomCreation ? currentChar : (props.isTransResCharsToNCR() ? "&#" 265 | + (int) currentChar + ";" : seq)); 266 | i += seq.length() - 1; 267 | isReservedSeq = true; 268 | break; 269 | } 270 | } 271 | if (!isReservedSeq) { 272 | result.append(isDomCreation ? "&" : (props.isTransResCharsToNCR() ? "&#" + (int) '&' 273 | + ";" : RESERVED_XML_CHARS['&'])); 274 | } 275 | continue; 276 | } 277 | 278 | result.append("&"); 279 | } 280 | } else if (isReservedXmlChar(ch)) { 281 | result.append(props.isTransResCharsToNCR() ? "&#" + (int) ch + ";" : (isDomCreation ? ch 282 | : RESERVED_XML_CHARS[ch])); 283 | } else { 284 | result.append(ch); 285 | } 286 | } 287 | 288 | return result.toString(); 289 | } 290 | 291 | return null; 292 | } 293 | 294 | /** 295 | * Checks whether specified object's string representation is empty string 296 | * (containing of only whitespaces). 
297 | * 298 | * @param object 299 | * Object whose string representation is checked 300 | * @return true, if empty string, false otherwise 301 | */ 302 | public static boolean isWhitespaceString(final Object object) { 303 | if (object != null) { 304 | final String s = object.toString(); 305 | return s != null && "".equals(s.trim()); 306 | } 307 | return false; 308 | } 309 | 310 | /** 311 | * Checks if specified character can be part of xml identifier (tag name of 312 | * attribute name) and is not standard identifier character. 313 | * 314 | * @param ch 315 | * Character to be checked 316 | * @return True if it can be part of xml identifier 317 | */ 318 | public static boolean isIdentifierHelperChar(final char ch) { 319 | return ':' == ch || '.' == ch || '-' == ch || '_' == ch; 320 | } 321 | 322 | /** 323 | * Chacks whether specified string can be valid tag name or attribute name 324 | * in xml. 325 | * 326 | * @param s 327 | * String to be checked 328 | * @return True if string is valid xml identifier, false otherwise 329 | */ 330 | public static boolean isValidXmlIdentifier(final String s) { 331 | if (s != null) { 332 | final int len = s.length(); 333 | if (len == 0) { 334 | return false; 335 | } 336 | for (int i = 0; i < len; i++) { 337 | final char ch = s.charAt(i); 338 | if ((i == 0 && !Character.isUnicodeIdentifierStart(ch) && ch != '_') 339 | || (!Character.isUnicodeIdentifierStart(ch) && !Character.isDigit(ch) && !Utils 340 | .isIdentifierHelperChar(ch))) { 341 | return false; 342 | } 343 | } 344 | return true; 345 | } 346 | 347 | return false; 348 | } 349 | 350 | /** 351 | * @param o 352 | * @return True if specified string is null of contains only whitespace 353 | * characters 354 | */ 355 | public static boolean isEmptyString(final Object o) { 356 | return o == null || "".equals(o.toString().trim()); 357 | } 358 | 359 | /** 360 | * Evaluates string template for specified map of variables. 
Template string 361 | * can contain dynamic parts in the form of ${VARNAME}. Each such part is 362 | * replaced with value of the variable if such exists in the map, or with 363 | * empty string otherwise. 364 | * 365 | * @param template 366 | * Template string 367 | * @param variables 368 | * Map of variables (can be null) 369 | * @return Evaluated string 370 | */ 371 | public static String evaluateTemplate(final String template, final Map variables) { 372 | if (template == null) { 373 | return template; 374 | } 375 | 376 | final StringBuilder result = new StringBuilder(); 377 | 378 | int startIndex = template.indexOf(VAR_START); 379 | int endIndex = -1; 380 | 381 | while (startIndex >= 0 && startIndex < template.length()) { 382 | result.append(template.substring(endIndex + 1, startIndex)); 383 | endIndex = template.indexOf(VAR_END, startIndex); 384 | 385 | if (endIndex > startIndex) { 386 | final String varName = template.substring(startIndex + VAR_START.length(), endIndex); 387 | final Object resultObj = variables != null ? variables.get(varName.toLowerCase()) : ""; 388 | result.append(resultObj == null ? 
"" : resultObj.toString()); 389 | } 390 | 391 | startIndex = template.indexOf(VAR_START, Math.max(endIndex + VAR_END.length(), startIndex + 1)); 392 | } 393 | 394 | result.append(template.substring(endIndex + 1)); 395 | 396 | return result.toString(); 397 | } 398 | 399 | public static String[] tokenize(final String s, final String delimiters) { 400 | if (s == null) { 401 | return new String[] {}; 402 | } 403 | 404 | final StringTokenizer tokenizer = new StringTokenizer(s, delimiters); 405 | final String result[] = new String[tokenizer.countTokens()]; 406 | int index = 0; 407 | while (tokenizer.hasMoreTokens()) { 408 | result[index++] = tokenizer.nextToken(); 409 | } 410 | 411 | return result; 412 | } 413 | 414 | public static void updateTagTransformations(final CleanerTransformations transformations, final String key, 415 | final String value) { 416 | final int index = key.indexOf('.'); 417 | 418 | // new tag transformation case (tagname[=destname[,preserveatts]]) 419 | if (index <= 0) { 420 | String destTag = null; 421 | boolean preserveSourceAtts = true; 422 | if (value != null) { 423 | final String[] tokens = tokenize(value, ",;"); 424 | if (tokens.length > 0) { 425 | destTag = tokens[0]; 426 | } 427 | if (tokens.length > 1) { 428 | preserveSourceAtts = "true".equalsIgnoreCase(tokens[1]) || "yes".equalsIgnoreCase(tokens[1]) 429 | || "1".equals(tokens[1]); 430 | } 431 | } 432 | final TagTransformation newTagTrans = new TagTransformation(key, destTag, preserveSourceAtts); 433 | transformations.addTransformation(newTagTrans); 434 | } else { // attribute transformation description 435 | final String[] parts = tokenize(key, "."); 436 | final String tagName = parts[0]; 437 | final TagTransformation trans = transformations.getTransformation(tagName); 438 | if (trans != null) { 439 | trans.addAttributeTransformation(parts[1], value); 440 | } 441 | } 442 | } 443 | 444 | /** 445 | * Checks if specified link is full URL. 
446 | * 447 | * @param link 448 | * @return True, if full URl, false otherwise. 449 | */ 450 | public static boolean isFullUrl(String link) { 451 | if (link == null) { 452 | return false; 453 | } 454 | link = link.trim().toLowerCase(); 455 | return link.startsWith("http://") || link.startsWith("https://") || link.startsWith("file://"); 456 | } 457 | 458 | /** 459 | * Calculates full URL for specified page URL and link which could be full, 460 | * absolute or relative like there can be found in A or IMG tags. 461 | */ 462 | public static String fullUrl(String pageUrl, final String link) { 463 | if (isFullUrl(link)) { 464 | return link; 465 | } else if (link != null && link.charAt(0) == '?') { 466 | final int qindex = pageUrl.indexOf('?'); 467 | final int len = pageUrl.length(); 468 | if (qindex < 0) { 469 | return pageUrl + link; 470 | } else if (qindex == len - 1) { 471 | return pageUrl.substring(0, len - 1) + link; 472 | } else { 473 | return pageUrl + "&" + link.substring(1); 474 | } 475 | } 476 | 477 | final boolean isLinkAbsolute = (link.charAt(0) == '/'); 478 | 479 | if (!isFullUrl(pageUrl)) { 480 | pageUrl = "http://" + pageUrl; 481 | } 482 | 483 | final int slashIndex = isLinkAbsolute ? pageUrl.indexOf('/', 8) : pageUrl.lastIndexOf('/'); 484 | if (slashIndex <= 8) { 485 | pageUrl += "/"; 486 | } else { 487 | pageUrl = pageUrl.substring(0, slashIndex + 1); 488 | } 489 | 490 | return isLinkAbsolute ? 
pageUrl + link.substring(1) : pageUrl + link; 491 | } 492 | 493 | /** 494 | * @param name 495 | * @return For xml element name or attribute name returns prefix (part 496 | * before :) or null if there is no prefix 497 | */ 498 | public static String getXmlNSPrefix(final String name) { 499 | final int colIndex = name.indexOf(':'); 500 | if (colIndex > 0) { 501 | return name.substring(0, colIndex); 502 | } 503 | 504 | return null; 505 | } 506 | 507 | /** 508 | * @param name 509 | * @return For xml element name or attribute name returns name after prefix 510 | * (part after :) 511 | */ 512 | public static String getXmlName(final String name) { 513 | final int colIndex = name.indexOf(':'); 514 | if (colIndex > 0 && colIndex < name.length() - 1) { 515 | return name.substring(colIndex + 1); 516 | } 517 | 518 | return name; 519 | } 520 | 521 | } 522 | -------------------------------------------------------------------------------- /src/org/htmlcleaner/XPather.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2011 Zheng Sun 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | /* Copyright (c) 2006-2007, Vladimir Nikic 18 | All rights reserved. 
19 | 20 | Redistribution and use of this software in source and binary forms, 21 | with or without modification, are permitted provided that the following 22 | conditions are met: 23 | 24 | * Redistributions of source code must retain the above 25 | copyright notice, this list of conditions and the 26 | following disclaimer. 27 | 28 | * Redistributions in binary form must reproduce the above 29 | copyright notice, this list of conditions and the 30 | following disclaimer in the documentation and/or other 31 | materials provided with the distribution. 32 | 33 | * The name of HtmlCleaner may not be used to endorse or promote 34 | products derived from this software without specific prior 35 | written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 47 | POSSIBILITY OF SUCH DAMAGE. 48 | 49 | You can contact Vladimir Nikic by sending e-mail to 50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the 51 | subject line. 52 | */ 53 | 54 | package org.htmlcleaner; 55 | 56 | import java.util.*; 57 | 58 | /** 59 | *

 * <p>
 * Utility for searching cleaned document tree with XPath expressions.
 * </p>
 * Examples of supported axes:
 * <ul>
 * <li>{@code //div//a}</li>
 * <li>{@code //div//a[@id][@class]}</li>
 * <li>{@code /body/*[1]/@type}</li>
 * <li>{@code //div[3]//a[@id][@href='r/n4']}</li>
 * <li>{@code (//div[last() >= 4]//./div[position() = last()])[position() > 22]//li[2]//a}</li>
 * <li>{@code //div[2]/@*[2]}</li>
 * <li>{@code data(//div//a[@id][@class])}</li>
 * <li>{@code //p/last()}</li>
 * <li>{@code //body//div[3][@class]//span[12.2<position()]/@id}</li>
 * <li>{@code data(//a['v' < @id])}</li>
 * </ul>
76 | */ 77 | public class XPather { 78 | 79 | // array of basic tokens of which XPath expression is made 80 | final private String tokenArray[]; 81 | 82 | /** 83 | * Constructor - creates XPather instance with specified XPath expression. 84 | * 85 | * @param expression 86 | */ 87 | public XPather(final String expression) { 88 | final StringTokenizer tokenizer = new StringTokenizer(expression, "/()[]\"'=<>", true); 89 | final int tokenCount = tokenizer.countTokens(); 90 | tokenArray = new String[tokenCount]; 91 | 92 | int index = 0; 93 | 94 | // this is not real XPath compiler, rather simple way to recognize basic 95 | // XPaths expressions 96 | // and interpret them against some TagNode instance. 97 | while (tokenizer.hasMoreTokens()) { 98 | tokenArray[index++] = tokenizer.nextToken(); 99 | } 100 | } 101 | 102 | private Collection evaluateAgainst(final Collection object, int from, final int to, final boolean isRecursive, 103 | final int position, final int last, final boolean isFilterContext, final Collection filterSource) 104 | throws XPatherException { 105 | if (from >= 0 && to < tokenArray.length && from <= to) { 106 | if (tokenArray[from].trim().length() == 0) { 107 | return evaluateAgainst(object, from + 1, to, isRecursive, position, last, isFilterContext, filterSource); 108 | } else if (isToken("(", from)) { 109 | final int closingBracket = findClosingIndex(from, to); 110 | if (closingBracket > 0) { 111 | final Collection value = evaluateAgainst(object, from + 1, closingBracket - 1, false, position, 112 | last, isFilterContext, filterSource); 113 | return evaluateAgainst(value, closingBracket + 1, to, false, position, last, isFilterContext, 114 | filterSource); 115 | } else { 116 | throwStandardException(); 117 | } 118 | } else if (isToken("[", from)) { 119 | final int closingBracket = findClosingIndex(from, to); 120 | if (closingBracket > 0) { 121 | final Collection value = filterByCondition(object, from + 1, closingBracket - 1); 122 | return 
evaluateAgainst(value, closingBracket + 1, to, false, position, last, isFilterContext, 123 | filterSource); 124 | } else { 125 | throwStandardException(); 126 | } 127 | } else if (isToken("\"", from) || isToken("'", from)) { 128 | // string constant 129 | final int closingQuote = findClosingIndex(from, to); 130 | if (closingQuote > from) { 131 | final Collection value = singleton(flatten(from + 1, closingQuote - 1)); 132 | return evaluateAgainst(value, closingQuote + 1, to, false, position, last, isFilterContext, 133 | filterSource); 134 | } else { 135 | throwStandardException(); 136 | } 137 | } else if ((isToken("=", from) || isToken("<", from) || isToken(">", from)) && isFilterContext) { 138 | // operator inside filter 139 | final boolean logicValue; 140 | if (isToken("=", from + 1) && (isToken("<", from) || isToken(">", from))) { 141 | final Collection secondObject = evaluateAgainst(filterSource, from + 2, to, false, position, last, 142 | isFilterContext, filterSource); 143 | logicValue = evaluateLogic(object, secondObject, tokenArray[from] + tokenArray[from + 1]); 144 | } else { 145 | final Collection secondObject = evaluateAgainst(filterSource, from + 1, to, false, position, last, 146 | isFilterContext, filterSource); 147 | logicValue = evaluateLogic(object, secondObject, tokenArray[from]); 148 | } 149 | return singleton(Boolean.valueOf(logicValue)); 150 | } else if (isToken("/", from)) { // children of the node 151 | final boolean goRecursive = isToken("/", from + 1); 152 | if (goRecursive) { 153 | from++; 154 | } 155 | if (from < to) { 156 | int toIndex = findClosingIndex(from, to) - 1; 157 | if (toIndex <= from) { 158 | toIndex = to; 159 | } 160 | final Collection value = evaluateAgainst(object, from + 1, toIndex, goRecursive, 1, last, 161 | isFilterContext, filterSource); 162 | return evaluateAgainst(value, toIndex + 1, to, false, 1, last, isFilterContext, filterSource); 163 | } else { 164 | throwStandardException(); 165 | } 166 | } else if 
(isFunctionCall(from, to)) { 167 | final int closingBracketIndex = findClosingIndex(from + 1, to); 168 | final Collection funcValue = evaluateFunction(object, from, to, position, last, isFilterContext); 169 | return evaluateAgainst(funcValue, closingBracketIndex + 1, to, false, 1, last, isFilterContext, 170 | filterSource); 171 | } else if (isValidInteger(tokenArray[from])) { 172 | final Collection value = singleton(Integer.valueOf(tokenArray[from])); 173 | return evaluateAgainst(value, from + 1, to, false, position, last, isFilterContext, filterSource); 174 | } else if (isValidDouble(tokenArray[from])) { 175 | final Collection value = singleton(new Double(tokenArray[from])); 176 | return evaluateAgainst(value, from + 1, to, false, position, last, isFilterContext, filterSource); 177 | } else { 178 | return getElementsByName(object, from, to, isRecursive, isFilterContext); 179 | } 180 | } else { 181 | return object; 182 | } 183 | 184 | throw new XPatherException(); 185 | } 186 | 187 | /** 188 | * Main public method for this class - a way to execute XPath expression 189 | * against specified TagNode instance. 190 | * 191 | * @param node 192 | */ 193 | public Object[] evaluateAgainstNode(final TagNode node) throws XPatherException { 194 | if (node == null) { 195 | throw new XPatherException("Cannot evaluate XPath expression against null value!"); 196 | } 197 | 198 | final Collection collectionResult = evaluateAgainst(singleton(node), 0, tokenArray.length - 1, false, 1, 0, 199 | false, null); 200 | final Object[] array = new Object[collectionResult.size()]; 201 | 202 | final Iterator iterator = collectionResult.iterator(); 203 | int index = 0; 204 | while (iterator.hasNext()) { 205 | array[index++] = iterator.next(); 206 | } 207 | 208 | return array; 209 | } 210 | 211 | /** 212 | * Evaluates specified function. 
Currently, following XPath functions are 213 | * supported: last, position, text, count, data 214 | * 215 | * @param source 216 | * @param from 217 | * @param to 218 | * @param position 219 | * @param last 220 | * @return Collection as the result of evaluation. 221 | */ 222 | private Collection evaluateFunction(final Collection source, final int from, final int to, final int position, 223 | final int last, final boolean isFilterContext) throws XPatherException { 224 | final String name = tokenArray[from].trim(); 225 | final ArrayList result = new ArrayList(); 226 | 227 | final int size = source.size(); 228 | final Iterator iterator = source.iterator(); 229 | int index = 0; 230 | while (iterator.hasNext()) { 231 | final Object curr = iterator.next(); 232 | index++; 233 | if ("last".equals(name)) { 234 | result.add(Integer.valueOf(isFilterContext ? last : size)); 235 | } else if ("position".equals(name)) { 236 | result.add(Integer.valueOf(isFilterContext ? position : index)); 237 | } else if ("text".equals(name)) { 238 | if (curr instanceof TagNode) { 239 | result.add(((TagNode) curr).getText()); 240 | } else if (curr instanceof String) { 241 | result.add(curr.toString()); 242 | } 243 | } else if ("count".equals(name)) { 244 | final Collection argumentEvaluated = evaluateAgainst(source, from + 2, to - 1, false, position, 0, 245 | isFilterContext, null); 246 | result.add(Integer.valueOf(argumentEvaluated.size())); 247 | } else if ("data".equals(name)) { 248 | final Collection argumentEvaluated = evaluateAgainst(source, from + 2, to - 1, false, position, 0, 249 | isFilterContext, null); 250 | final Iterator it = argumentEvaluated.iterator(); 251 | while (it.hasNext()) { 252 | final Object elem = it.next(); 253 | if (elem instanceof TagNode) { 254 | result.add(((TagNode) elem).getText()); 255 | } else if (elem instanceof String) { 256 | result.add(elem.toString()); 257 | } 258 | } 259 | } else { 260 | throw new XPatherException("Unknown function " + name + "!"); 261 | } 
262 | } 263 | 264 | return result; 265 | } 266 | 267 | /** 268 | * Evaluates logic operation on two collections. 269 | * 270 | * @param first 271 | * @param second 272 | * @param logicOperator 273 | * @return Result of logic operation 274 | */ 275 | private boolean evaluateLogic(final Collection first, final Collection second, final String logicOperator) { 276 | if (first == null || first.isEmpty() || second == null || second.isEmpty()) { 277 | return false; 278 | } 279 | final Object elem1 = first.iterator().next(); 280 | final Object elem2 = second.iterator().next(); 281 | if (elem1 instanceof Number && elem2 instanceof Number) { 282 | final double d1 = ((Number) elem1).doubleValue(); 283 | final double d2 = ((Number) elem2).doubleValue(); 284 | if ("=".equals(logicOperator)) { 285 | return d1 == d2; 286 | } else if ("<".equals(logicOperator)) { 287 | return d1 < d2; 288 | } else if (">".equals(logicOperator)) { 289 | return d1 > d2; 290 | } else if ("<=".equals(logicOperator)) { 291 | return d1 <= d2; 292 | } else if (">=".equals(logicOperator)) { 293 | return d1 >= d2; 294 | } 295 | } else { 296 | final String s1 = toText(elem1); 297 | final String s2 = toText(elem2); 298 | final int result = s1.compareTo(s2); 299 | if ("=".equals(logicOperator)) { 300 | return result == 0; 301 | } else if ("<".equals(logicOperator)) { 302 | return result < 0; 303 | } else if (">".equals(logicOperator)) { 304 | return result > 0; 305 | } else if ("<=".equals(logicOperator)) { 306 | return result <= 0; 307 | } else if (">=".equals(logicOperator)) { 308 | return result >= 0; 309 | } 310 | } 311 | 312 | return false; 313 | } 314 | 315 | /** 316 | * Filter nodes satisfying the condition 317 | * 318 | * @param source 319 | * @param from 320 | * @param to 321 | */ 322 | private final Collection filterByCondition(final Collection source, final int from, final int to) 323 | throws XPatherException { 324 | final ArrayList result = new ArrayList(); 325 | final Iterator iterator = 
source.iterator(); 326 | int index = 0; 327 | final int size = source.size(); 328 | while (iterator.hasNext()) { 329 | final Object curr = iterator.next(); 330 | index++; 331 | 332 | final ArrayList logicValueList = new ArrayList(evaluateAgainst(singleton(curr), from, to, false, index, 333 | size, true, singleton(curr))); 334 | if (logicValueList.size() >= 1) { 335 | final Object first = logicValueList.get(0); 336 | if (first instanceof Boolean) { 337 | if (((Boolean) first).booleanValue()) { 338 | result.add(curr); 339 | } 340 | } else if (first instanceof Integer) { 341 | if (((Integer) first).intValue() == index) { 342 | result.add(curr); 343 | } 344 | } else { 345 | result.add(curr); 346 | } 347 | } 348 | } 349 | return result; 350 | } 351 | 352 | /** 353 | * @param from 354 | * @param to 355 | * @return matching closing index in the token array for the current token, 356 | * or -1 if there is no closing token within expected bounds. 357 | */ 358 | private int findClosingIndex(final int from, final int to) { 359 | if (from < to) { 360 | final String currToken = tokenArray[from]; 361 | 362 | if ("\"".equals(currToken)) { 363 | for (int i = from + 1; i <= to; i++) { 364 | if ("\"".equals(tokenArray[i])) { 365 | return i; 366 | } 367 | } 368 | } else if ("'".equals(currToken)) { 369 | for (int i = from + 1; i <= to; i++) { 370 | if ("'".equals(tokenArray[i])) { 371 | return i; 372 | } 373 | } 374 | } else if ("(".equals(currToken) || "[".equals(currToken) || "/".equals(currToken)) { 375 | boolean isQuoteClosed = true; 376 | boolean isAposClosed = true; 377 | int brackets = "(".equals(currToken) ? 1 : 0; 378 | int angleBrackets = "[".equals(currToken) ? 1 : 0; 379 | int slashes = "/".equals(currToken) ? 
1 : 0; 380 | for (int i = from + 1; i <= to; i++) { 381 | if ("\"".equals(tokenArray[i])) { 382 | isQuoteClosed = !isQuoteClosed; 383 | } else if ("'".equals(tokenArray[i])) { 384 | isAposClosed = !isAposClosed; 385 | } else if ("(".equals(tokenArray[i]) && isQuoteClosed && isAposClosed) { 386 | brackets++; 387 | } else if (")".equals(tokenArray[i]) && isQuoteClosed && isAposClosed) { 388 | brackets--; 389 | } else if ("[".equals(tokenArray[i]) && isQuoteClosed && isAposClosed) { 390 | angleBrackets++; 391 | } else if ("]".equals(tokenArray[i]) && isQuoteClosed && isAposClosed) { 392 | angleBrackets--; 393 | } else if ("/".equals(tokenArray[i]) && isQuoteClosed && isAposClosed && brackets == 0 394 | && angleBrackets == 0) { 395 | slashes--; 396 | } 397 | 398 | if (isQuoteClosed && isAposClosed && brackets == 0 && angleBrackets == 0 && slashes == 0) { 399 | return i; 400 | } 401 | } 402 | } 403 | 404 | } 405 | 406 | return -1; 407 | } 408 | 409 | private String flatten(final int from, final int to) { 410 | if (from <= to) { 411 | final StringBuffer result = new StringBuffer(); 412 | for (int i = from; i <= to; i++) { 413 | result.append(tokenArray[i]); 414 | } 415 | 416 | return result.toString(); 417 | } 418 | 419 | return ""; 420 | } 421 | 422 | /** 423 | * For the given source collection and specified name, returns collection of 424 | * subnodes or attribute values. 425 | * 426 | * @param source 427 | * @param from 428 | * @param to 429 | * @param isRecursive 430 | * @return Colection of TagNode instances or collection of String instances. 
431 | */ 432 | private Collection getElementsByName(final Collection source, final int from, final int to, 433 | final boolean isRecursive, final boolean isFilterContext) throws XPatherException { 434 | String name = tokenArray[from].trim(); 435 | 436 | if (isAtt(name)) { 437 | name = name.substring(1); 438 | final Collection result = new ArrayList(); 439 | Collection nodes; 440 | if (isRecursive) { 441 | nodes = new LinkedHashSet(); 442 | final Iterator iterator = source.iterator(); 443 | while (iterator.hasNext()) { 444 | final Object next = iterator.next(); 445 | if (next instanceof TagNode) { 446 | final TagNode node = (TagNode) next; 447 | nodes.addAll(node.getAllElementsList(true)); 448 | } 449 | } 450 | } else { 451 | nodes = source; 452 | } 453 | 454 | final Iterator iterator = nodes.iterator(); 455 | while (iterator.hasNext()) { 456 | final Object next = iterator.next(); 457 | if (next instanceof TagNode) { 458 | final TagNode node = (TagNode) next; 459 | if ("*".equals(name)) { 460 | result.addAll(evaluateAgainst(node.getAttributes().values(), from + 1, to, false, 1, 1, 461 | isFilterContext, null)); 462 | } else { 463 | final String attValue = node.getAttributeByName(name); 464 | if (attValue != null) { 465 | result.addAll(evaluateAgainst(singleton(attValue), from + 1, to, false, 1, 1, 466 | isFilterContext, null)); 467 | } 468 | } 469 | } else { 470 | throwStandardException(); 471 | } 472 | } 473 | return result; 474 | } else { 475 | final Collection result = new LinkedHashSet(); 476 | final Iterator iterator = source.iterator(); 477 | int index = 0; 478 | while (iterator.hasNext()) { 479 | final Object next = iterator.next(); 480 | if (next instanceof TagNode) { 481 | final TagNode node = (TagNode) next; 482 | index++; 483 | final boolean isSelf = ".".equals(name); 484 | final boolean isParent = "..".equals(name); 485 | final boolean isAll = "*".equals(name); 486 | 487 | final Collection subnodes; 488 | if (isSelf) { 489 | subnodes = singleton(node); 
490 | } else if (isParent) { 491 | final TagNode parent = node.getParent(); 492 | subnodes = parent != null ? singleton(parent) : new ArrayList(); 493 | } else { 494 | subnodes = isAll ? node.getChildTagList() : node.getElementListByName(name, false); 495 | } 496 | 497 | final LinkedHashSet nodeSet = new LinkedHashSet(subnodes); 498 | final Collection refinedSubnodes = evaluateAgainst(nodeSet, from + 1, to, false, index, nodeSet 499 | .size(), isFilterContext, null); 500 | 501 | if (isRecursive) { 502 | final List childTags = node.getChildTagList(); 503 | if (isSelf || isParent || isAll) { 504 | result.addAll(refinedSubnodes); 505 | } 506 | final Iterator childIterator = childTags.iterator(); 507 | while (childIterator.hasNext()) { 508 | final TagNode childTag = (TagNode) childIterator.next(); 509 | final Collection childrenByName = getElementsByName(singleton(childTag), from, to, 510 | isRecursive, isFilterContext); 511 | if (!isSelf && !isParent && !isAll && refinedSubnodes.contains(childTag)) { 512 | result.add(childTag); 513 | } 514 | result.addAll(childrenByName); 515 | } 516 | } else { 517 | result.addAll(refinedSubnodes); 518 | } 519 | } else { 520 | throwStandardException(); 521 | } 522 | } 523 | return result; 524 | } 525 | } 526 | 527 | /** 528 | * Checks if token is attribute (starts with @) 529 | * 530 | * @param token 531 | */ 532 | private boolean isAtt(final String token) { 533 | return token != null && token.length() > 1 && token.charAt(0) == '@'; 534 | } 535 | 536 | /** 537 | * Checks if tokens in specified range represents valid function call. 538 | * 539 | * @param from 540 | * @param to 541 | * @return True if it is valid function call, false otherwise. 
542 | */ 543 | private boolean isFunctionCall(final int from, final int to) { 544 | if (!isIdentifier(tokenArray[from]) && !isToken("(", from + 1)) { 545 | return false; 546 | } 547 | 548 | return findClosingIndex(from + 1, to) > from + 1; 549 | } 550 | 551 | /** 552 | * Checks if given string is valid identifier. 553 | * 554 | * @param str 555 | */ 556 | private boolean isIdentifier(String str) { 557 | if (str == null) { 558 | return false; 559 | } 560 | 561 | str = str.trim(); 562 | if (str.length() > 0) { 563 | if (!Character.isLetter(str.charAt(0))) { 564 | return false; 565 | } 566 | for (int i = 1; i < str.length(); i++) { 567 | final char ch = str.charAt(i); 568 | if (ch != '_' && ch != '-' && !Character.isLetterOrDigit(ch)) { 569 | return false; 570 | } 571 | } 572 | } 573 | 574 | return false; 575 | } 576 | 577 | private boolean isToken(final String token, final int index) { 578 | final int len = tokenArray.length; 579 | return index >= 0 && index < len && tokenArray[index].trim().equals(token.trim()); 580 | } 581 | 582 | private boolean isValidDouble(final String s) { 583 | try { 584 | Double.parseDouble(s); 585 | return true; 586 | } catch (NumberFormatException e) { 587 | return false; 588 | } 589 | } 590 | 591 | private boolean isValidInteger(final String s) { 592 | try { 593 | Integer.parseInt(s); 594 | return true; 595 | } catch (NumberFormatException e) { 596 | return false; 597 | } 598 | } 599 | 600 | /** 601 | * Creates one-element collection for the specified object. 
602 | * 603 | * @param element 604 | */ 605 | private Collection singleton(final Object element) { 606 | final ArrayList result = new ArrayList(); 607 | result.add(element); 608 | return result; 609 | } 610 | 611 | private void throwStandardException() throws XPatherException { 612 | throw new XPatherException(); 613 | } 614 | 615 | private String toText(final Object o) { 616 | if (o == null) { 617 | return ""; 618 | } 619 | if (o instanceof TagNode) { 620 | return ((TagNode) o).getText().toString(); 621 | } else { 622 | return o.toString(); 623 | } 624 | } 625 | 626 | } 627 | --------------------------------------------------------------------------------