├── .gitignore
├── .classpath
├── .project
├── README
├── .settings
└── org.eclipse.jdt.core.prefs
└── src
└── org
└── htmlcleaner
├── HtmlNode.java
├── TagNodeVisitor.java
├── BaseToken.java
├── FastHtmlSerializer.java
├── ITagInfoProvider.java
├── TagToken.java
├── XPatherException.java
├── HtmlCleanerException.java
├── EndTagToken.java
├── CommentNode.java
├── ContentNode.java
├── CleanerTransformations.java
├── SimpleHtmlSerializer.java
├── SimpleXmlSerializer.java
├── CompactXmlSerializer.java
├── DoctypeToken.java
├── BrowserCompactXmlSerializer.java
├── CompactHtmlSerializer.java
├── Html5TagProvider.java
├── TagTransformation.java
├── DomSerializer.java
├── XmlSerializer.java
├── PrettyHtmlSerializer.java
├── CleanerProperties.java
├── HtmlSerializer.java
├── Serializer.java
├── TagInfo.java
├── SpecialEntity.java
├── Utils.java
└── XPather.java
/.gitignore:
--------------------------------------------------------------------------------
1 | bin/
2 | release/
3 |
--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
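1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 | 	<name>HtmlCleaner</name>
4 | 	<comment></comment>
5 | 	<projects>
6 | 	</projects>
7 | 	<buildSpec>
8 | 		<buildCommand>
9 | 			<name>org.eclipse.jdt.core.javabuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 	</buildSpec>
14 | 	<natures>
15 | 		<nature>org.eclipse.jdt.core.javanature</nature>
16 | 	</natures>
17 | </projectDescription>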
18 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | HtmlCleaner is a project originally developed by Vladimir Nikic (http://htmlcleaner.sourceforge.net/).
2 |
3 | This version is modified by Zheng Sun.
4 |
5 | Briefly speaking, the modifications are
6 |
7 | * Added *final* keyword to variables if possible, to avoid memory leaks
8 | * Changed some methods for better performance
9 | * Added a new class *FastHtmlSerializer* to output the HTML tree non-recursively, to avoid stack overflow (especially for Android)
10 | * Other minor changes
11 |
12 | HtmlCleaner is used as HTML parser in EasyRSS (http://easyrss.pursuer.me/).
13 | Author: Zheng Sun (http://pursuer.me).
14 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.6
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.source=1.6
12 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/HtmlNode.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | package org.htmlcleaner;
18 |
/**
 * Marker interface implemented by the node types that make up the document
 * tree. It declares no methods; it only tags a type as a tree node.
 */
public interface HtmlNode {
}
24 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/TagNodeVisitor.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | package org.htmlcleaner;
18 |
19 | /**
20 | * Defines action to be performed on TagNodes
21 | */
22 | public interface TagNodeVisitor {
23 |
24 | /**
25 | * Action to be performed on single node in the tree
26 | *
27 | * @param parentNode
28 | * Parent of tagNode
29 | * @param htmlNode
30 | * node visited
31 | * @return True if tree traversal should be continued, false if it has to
32 | * stop.
33 | */
34 | boolean visit(TagNode parentNode, HtmlNode htmlNode);
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/BaseToken.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.IOException;
57 | import java.io.Writer;
58 |
59 | /**
60 | *
61 | * Base token interface. Tokens are individual entities recognized by HTML
62 | * parser.
63 | *
64 | */
65 | public interface BaseToken {
66 | void serialize(Serializer serializer, Writer writer) throws IOException;
67 | }
68 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/FastHtmlSerializer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | package org.htmlcleaner;
18 |
19 | import java.io.IOException;
20 | import java.io.Writer;
21 | import java.util.ArrayList;
22 | import java.util.List;
23 | import java.util.Stack;
24 |
25 | public class FastHtmlSerializer extends HtmlSerializer {
26 | public FastHtmlSerializer(final CleanerProperties props) {
27 | super(props);
28 | }
29 |
30 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException {
31 | final Stack tagStack = new Stack();
32 | final Stack> childStack = new Stack>();
33 | serializeOpenTag(tagNode, writer, false);
34 | if (!isMinimizedTagSyntax(tagNode)) {
35 | tagStack.push(tagNode);
36 | childStack.push(new ArrayList(tagNode.getChildren()));
37 | while (!tagStack.isEmpty()) {
38 | final TagNode tag = tagStack.peek();
39 | final List children = childStack.peek();
40 | if (children.isEmpty()) {
41 | tagStack.pop();
42 | childStack.pop();
43 | if (!isMinimizedTagSyntax(tag)) {
44 | serializeEndTag(tag, writer, false);
45 | }
46 | } else {
47 | final Object item = children.get(0);
48 | children.remove(0);
49 | if (item instanceof ContentNode) {
50 | final String content = item.toString();
51 | writer.write(dontEscape(tag) ? content : escapeText(content));
52 | } else if (item instanceof TagNode) {
53 | final TagNode currentTag = (TagNode) item;
54 | serializeOpenTag(currentTag, writer, false);
55 | tagStack.push(currentTag);
56 | childStack.push(new ArrayList(currentTag.getChildren()));
57 | } else if (item instanceof BaseToken) {
58 | ((BaseToken) item).serialize(this, writer);
59 | }
60 | }
61 | }
62 | }
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/ITagInfoProvider.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | /**
57 | *
58 | * Provides set of TagInfo instances. The instance of this interface is used as
59 | * a collection of tag definitions used in cleanup process. Implementing this
60 | * interface desired behaviour of cleaner can be achived.
61 | * In most cases implementation will be or contain a kind of Map.
62 | *
63 | */
64 | public interface ITagInfoProvider {
65 | TagInfo getTagInfo(String tagName);
66 | }
67 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/TagToken.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | /**
57 | *
58 | * HTML tag token - descendants are start (TagNode) and end token (EndTagToken).
59 | *
60 | */
61 | public abstract class TagToken implements BaseToken {
62 | protected String name;
63 |
64 | public TagToken() {
65 | // TODO empty method
66 | }
67 |
68 | public TagToken(final String name) {
69 | this.name = name;
70 | }
71 |
72 | public String getName() {
73 | return name;
74 | }
75 |
76 | public String toString() {
77 | return name;
78 | }
79 |
80 | abstract public void setAttribute(String attName, String attValue);
81 | }
82 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/XPatherException.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
/**
 * Exception that can occur during XPather evaluation.
 */
public class XPatherException extends Exception {
    private static final long serialVersionUID = 1L;

    /** Message used when the caller does not provide one. */
    private static final String DEFAULT_MESSAGE = "Error in evaluating XPath expression!";

    /** Creates the exception with the default message. */
    public XPatherException() {
        super(DEFAULT_MESSAGE);
    }

    /**
     * @param message
     *            description of the failure
     */
    public XPatherException(final String message) {
        super(message);
    }

    /**
     * @param message
     *            description of the failure
     * @param cause
     *            underlying cause
     */
    public XPatherException(final String message, final Throwable cause) {
        super(message, cause);
    }

    /**
     * @param cause
     *            underlying cause
     */
    public XPatherException(final Throwable cause) {
        super(cause);
    }

}
81 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/HtmlCleanerException.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
/**
 * General HtmlCleaner runtime exception.
 */
public class HtmlCleanerException extends RuntimeException {
    private static final long serialVersionUID = 1L;

    /** Creates the exception with a generic default message. */
    public HtmlCleanerException() {
        // Spelling of the default message corrected ("occureed" -> "occurred").
        this("HtmlCleaner expression occurred!");
    }

    /**
     * @param message
     *            description of the failure
     */
    public HtmlCleanerException(final String message) {
        super(message);
    }

    /**
     * @param message
     *            description of the failure
     * @param cause
     *            underlying cause
     */
    public HtmlCleanerException(final String message, final Throwable cause) {
        super(message, cause);
    }

    /**
     * @param cause
     *            underlying cause
     */
    public HtmlCleanerException(final Throwable cause) {
        super(cause);
    }
}
80 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/EndTagToken.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.Writer;
57 |
58 | /**
59 | *
60 | * HTML tag end token.
61 | *
62 | */
63 | public class EndTagToken extends TagToken {
64 | public EndTagToken() {
65 | super();
66 | }
67 |
68 | public EndTagToken(final String name) {
69 | super(name == null ? null : name.toLowerCase());
70 | }
71 |
72 | public void setAttribute(final String attName, final String attValue) {
73 | // do nothing - simply ignore attributes in closing tag
74 | }
75 |
76 | public void serialize(final Serializer serializer, final Writer writer) {
77 | // do nothing - simply ignore serialization
78 | }
79 |
80 | }
81 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/CommentNode.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.IOException;
57 | import java.io.Writer;
58 |
59 | /**
60 | *
61 | * HTML comment token.
62 | *
63 | */
64 | public class CommentNode implements BaseToken, HtmlNode {
65 | final private StringBuilder content;
66 |
67 | public CommentNode(final String content) {
68 | this.content = new StringBuilder(content);
69 | }
70 |
71 | public String getCommentedContent() {
72 | return "";
73 | }
74 |
75 | public StringBuilder getContent() {
76 | return content;
77 | }
78 |
79 | public void serialize(final Serializer serializer, final Writer writer) throws IOException {
80 | writer.write(getCommentedContent());
81 | }
82 |
83 | public String toString() {
84 | return getCommentedContent();
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/ContentNode.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.IOException;
57 | import java.io.Writer;
58 |
59 | /**
60 | *
61 | * HTML text token.
62 | *
63 | */
64 | public class ContentNode implements BaseToken, HtmlNode {
65 | final private StringBuilder content;
66 |
67 | public ContentNode(final char content[], final int len) {
68 | this.content = new StringBuilder(len + 16);
69 | this.content.append(content, 0, len);
70 | }
71 |
72 | public ContentNode(final String content) {
73 | this.content = new StringBuilder(content);
74 | }
75 |
76 | public StringBuilder getContent() {
77 | return content;
78 | }
79 |
80 | public void serialize(final Serializer serializer, final Writer writer) throws IOException {
81 | writer.write(content.toString());
82 | }
83 |
84 | public String toString() {
85 | return content.toString();
86 | }
87 |
88 | }
89 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/CleanerTransformations.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.util.HashMap;
57 | import java.util.Map;
58 |
59 | /**
60 | * Contains transformation collection.
61 | */
62 | public class CleanerTransformations {
63 |
64 | final private Map mappings = new HashMap();
65 |
66 | /**
67 | * Adds specified tag transformation to the collection.
68 | *
69 | * @param tagTransformation
70 | */
71 | public void addTransformation(final TagTransformation tagTransformation) {
72 | if (tagTransformation != null) {
73 | mappings.put(tagTransformation.getSourceTag(), tagTransformation);
74 | }
75 | }
76 |
77 | public TagTransformation getTransformation(final String tagName) {
78 | return tagName == null ? null : mappings.get(tagName.toLowerCase());
79 | }
80 |
81 | public boolean hasTransformationForTag(final String tagName) {
82 | return tagName != null && mappings.containsKey(tagName.toLowerCase());
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/SimpleHtmlSerializer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.IOException;
57 | import java.io.Writer;
58 |
59 | /**
60 | *
61 | * Simple HTML serializer - creates resulting HTML without indenting and/or
62 | * compacting.
63 | *
64 | */
65 | public class SimpleHtmlSerializer extends HtmlSerializer {
66 | public SimpleHtmlSerializer(final CleanerProperties props) {
67 | super(props);
68 | }
69 |
70 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException {
71 | serializeOpenTag(tagNode, writer, false);
72 |
73 | if (!isMinimizedTagSyntax(tagNode)) {
74 | for (final Object item : tagNode.getChildren()) {
75 | if (item instanceof ContentNode) {
76 | final String content = item.toString();
77 | writer.write(dontEscape(tagNode) ? content : escapeText(content));
78 | } else if (item instanceof BaseToken) {
79 | ((BaseToken) item).serialize(this, writer);
80 | }
81 | }
82 |
83 | serializeEndTag(tagNode, writer, false);
84 | }
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/SimpleXmlSerializer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.IOException;
57 | import java.io.Writer;
58 |
59 | /**
60 | *
61 | * Simple XML serializer - creates resulting XML without indenting lines.
62 | *
63 | */
64 | public class SimpleXmlSerializer extends XmlSerializer {
65 |
66 | public SimpleXmlSerializer(final CleanerProperties props) {
67 | super(props);
68 | }
69 |
70 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException {
71 | serializeOpenTag(tagNode, writer, false);
72 |
73 | if (!isMinimizedTagSyntax(tagNode)) {
74 | for (final Object item : tagNode.getChildren()) {
75 | if (item instanceof ContentNode) {
76 | final String content = item.toString();
77 | writer.write(dontEscape(tagNode) ? content.replaceAll("]]>", "]]>") : escapeXml(content));
78 | } else if (item instanceof BaseToken) {
79 | ((BaseToken) item).serialize(this, writer);
80 | }
81 | }
82 |
83 | serializeEndTag(tagNode, writer, false);
84 | }
85 | }
86 |
87 | }
88 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/CompactXmlSerializer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.IOException;
57 | import java.io.Writer;
58 | import java.util.*;
59 |
60 | /**
61 | *
62 | * Compact XML serializer - creates resulting XML by stripping whitespaces.
63 | *
64 | */
65 | public class CompactXmlSerializer extends XmlSerializer {
66 | public CompactXmlSerializer(final CleanerProperties props) {
67 | super(props);
68 | }
69 |
70 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException {
71 | serializeOpenTag(tagNode, writer, false);
72 |
73 | final List tagChildren = tagNode.getChildren();
74 | if (!isMinimizedTagSyntax(tagNode)) {
75 | final ListIterator childrenIt = tagChildren.listIterator();
76 | while (childrenIt.hasNext()) {
77 | final Object item = childrenIt.next();
78 | if (item instanceof ContentNode) {
79 | final String content = item.toString().trim();
80 | writer.write(dontEscape(tagNode) ? content.replaceAll("]]>", "]]>") : escapeXml(content));
81 |
82 | if (childrenIt.hasNext()) {
83 | if (!Utils.isWhitespaceString(childrenIt.next())) {
84 | writer.write("\n");
85 | }
86 | childrenIt.previous();
87 | }
88 | } else if (item instanceof CommentNode) {
89 | final String content = ((CommentNode) item).getCommentedContent().trim();
90 | writer.write(content);
91 | } else if (item instanceof BaseToken) {
92 | ((BaseToken) item).serialize(this, writer);
93 | }
94 | }
95 |
96 | serializeEndTag(tagNode, writer, false);
97 | }
98 | }
99 |
100 | }
101 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/DoctypeToken.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.IOException;
57 | import java.io.Writer;
58 |
59 | /**
60 | *
61 | * HTML doctype token.
62 | *
63 | */
64 | public class DoctypeToken implements BaseToken {
65 | private static String clean(String s) {
66 | if (s != null) {
67 | s = s.replace('>', ' ');
68 | s = s.replace('<', ' ');
69 | s = s.replace('&', ' ');
70 | s = s.replace('\'', ' ');
71 | s = s.replace('\"', ' ');
72 | }
73 |
74 | return s;
75 | }
76 |
77 | final private String part1;
78 | final private String part2;
79 | final private String part3;
80 | final private String part4;
81 |
82 | public DoctypeToken(final String part1, final String part2, final String part3, final String part4) {
83 | this.part1 = part1 != null ? part1.toUpperCase() : part1;
84 | this.part2 = part2 != null ? part2.toUpperCase() : part2;
85 | this.part3 = clean(part3);
86 | this.part4 = clean(part4);
87 | }
88 |
89 | public String getContent() {
90 | String result = "";
96 | return result;
97 | }
98 |
99 | public String getName() {
100 | return "";
101 | }
102 |
103 | public String getPart1() {
104 | return part1;
105 | }
106 |
107 | public String getPart2() {
108 | return part2;
109 | }
110 |
111 | public String getPart3() {
112 | return part3;
113 | }
114 |
115 | public String getPart4() {
116 | return part4;
117 | }
118 |
119 | public boolean isValid() {
120 | if (part1 == null || "".equals(part1)) {
121 | return false;
122 | }
123 |
124 | if (!"public".equalsIgnoreCase(part2) && !"system".equalsIgnoreCase(part2)) {
125 | return false;
126 | }
127 |
128 | if ("system".equalsIgnoreCase(part2) && part4 != null && !"".equals(part4)) {
129 | return false;
130 | }
131 |
132 | if ("public".equalsIgnoreCase(part2) && (part4 == null || "".equals(part4))) {
133 | return false;
134 | }
135 |
136 | return true;
137 | }
138 |
139 | public void serialize(final Serializer serializer, final Writer writer) throws IOException {
140 | writer.write(getContent() + "\n");
141 | }
142 |
143 | public String toString() {
144 | return getContent();
145 | }
146 | }
147 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/BrowserCompactXmlSerializer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.Writer;
57 | import java.io.IOException;
58 | import java.util.List;
59 | import java.util.ListIterator;
60 |
61 | /**
62 | *
63 | * Broswer compact XML serializer - creates resulting XML by stripping
64 | * whitespaces wherever possible, but preserving single whitespace where at
65 | * least one exists. This behaviour is well suited for web-browsers, which
66 | * usualy treat multiple whitespaces as single one, but make diffrence between
67 | * single whitespace and empty text.
68 | *
69 | */
70 | public class BrowserCompactXmlSerializer extends XmlSerializer {
71 | public BrowserCompactXmlSerializer(final CleanerProperties props) {
72 | super(props);
73 | }
74 |
75 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException {
76 | serializeOpenTag(tagNode, writer, false);
77 |
78 | final List tagChildren = tagNode.getChildren();
79 | if (!isMinimizedTagSyntax(tagNode)) {
80 | final ListIterator childrenIt = tagChildren.listIterator();
81 | while (childrenIt.hasNext()) {
82 | final Object item = childrenIt.next();
83 | if (item instanceof ContentNode) {
84 | String content = item.toString();
85 | final boolean startsWithSpace = content.length() > 0 && Character.isWhitespace(content.charAt(0));
86 | final boolean endsWithSpace = content.length() > 1
87 | && Character.isWhitespace(content.charAt(content.length() - 1));
88 | content = dontEscape(tagNode) ? content.trim().replaceAll("]]>", "]]>") : escapeXml(content
89 | .trim());
90 |
91 | if (startsWithSpace) {
92 | writer.write(' ');
93 | }
94 |
95 | if (content.length() != 0) {
96 | writer.write(content);
97 | if (endsWithSpace) {
98 | writer.write(' ');
99 | }
100 | }
101 |
102 | if (childrenIt.hasNext()) {
103 | if (!Utils.isWhitespaceString(childrenIt.next())) {
104 | writer.write("\n");
105 | }
106 | childrenIt.previous();
107 | }
108 | } else if (item instanceof CommentNode) {
109 | final String content = ((CommentNode) item).getCommentedContent().trim();
110 | writer.write(content);
111 | } else if (item instanceof BaseToken) {
112 | ((BaseToken) item).serialize(this, writer);
113 | }
114 | }
115 |
116 | serializeEndTag(tagNode, writer, false);
117 | }
118 | }
119 |
120 | }
121 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/CompactHtmlSerializer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.IOException;
57 | import java.io.Writer;
58 | import java.util.List;
59 | import java.util.ListIterator;
60 |
61 | /**
62 | *
63 | * Compact HTML serializer - creates resulting HTML by stripping whitespaces
64 | * wherever possible.
65 | *
66 | */
67 | public class CompactHtmlSerializer extends HtmlSerializer {
68 | private int openPreTags = 0;
69 |
70 | public CompactHtmlSerializer(final CleanerProperties props) {
71 | super(props);
72 | }
73 |
74 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException {
75 | final boolean isPreTag = "pre".equalsIgnoreCase(tagNode.getName());
76 | if (isPreTag) {
77 | openPreTags++;
78 | }
79 |
80 | serializeOpenTag(tagNode, writer, false);
81 |
82 | final List tagChildren = tagNode.getChildren();
83 | if (!isMinimizedTagSyntax(tagNode)) {
84 | final ListIterator childrenIt = tagChildren.listIterator();
85 | while (childrenIt.hasNext()) {
86 | final Object item = childrenIt.next();
87 | if (item instanceof ContentNode) {
88 | String content = item.toString();
89 | if (openPreTags > 0) {
90 | writer.write(content);
91 | } else {
92 | final boolean startsWithSpace = content.length() > 0
93 | && Character.isWhitespace(content.charAt(0));
94 | final boolean endsWithSpace = content.length() > 1
95 | && Character.isWhitespace(content.charAt(content.length() - 1));
96 | content = dontEscape(tagNode) ? content.trim() : escapeText(content.trim());
97 |
98 | if (startsWithSpace) {
99 | writer.write(' ');
100 | }
101 |
102 | if (content.length() != 0) {
103 | writer.write(content);
104 | if (endsWithSpace) {
105 | writer.write(' ');
106 | }
107 | }
108 |
109 | if (childrenIt.hasNext()) {
110 | if (!Utils.isWhitespaceString(childrenIt.next())) {
111 | writer.write("\n");
112 | }
113 | childrenIt.previous();
114 | }
115 | }
116 | } else if (item instanceof CommentNode) {
117 | final String content = ((CommentNode) item).getCommentedContent().trim();
118 | writer.write(content);
119 | } else if (item instanceof BaseToken) {
120 | ((BaseToken) item).serialize(this, writer);
121 | }
122 | }
123 |
124 | serializeEndTag(tagNode, writer, false);
125 | if (isPreTag) {
126 | openPreTags--;
127 | }
128 | }
129 | }
130 |
131 | }
132 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/Html5TagProvider.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | public class Html5TagProvider extends DefaultTagProvider {
57 | private static Html5TagProvider instance;
58 | private static final long serialVersionUID = 1L;
59 |
60 | /**
61 | * @return Singleton instance of this class.
62 | */
63 | public static synchronized Html5TagProvider getInstance() {
64 | if (instance == null) {
65 | instance = new Html5TagProvider();
66 | }
67 | return instance;
68 | }
69 |
70 | protected Html5TagProvider() {
71 | super();
72 |
73 | TagInfo tagInfo;
74 |
75 | tagInfo = new TagInfo("time", TagInfo.CONTENT_TEXT, TagInfo.BODY, false, false, false);
76 | tagInfo.defineCloseBeforeCopyInsideTags("bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font");
77 | tagInfo.defineCloseBeforeTags("address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
78 | this.put("time", tagInfo);
79 |
80 | tagInfo = new TagInfo("article", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false);
81 | tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font");
82 | tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
83 | this.put("article", tagInfo);
84 |
85 | tagInfo = new TagInfo("section", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false);
86 | tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font");
87 | tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
88 | this.put("section", tagInfo);
89 |
90 | tagInfo = new TagInfo("header", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false);
91 | tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font");
92 | tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
93 | this.put("header", tagInfo);
94 |
95 | tagInfo = new TagInfo("footer", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false);
96 | tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font");
97 | tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
98 | this.put("footer", tagInfo);
99 |
100 | tagInfo = new TagInfo("aside", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false);
101 | tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font");
102 | tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
103 | this.put("aside", tagInfo);
104 |
105 | tagInfo = new TagInfo("video", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false);
106 | tagInfo.defineCloseBeforeTags("object");
107 | this.put("video", tagInfo);
108 |
109 | tagInfo = new TagInfo("audio", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false);
110 | tagInfo.defineCloseBeforeTags("object");
111 | this.put("audio", tagInfo);
112 |
113 | tagInfo = new TagInfo("source", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false);
114 | tagInfo.defineCloseBeforeTags("source");
115 | this.put("source", tagInfo);
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/TagTransformation.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.util.Map;
57 | import java.util.LinkedHashMap;
58 |
59 | /**
60 | * Describes how specified tag is transformed to another one, or is ignored
61 | * during parsing
62 | */
63 | public class TagTransformation {
64 |
65 | private String sourceTag;
66 | private String destTag;
67 | private boolean preserveSourceAttributes;
68 | private Map attributeTransformations;
69 |
70 | /**
71 | * Creates new tag transformation from source tag to target tag specifying
72 | * whether source tag attributes are preserved.
73 | *
74 | * @param sourceTag
75 | * Name of the tag to be transformed.
76 | * @param destTag
77 | * Name of tag to which source tag is to be transformed.
78 | * @param preserveSourceAttributes
79 | * Tells whether source tag attributes are preserved in
80 | * transformation.
81 | */
82 | public TagTransformation(final String sourceTag, final String destTag, final boolean preserveSourceAttributes) {
83 | this.sourceTag = sourceTag.toLowerCase();
84 | if (destTag == null) {
85 | this.destTag = null;
86 | } else {
87 | this.destTag = Utils.isValidXmlIdentifier(destTag) ? destTag.toLowerCase() : sourceTag;
88 | }
89 | this.preserveSourceAttributes = preserveSourceAttributes;
90 | }
91 |
92 | /**
93 | * Creates new tag transformation from source tag to target tag preserving
94 | * all source tag attributes.
95 | *
96 | * @param sourceTag
97 | * Name of the tag to be transformed.
98 | * @param destTag
99 | * Name of tag to which source tag is to be transformed.
100 | */
101 | public TagTransformation(final String sourceTag, final String destTag) {
102 | this(sourceTag, destTag, true);
103 | }
104 |
105 | /**
106 | * Creates new tag transformation in which specified tag will be skipped
107 | * (ignored) during parsing process.
108 | *
109 | * @param sourceTag
110 | */
111 | public TagTransformation(final String sourceTag) {
112 | this(sourceTag, null);
113 | }
114 |
115 | /**
116 | * Adds new attribute transformation to this tag transformation. It tells
117 | * how destination attribute will look like. Small templating mechanism is
118 | * used to describe attribute value: all names between ${ and } inside the
119 | * template are evaluated against source tag attributes. That way one can
120 | * make attribute values consist of mix of source tag attributes.
121 | *
122 | * @param targetAttName
123 | * Name of the destination attribute
124 | * @param transformationDesc
125 | * Template describing attribute value.
126 | */
127 | public void addAttributeTransformation(final String targetAttName, final String transformationDesc) {
128 | if (attributeTransformations == null) {
129 | attributeTransformations = new LinkedHashMap();
130 | }
131 | attributeTransformations.put(targetAttName.toLowerCase(), transformationDesc);
132 | }
133 |
134 | /**
135 | * Adds new attribute transformation in which destination attrbute will not
136 | * exists (simply removes it from list of attributes).
137 | *
138 | * @param targetAttName
139 | */
140 | public void addAttributeTransformation(final String targetAttName) {
141 | addAttributeTransformation(targetAttName, null);
142 | }
143 |
144 | boolean hasAttributeTransformations() {
145 | return attributeTransformations != null;
146 | }
147 |
148 | String getSourceTag() {
149 | return sourceTag;
150 | }
151 |
152 | String getDestTag() {
153 | return destTag;
154 | }
155 |
156 | boolean isPreserveSourceAttributes() {
157 | return preserveSourceAttributes;
158 | }
159 |
160 | Map getAttributeTransformations() {
161 | return attributeTransformations;
162 | }
163 |
164 | }
165 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/DomSerializer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | package org.htmlcleaner;
18 |
19 | import org.w3c.dom.Comment;
20 | import org.w3c.dom.Document;
21 | import org.w3c.dom.Element;
22 |
23 | import javax.xml.parsers.DocumentBuilderFactory;
24 | import javax.xml.parsers.ParserConfigurationException;
25 | import java.util.Iterator;
26 | import java.util.List;
27 | import java.util.Map;
28 |
29 | /**
30 | *
31 | * DOM serializer - creates xml DOM.
32 | *
33 | */
34 | public class DomSerializer {
35 | protected CleanerProperties props;
36 | protected boolean escapeXml = true;
37 |
38 | public DomSerializer(final CleanerProperties props, final boolean escapeXml) {
39 | this.props = props;
40 | this.escapeXml = escapeXml;
41 | }
42 |
43 | public DomSerializer(final CleanerProperties props) {
44 | this(props, true);
45 | }
46 |
47 | public Document createDOM(final TagNode rootNode) throws ParserConfigurationException {
48 | final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
49 | final Document document = factory.newDocumentBuilder().newDocument();
50 | final Element rootElement = createElement(rootNode, document);
51 | document.appendChild(rootElement);
52 | setAttributes(rootNode, rootElement);
53 | createSubnodes(document, rootElement, rootNode.getChildren());
54 | return document;
55 | }
56 |
57 | private Element createElement(final TagNode node, final Document document) {
58 | String name = node.getName();
59 | final boolean nsAware = props.isNamespacesAware();
60 | final String prefix = Utils.getXmlNSPrefix(name);
61 | final Map nsDeclarations = node.getNamespaceDeclarations();
62 | String nsURI = null;
63 | if (prefix != null) {
64 | if (nsAware) {
65 | if (nsDeclarations != null) {
66 | nsURI = nsDeclarations.get(prefix);
67 | }
68 | if (nsURI == null) {
69 | nsURI = node.getNamespaceURIOnPath(prefix);
70 | }
71 | if (nsURI == null) {
72 | nsURI = prefix;
73 | }
74 | } else {
75 | name = Utils.getXmlName(name);
76 | }
77 | } else {
78 | if (nsAware) {
79 | if (nsDeclarations != null) {
80 | nsURI = nsDeclarations.get("");
81 | }
82 | if (nsURI == null) {
83 | nsURI = node.getNamespaceURIOnPath(prefix);
84 | }
85 | }
86 | }
87 |
88 | if (nsAware && nsURI != null) {
89 | return document.createElementNS(nsURI, name);
90 | } else {
91 | return document.createElement(name);
92 | }
93 | }
94 |
95 | private void setAttributes(final TagNode node, final Element element) {
96 | for (final Map.Entry entry : node.getAttributes().entrySet()) {
97 | final String attrName = entry.getKey();
98 | String attrValue = entry.getValue();
99 | if (escapeXml) {
100 | attrValue = Utils.escapeXml(attrValue, props, true);
101 | }
102 |
103 | final String attPrefix = Utils.getXmlNSPrefix(attrName);
104 | if (attPrefix != null) {
105 | if (props.isNamespacesAware()) {
106 | String nsURI = node.getNamespaceURIOnPath(attPrefix);
107 | if (nsURI == null) {
108 | nsURI = attPrefix;
109 | }
110 | element.setAttributeNS(nsURI, attrName, attrValue);
111 | } else {
112 | element.setAttribute(Utils.getXmlName(attrName), attrValue);
113 | }
114 | } else {
115 | element.setAttribute(attrName, attrValue);
116 | }
117 | }
118 | }
119 |
120 | private void createSubnodes(final Document document, final Element element, final List tagChildren) {
121 | if (tagChildren != null) {
122 | final Iterator it = tagChildren.iterator();
123 | while (it.hasNext()) {
124 | final Object item = it.next();
125 | if (item instanceof CommentNode) {
126 | final CommentNode commentNode = (CommentNode) item;
127 | final Comment comment = document.createComment(commentNode.getContent().toString());
128 | element.appendChild(comment);
129 | } else if (item instanceof ContentNode) {
130 | final String nodeName = element.getNodeName();
131 | String content = item.toString();
132 | final boolean specialCase = props.isUseCdataForScriptAndStyle()
133 | && ("script".equalsIgnoreCase(nodeName) || "style".equalsIgnoreCase(nodeName));
134 | if (escapeXml && !specialCase) {
135 | content = Utils.escapeXml(content, props, true);
136 | }
137 | element.appendChild(specialCase ? document.createCDATASection(content) : document
138 | .createTextNode(content));
139 | } else if (item instanceof TagNode) {
140 | final TagNode subTagNode = (TagNode) item;
141 | final Element subelement = createElement(subTagNode, document);
142 |
143 | setAttributes(subTagNode, subelement);
144 |
145 | // recursively create subnodes
146 | createSubnodes(document, subelement, subTagNode.getChildren());
147 |
148 | element.appendChild(subelement);
149 | } else if (item instanceof List) {
150 | final List sublist = (List) item;
151 | createSubnodes(document, element, sublist);
152 | }
153 | }
154 | }
155 | }
156 |
157 | }
158 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/XmlSerializer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.*;
57 | import java.util.*;
58 |
59 | /**
60 | *
61 | * Abstract XML serializer - contains common logic for descendants.
62 | *
63 | */
64 | public abstract class XmlSerializer extends Serializer {
65 |
66 | protected XmlSerializer(final CleanerProperties props) {
67 | super(props);
68 | }
69 |
70 | /**
71 | * @deprecated Use writeToStream() instead.
72 | */
73 | @Deprecated
74 | public void writeXmlToStream(final TagNode tagNode, final OutputStream out, final String charset)
75 | throws IOException {
76 | super.writeToStream(tagNode, out, charset);
77 | }
78 |
79 | /**
80 | * @deprecated Use writeToStream() instead.
81 | */
82 | @Deprecated
83 | public void writeXmlToStream(final TagNode tagNode, final OutputStream out) throws IOException {
84 | super.writeToStream(tagNode, out);
85 | }
86 |
87 | /**
88 | * @deprecated Use writeToFile() instead.
89 | */
90 | @Deprecated
91 | public void writeXmlToFile(final TagNode tagNode, final String fileName, final String charset) throws IOException {
92 | super.writeToFile(tagNode, fileName, charset);
93 | }
94 |
95 | /**
96 | * @deprecated Use writeToFile() instead.
97 | */
98 | @Deprecated
99 | public void writeXmlToFile(final TagNode tagNode, final String fileName) throws IOException {
100 | super.writeToFile(tagNode, fileName);
101 | }
102 |
103 | /**
104 | * @deprecated Use getAsString() instead.
105 | */
106 | @Deprecated
107 | public String getXmlAsString(final TagNode tagNode, final String charset) throws IOException {
108 | return super.getAsString(tagNode, charset);
109 | }
110 |
111 | /**
112 | * @deprecated Use getAsString() instead.
113 | */
114 | @Deprecated
115 | public String getXmlAsString(final TagNode tagNode) throws IOException {
116 | return super.getAsString(tagNode);
117 | }
118 |
119 | /**
120 | * @deprecated Use write() instead.
121 | */
122 | @Deprecated
123 | public void writeXml(final TagNode tagNode, final Writer writer, final String charset) throws IOException {
124 | super.write(tagNode, writer, charset);
125 | }
126 |
127 | protected String escapeXml(final String xmlContent) {
128 | return Utils.escapeXml(xmlContent, props, false);
129 | }
130 |
131 | protected boolean dontEscape(final TagNode tagNode) {
132 | return props.isUseCdataForScriptAndStyle() && isScriptOrStyle(tagNode);
133 | }
134 |
135 | protected boolean isMinimizedTagSyntax(final TagNode tagNode) {
136 | final TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName());
137 | return tagNode.getChildren().size() == 0
138 | && (props.isUseEmptyElementTags() || (tagInfo != null && tagInfo.isEmptyTag()));
139 | }
140 |
141 | protected void serializeOpenTag(final TagNode tagNode, final Writer writer, final boolean newLine)
142 | throws IOException {
143 | String tagName = tagNode.getName();
144 |
145 | if (Utils.isEmptyString(tagName)) {
146 | return;
147 | }
148 |
149 | final boolean nsAware = props.isNamespacesAware();
150 |
151 | Set definedNSPrefixes = null;
152 | Set additionalNSDeclNeeded = null;
153 |
154 | final String tagPrefix = Utils.getXmlNSPrefix(tagName);
155 | if (tagPrefix != null) {
156 | if (nsAware) {
157 | definedNSPrefixes = new HashSet();
158 | tagNode.collectNamespacePrefixesOnPath(definedNSPrefixes);
159 | if (!definedNSPrefixes.contains(tagPrefix)) {
160 | additionalNSDeclNeeded = new TreeSet();
161 | additionalNSDeclNeeded.add(tagPrefix);
162 | }
163 | } else {
164 | tagName = Utils.getXmlName(tagName);
165 | }
166 | }
167 |
168 | writer.write("<" + tagName);
169 |
170 | // write attributes
171 | for (Map.Entry entry : tagNode.getAttributes().entrySet()) {
172 | String attName = entry.getKey();
173 | final String attPrefix = Utils.getXmlNSPrefix(attName);
174 | if (attPrefix != null) {
175 | if (nsAware) {
176 | // collect used namespace prefixes in attributes in order to
177 | // explicitly define
178 | // ns declaration if needed; otherwise it would be
179 | // ill-formed xml
180 | if (definedNSPrefixes == null) {
181 | definedNSPrefixes = new HashSet();
182 | tagNode.collectNamespacePrefixesOnPath(definedNSPrefixes);
183 | }
184 | if (!definedNSPrefixes.contains(attPrefix)) {
185 | if (additionalNSDeclNeeded == null) {
186 | additionalNSDeclNeeded = new TreeSet();
187 | }
188 | additionalNSDeclNeeded.add(attPrefix);
189 | }
190 | } else {
191 | attName = Utils.getXmlName(attName);
192 | }
193 | }
194 | writer.write(" " + attName + "=\"" + escapeXml(entry.getValue()) + "\"");
195 | }
196 |
197 | // write namespace declarations
198 | if (nsAware) {
199 | final Map nsDeclarations = tagNode.getNamespaceDeclarations();
200 | if (nsDeclarations != null) {
201 | for (Map.Entry entry : nsDeclarations.entrySet()) {
202 | final String prefix = entry.getKey();
203 | String att = "xmlns";
204 | if (prefix.length() > 0) {
205 | att += ":" + prefix;
206 | }
207 | writer.write(" " + att + "=\"" + escapeXml(entry.getValue()) + "\"");
208 | }
209 | }
210 | }
211 |
212 | // write additional namespace declarations needed for this tag in order
213 | // xml to be well-formed
214 | if (additionalNSDeclNeeded != null) {
215 | for (String prefix : additionalNSDeclNeeded) {
216 | writer.write(" xmlns:" + prefix + "=\"" + prefix + "\"");
217 | }
218 | }
219 |
220 | if (isMinimizedTagSyntax(tagNode)) {
221 | writer.write(" />");
222 | if (newLine) {
223 | writer.write("\n");
224 | }
225 | } else if (dontEscape(tagNode)) {
226 | writer.write(">");
229 | }
230 | }
231 |
232 | protected void serializeEndTag(final TagNode tagNode, final Writer writer, final boolean newLine)
233 | throws IOException {
234 | String tagName = tagNode.getName();
235 |
236 | if (Utils.isEmptyString(tagName)) {
237 | return;
238 | }
239 |
240 | if (dontEscape(tagNode)) {
241 | writer.write("]]>");
242 | }
243 |
244 | if (Utils.getXmlNSPrefix(tagName) != null && !props.isNamespacesAware()) {
245 | tagName = Utils.getXmlName(tagName);
246 | }
247 | writer.write("" + tagName + ">");
248 |
249 | if (newLine) {
250 | writer.write("\n");
251 | }
252 | }
253 |
254 | }
255 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/PrettyHtmlSerializer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.*;
57 | import java.util.*;
58 |
59 | /**
60 | *
61 | * Pretty HTML serializer - creates resulting HTML with indenting lines.
62 | *
63 | */
64 | public class PrettyHtmlSerializer extends HtmlSerializer {
65 | private static final String DEFAULT_INDENTATION_STRING = "\t";
66 |
67 | private String indentString = DEFAULT_INDENTATION_STRING;
68 | final private List indents = new ArrayList();
69 |
70 | public PrettyHtmlSerializer(final CleanerProperties props) {
71 | this(props, DEFAULT_INDENTATION_STRING);
72 | }
73 |
74 | public PrettyHtmlSerializer(final CleanerProperties props, final String indentString) {
75 | super(props);
76 | this.indentString = indentString;
77 | }
78 |
79 | protected void serialize(final TagNode tagNode, final Writer writer) throws IOException {
80 | serializePrettyHtml(tagNode, writer, 0, false, true);
81 | }
82 |
83 | /**
84 | * @param level
85 | * @return Appropriate indentation for the specified depth.
86 | */
87 | private synchronized String getIndent(final int level) {
88 | final int size = indents.size();
89 | if (size <= level) {
90 | String prevIndent = size == 0 ? null : indents.get(size - 1);
91 | for (int i = size; i <= level; i++) {
92 | final String currIndent = prevIndent == null ? "" : prevIndent + indentString;
93 | indents.add(currIndent);
94 | prevIndent = currIndent;
95 | }
96 | }
97 |
98 | return indents.get(level);
99 | }
100 |
101 | private String getIndentedText(final String content, final int level) {
102 | final String indent = getIndent(level);
103 | final StringBuilder result = new StringBuilder(content.length());
104 | final StringTokenizer tokenizer = new StringTokenizer(content, "\n\r");
105 |
106 | while (tokenizer.hasMoreTokens()) {
107 | final String line = tokenizer.nextToken().trim();
108 | if (!"".equals(line)) {
109 | result.append(indent).append(line).append('\n');
110 | }
111 | }
112 |
113 | return result.toString();
114 | }
115 |
116 | private String getSingleLineOfChildren(final List children) {
117 | final StringBuilder result = new StringBuilder();
118 | final Iterator childrenIt = children.iterator();
119 | boolean isFirst = true;
120 |
121 | while (childrenIt.hasNext()) {
122 | final Object child = childrenIt.next();
123 |
124 | if (!(child instanceof ContentNode)) {
125 | return null;
126 | } else {
127 | String content = child.toString();
128 |
129 | // if first item trims it from left
130 | if (isFirst) {
131 | content = Utils.ltrim(content);
132 | }
133 |
134 | // if last item trims it from right
135 | if (!childrenIt.hasNext()) {
136 | content = Utils.rtrim(content);
137 | }
138 |
139 | if (content.indexOf('\n') >= 0 || content.indexOf('\r') >= 0) {
140 | return null;
141 | }
142 | result.append(content);
143 | }
144 |
145 | isFirst = false;
146 | }
147 |
148 | return result.toString();
149 | }
150 |
151 | protected void serializePrettyHtml(final TagNode tagNode, final Writer writer, final int level,
152 | final boolean isPreserveWhitespaces, final boolean isLastNewLine) throws IOException {
153 | final List tagChildren = tagNode.getChildren();
154 | final String tagName = tagNode.getName();
155 | final boolean isHeadlessNode = Utils.isEmptyString(tagName);
156 | final String indent = isHeadlessNode ? "" : getIndent(level);
157 |
158 | if (!isPreserveWhitespaces) {
159 | if (!isLastNewLine) {
160 | writer.write("\n");
161 | }
162 | writer.write(indent);
163 | }
164 | serializeOpenTag(tagNode, writer, true);
165 |
166 | final boolean preserveWhitespaces = isPreserveWhitespaces || "pre".equalsIgnoreCase(tagName);
167 |
168 | boolean lastWasNewLine = false;
169 |
170 | if (!isMinimizedTagSyntax(tagNode)) {
171 | final String singleLine = getSingleLineOfChildren(tagChildren);
172 | final boolean dontEscape = dontEscape(tagNode);
173 | if (!preserveWhitespaces && singleLine != null) {
174 | writer.write(!dontEscape(tagNode) ? escapeText(singleLine) : singleLine);
175 | } else {
176 | final Iterator childIterator = tagChildren.iterator();
177 | while (childIterator.hasNext()) {
178 | final Object child = childIterator.next();
179 | if (child instanceof TagNode) {
180 | serializePrettyHtml((TagNode) child, writer, isHeadlessNode ? level : level + 1,
181 | preserveWhitespaces, lastWasNewLine);
182 | lastWasNewLine = false;
183 | } else if (child instanceof ContentNode) {
184 | final String content = dontEscape ? child.toString() : escapeText(child.toString());
185 | if (content.length() > 0) {
186 | if (dontEscape || preserveWhitespaces) {
187 | writer.write(content);
188 | } else if (Character.isWhitespace(content.charAt(0))) {
189 | if (!lastWasNewLine) {
190 | writer.write("\n");
191 | lastWasNewLine = false;
192 | }
193 | if (content.trim().length() > 0) {
194 | writer.write(getIndentedText(Utils.rtrim(content), isHeadlessNode ? level
195 | : level + 1));
196 | } else {
197 | lastWasNewLine = true;
198 | }
199 | } else {
200 | if (content.trim().length() > 0) {
201 | writer.write(Utils.rtrim(content));
202 | }
203 | if (!childIterator.hasNext()) {
204 | writer.write("\n");
205 | lastWasNewLine = true;
206 | }
207 | }
208 | }
209 | } else if (child instanceof CommentNode) {
210 | if (!lastWasNewLine && !preserveWhitespaces) {
211 | writer.write("\n");
212 | lastWasNewLine = false;
213 | }
214 | final CommentNode commentNode = (CommentNode) child;
215 | final String content = commentNode.getCommentedContent();
216 | writer.write(dontEscape ? content
217 | : getIndentedText(content, isHeadlessNode ? level : level + 1));
218 | }
219 | }
220 | }
221 |
222 | if (singleLine == null && !preserveWhitespaces) {
223 | if (!lastWasNewLine) {
224 | writer.write("\n");
225 | }
226 | writer.write(indent);
227 | }
228 |
229 | serializeEndTag(tagNode, writer, false);
230 | }
231 | }
232 |
233 | }
234 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/CleanerProperties.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | /**
57 | * Properties defining cleaner's behaviour
58 | */
59 | public class CleanerProperties {
60 | public static final String BOOL_ATT_SELF = "self";
61 | public static final String BOOL_ATT_EMPTY = "empty";
62 | public static final String BOOL_ATT_TRUE = "true";
63 |
64 | private ITagInfoProvider tagInfoProvider = null;
65 | private boolean advancedXmlEscape = true;
66 | private boolean transResCharsToNCR = false;
67 | private boolean useCdataForScriptAndStyle = true;
68 | private boolean translateSpecialEntities = true;
69 | private boolean transSpecialEntitiesToNCR = false;
70 | private boolean recognizeUnicodeChars = true;
71 | private boolean omitUnknownTags = false;
72 | private boolean treatUnknownTagsAsContent = false;
73 | private boolean omitDeprecatedTags = false;
74 | private boolean treatDeprecatedTagsAsContent = false;
75 | private boolean omitComments = false;
76 | private boolean omitXmlDeclaration = false;
77 | private boolean omitDoctypeDeclaration = true;
78 | private boolean omitHtmlEnvelope = false;
79 | private boolean useEmptyElementTags = true;
80 | private boolean allowMultiWordAttributes = true;
81 | private boolean allowHtmlInsideAttributes = false;
82 | private boolean ignoreQuestAndExclam = true;
83 | private boolean namespacesAware = true;
84 | private String hyphenReplacementInComment = "=";
85 | private String booleanAttributeValues = BOOL_ATT_SELF;
86 | private String pruneTags = null;
87 |
88 | public String getBooleanAttributeValues() {
89 | return booleanAttributeValues;
90 | }
91 |
92 | public String getHyphenReplacementInComment() {
93 | return hyphenReplacementInComment;
94 | }
95 |
96 | public String getPruneTags() {
97 | return pruneTags;
98 | }
99 |
100 | public ITagInfoProvider getTagInfoProvider() {
101 | return tagInfoProvider;
102 | }
103 |
104 | public boolean isAdvancedXmlEscape() {
105 | return advancedXmlEscape;
106 | }
107 |
108 | public boolean isAllowHtmlInsideAttributes() {
109 | return allowHtmlInsideAttributes;
110 | }
111 |
112 | public boolean isAllowMultiWordAttributes() {
113 | return allowMultiWordAttributes;
114 | }
115 |
116 | public boolean isIgnoreQuestAndExclam() {
117 | return ignoreQuestAndExclam;
118 | }
119 |
120 | public boolean isNamespacesAware() {
121 | return namespacesAware;
122 | }
123 |
124 | public boolean isOmitComments() {
125 | return omitComments;
126 | }
127 |
128 | public boolean isOmitDeprecatedTags() {
129 | return omitDeprecatedTags;
130 | }
131 |
132 | public boolean isOmitDoctypeDeclaration() {
133 | return omitDoctypeDeclaration;
134 | }
135 |
136 | public boolean isOmitHtmlEnvelope() {
137 | return omitHtmlEnvelope;
138 | }
139 |
140 | public boolean isOmitUnknownTags() {
141 | return omitUnknownTags;
142 | }
143 |
144 | public boolean isOmitXmlDeclaration() {
145 | return omitXmlDeclaration;
146 | }
147 |
148 | public boolean isRecognizeUnicodeChars() {
149 | return recognizeUnicodeChars;
150 | }
151 |
152 | public boolean isTranslateSpecialEntities() {
153 | return translateSpecialEntities;
154 | }
155 |
156 | public boolean isTransResCharsToNCR() {
157 | return transResCharsToNCR;
158 | }
159 |
160 | public boolean isTransSpecialEntitiesToNCR() {
161 | return transSpecialEntitiesToNCR;
162 | }
163 |
164 | public boolean isTreatDeprecatedTagsAsContent() {
165 | return treatDeprecatedTagsAsContent;
166 | }
167 |
168 | public boolean isTreatUnknownTagsAsContent() {
169 | return treatUnknownTagsAsContent;
170 | }
171 |
172 | public boolean isUseCdataForScriptAndStyle() {
173 | return useCdataForScriptAndStyle;
174 | }
175 |
176 | public boolean isUseEmptyElementTags() {
177 | return useEmptyElementTags;
178 | }
179 |
180 | public void setAdvancedXmlEscape(final boolean advancedXmlEscape) {
181 | this.advancedXmlEscape = advancedXmlEscape;
182 | }
183 |
184 | public void setAllowHtmlInsideAttributes(final boolean allowHtmlInsideAttributes) {
185 | this.allowHtmlInsideAttributes = allowHtmlInsideAttributes;
186 | }
187 |
188 | public void setAllowMultiWordAttributes(final boolean allowMultiWordAttributes) {
189 | this.allowMultiWordAttributes = allowMultiWordAttributes;
190 | }
191 |
192 | public void setBooleanAttributeValues(final String booleanAttributeValues) {
193 | if (BOOL_ATT_SELF.equalsIgnoreCase(booleanAttributeValues)
194 | || BOOL_ATT_EMPTY.equalsIgnoreCase(booleanAttributeValues)
195 | || BOOL_ATT_TRUE.equalsIgnoreCase(booleanAttributeValues)) {
196 | this.booleanAttributeValues = booleanAttributeValues.toLowerCase();
197 | } else {
198 | this.booleanAttributeValues = BOOL_ATT_SELF;
199 | }
200 | }
201 |
202 | public void setHyphenReplacementInComment(final String hyphenReplacementInComment) {
203 | this.hyphenReplacementInComment = hyphenReplacementInComment;
204 | }
205 |
206 | public void setIgnoreQuestAndExclam(final boolean ignoreQuestAndExclam) {
207 | this.ignoreQuestAndExclam = ignoreQuestAndExclam;
208 | }
209 |
210 | public void setNamespacesAware(final boolean namespacesAware) {
211 | this.namespacesAware = namespacesAware;
212 | }
213 |
214 | public void setOmitComments(final boolean omitComments) {
215 | this.omitComments = omitComments;
216 | }
217 |
218 | public void setOmitDeprecatedTags(final boolean omitDeprecatedTags) {
219 | this.omitDeprecatedTags = omitDeprecatedTags;
220 | }
221 |
222 | public void setOmitDoctypeDeclaration(final boolean omitDoctypeDeclaration) {
223 | this.omitDoctypeDeclaration = omitDoctypeDeclaration;
224 | }
225 |
226 | public void setOmitHtmlEnvelope(final boolean omitHtmlEnvelope) {
227 | this.omitHtmlEnvelope = omitHtmlEnvelope;
228 | }
229 |
230 | public void setOmitUnknownTags(final boolean omitUnknownTags) {
231 | this.omitUnknownTags = omitUnknownTags;
232 | }
233 |
234 | public void setOmitXmlDeclaration(final boolean omitXmlDeclaration) {
235 | this.omitXmlDeclaration = omitXmlDeclaration;
236 | }
237 |
238 | public void setPruneTags(final String pruneTags) {
239 | this.pruneTags = pruneTags;
240 | }
241 |
242 | public void setRecognizeUnicodeChars(final boolean recognizeUnicodeChars) {
243 | this.recognizeUnicodeChars = recognizeUnicodeChars;
244 | }
245 |
246 | public void setTagInfoProvider(final ITagInfoProvider tagInfoProvider) {
247 | this.tagInfoProvider = tagInfoProvider;
248 | }
249 |
250 | public void setTranslateSpecialEntities(final boolean translateSpecialEntities) {
251 | this.translateSpecialEntities = translateSpecialEntities;
252 | }
253 |
254 | public void setTransResCharsToNCR(final boolean transResCharsToNCR) {
255 | this.transResCharsToNCR = transResCharsToNCR;
256 | }
257 |
258 | public void setTransSpecialEntitiesToNCR(final boolean transSpecialEntitiesToNCR) {
259 | this.transSpecialEntitiesToNCR = transSpecialEntitiesToNCR;
260 | }
261 |
262 | public void setTreatDeprecatedTagsAsContent(final boolean treatDeprecatedTagsAsContent) {
263 | this.treatDeprecatedTagsAsContent = treatDeprecatedTagsAsContent;
264 | }
265 |
266 | public void setTreatUnknownTagsAsContent(final boolean treatUnknownTagsAsContent) {
267 | this.treatUnknownTagsAsContent = treatUnknownTagsAsContent;
268 | }
269 |
270 | public void setUseCdataForScriptAndStyle(final boolean useCdataForScriptAndStyle) {
271 | this.useCdataForScriptAndStyle = useCdataForScriptAndStyle;
272 | }
273 |
274 | public void setUseEmptyElementTags(final boolean useEmptyElementTags) {
275 | this.useEmptyElementTags = useEmptyElementTags;
276 | }
277 | }
278 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/HtmlSerializer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.IOException;
57 | import java.io.Writer;
58 | import java.util.Map;
59 |
60 | /**
61 | *
62 | * Abstract HTML serializer - contains common logic for descendants.
63 | *
64 | */
65 | public abstract class HtmlSerializer extends Serializer {
66 | protected HtmlSerializer(final CleanerProperties props) {
67 | super(props);
68 | }
69 |
70 | protected boolean isMinimizedTagSyntax(final TagNode tagNode) {
71 | final TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName());
72 | return tagInfo != null && !tagNode.hasChildren() && tagInfo.isEmptyTag();
73 | }
74 |
75 | protected boolean dontEscape(TagNode tagNode) {
76 | return isScriptOrStyle(tagNode);
77 | }
78 |
79 | protected String escapeText(String s) {
80 | final boolean recognizeUnicodeChars = props.isRecognizeUnicodeChars();
81 | final boolean translateSpecialEntities = props.isTranslateSpecialEntities();
82 |
83 | if (s != null) {
84 | final int len = s.length();
85 | final StringBuilder result = new StringBuilder(len);
86 |
87 | for (int i = 0; i < len; i++) {
88 | char ch = s.charAt(i);
89 |
90 | if (ch == '&') {
91 | if (i < len - 2 && s.charAt(i + 1) == '#') {
92 | boolean isHex = Character.toLowerCase(s.charAt(i + 2)) == 'x';
93 | int charIndex = i + (isHex ? 3 : 2);
94 | int radix = isHex ? 16 : 10;
95 | String unicode = "";
96 | while (charIndex < len) {
97 | char currCh = s.charAt(charIndex);
98 | if (currCh == ';') {
99 | break;
100 | } else if (Utils.isValidInt(unicode + currCh, radix)) {
101 | unicode += currCh;
102 | charIndex++;
103 | } else {
104 | charIndex--;
105 | break;
106 | }
107 | }
108 |
109 | if (Utils.isValidInt(unicode, radix)) {
110 | char unicodeChar = (char) Integer.parseInt(unicode, radix);
111 | if (!Utils.isValidXmlChar(unicodeChar)) {
112 | i = charIndex;
113 | } else if (!Utils.isReservedXmlChar(unicodeChar)) {
114 | result.append(recognizeUnicodeChars ? String.valueOf(unicodeChar) : "" + unicode
115 | + ";");
116 | i = charIndex;
117 | } else {
118 | i = charIndex;
119 | result.append("" + unicode + ";");
120 | }
121 | } else {
122 | result.append(props.isTransResCharsToNCR() ? "" + (int) '&' + ";" : "&");
123 | }
124 | } else {
125 | // get minimal following sequence required to recognize
126 | // some special entitiy
127 | String seq = s.substring(i, i + Math.min(SpecialEntity.getMaxEntityLength() + 2, len - i));
128 | int semiIndex = seq.indexOf(';');
129 | if (semiIndex > 0) {
130 | String entityKey = seq.substring(1, semiIndex);
131 | SpecialEntity entity = SpecialEntity.getEntity(entityKey);
132 | if (entity != null) {
133 | if (translateSpecialEntities) {
134 | result.append(props.isTransSpecialEntitiesToNCR() ? entity.getDecimalNCR() : entity
135 | .getCharacter());
136 | } else {
137 | result.append(entity.getEscapedValue());
138 | }
139 |
140 | i += entityKey.length() + 1;
141 | continue;
142 | }
143 | }
144 |
145 | String sub = s.substring(i);
146 | boolean isReservedSeq = false;
147 | for (int j = 0; j < Utils.RESERVED_XML_CHARS_LIST.length; j++) {
148 | final char currentChar = Utils.RESERVED_XML_CHARS_LIST[j];
149 | seq = Utils.RESERVED_XML_CHARS[currentChar];
150 | if (sub.startsWith(seq)) {
151 | result.append(props.isTransResCharsToNCR() ? "" + (int) currentChar + ";" : seq);
152 | i += seq.length() - 1;
153 | isReservedSeq = true;
154 | break;
155 | }
156 | }
157 | if (!isReservedSeq) {
158 | result.append(props.isTransResCharsToNCR() ? "" + (int) '&' + ";" : "&");
159 | }
160 | }
161 | } else if (Utils.isReservedXmlChar(ch)) {
162 | result.append(props.isTransResCharsToNCR() ? "" + (int) ch + ";" : ch);
163 | } else {
164 | result.append(ch);
165 | }
166 | }
167 |
168 | return result.toString();
169 | }
170 |
171 | return null;
172 | }
173 |
174 | protected void serializeOpenTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
175 | String tagName = tagNode.getName();
176 |
177 | if (Utils.isEmptyString(tagName)) {
178 | return;
179 | }
180 |
181 | boolean nsAware = props.isNamespacesAware();
182 |
183 | if (!nsAware && Utils.getXmlNSPrefix(tagName) != null) {
184 | tagName = Utils.getXmlName(tagName);
185 | }
186 |
187 | writer.write("<" + tagName);
188 | for (Map.Entry entry : tagNode.getAttributes().entrySet()) {
189 | String attName = entry.getKey();
190 | if (!nsAware && Utils.getXmlNSPrefix(attName) != null) {
191 | attName = Utils.getXmlName(attName);
192 | }
193 | writer.write(" " + attName + "=\"" + escapeText(entry.getValue()) + "\"");
194 | }
195 |
196 | if (nsAware) {
197 | final Map nsDeclarations = tagNode.getNamespaceDeclarations();
198 | if (nsDeclarations != null) {
199 | for (Map.Entry entry : nsDeclarations.entrySet()) {
200 | String prefix = entry.getKey();
201 | String att = "xmlns";
202 | if (prefix.length() > 0) {
203 | att += ":" + prefix;
204 | }
205 | writer.write(" " + att + "=\"" + escapeText(entry.getValue()) + "\"");
206 | }
207 | }
208 | }
209 |
210 | if (isMinimizedTagSyntax(tagNode)) {
211 | writer.write(" />");
212 | if (newLine) {
213 | writer.write("\n");
214 | }
215 | } else {
216 | writer.write(">");
217 | }
218 | }
219 |
220 | protected void serializeEndTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
221 | String tagName = tagNode.getName();
222 |
223 | if (Utils.isEmptyString(tagName)) {
224 | return;
225 | }
226 |
227 | if (Utils.getXmlNSPrefix(tagName) != null && !props.isNamespacesAware()) {
228 | tagName = Utils.getXmlName(tagName);
229 | }
230 |
231 | writer.write("" + tagName + ">");
232 | if (newLine) {
233 | writer.write("\n");
234 | }
235 | }
236 |
237 | }
238 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/Serializer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.io.*;
57 | import java.util.*;
58 |
59 | /**
60 | *
61 | * Basic abstract serializer - contains common logic for descendants (methods
62 | * writeXXX()).
63 | *
64 | */
65 | public abstract class Serializer {
66 |
67 | /**
68 | * Used to implement serialization with missing envelope - omiting open and
69 | * close tags, just serialize children.
70 | */
71 | private class HeadlessTagNode extends TagNode {
72 | private HeadlessTagNode(final TagNode wrappedNode) {
73 | super("");
74 | getAttributes().putAll(wrappedNode.getAttributes());
75 | getChildren().addAll(wrappedNode.getChildren());
76 | setDocType(wrappedNode.getDocType());
77 | final Map nsDecls = getNamespaceDeclarations();
78 | if (nsDecls != null) {
79 | final Map wrappedNSDecls = wrappedNode.getNamespaceDeclarations();
80 | if (wrappedNSDecls != null) {
81 | nsDecls.putAll(wrappedNSDecls);
82 | }
83 | }
84 |
85 | }
86 | }
87 |
88 | protected CleanerProperties props;
89 |
90 | protected Serializer(final CleanerProperties props) {
91 | this.props = props;
92 | }
93 |
94 | /**
95 | * @param tagNode
96 | * Node to serialize to string
97 | * @return Output as string
98 | * @throws IOException
99 | */
100 | public String getAsString(final TagNode tagNode) throws IOException {
101 | return getAsString(tagNode, false);
102 | }
103 |
104 | /**
105 | * @param tagNode
106 | * Node to serialize to string
107 | * @param omitEnvelope
108 | * Tells whether to skip open and close tag of the node.
109 | * @return Output as string
110 | * @throws IOException
111 | */
112 | public String getAsString(final TagNode tagNode, final boolean omitEnvelope) throws IOException {
113 | return getAsString(tagNode, HtmlCleaner.DEFAULT_CHARSET, omitEnvelope);
114 | }
115 |
116 | /**
117 | * @param tagNode
118 | * Node to serialize to string
119 | * @param charset
120 | * Charset of the output - stands in xml declaration part
121 | * @return Output as string
122 | * @throws IOException
123 | */
124 | public String getAsString(final TagNode tagNode, final String charset) throws IOException {
125 | return getAsString(tagNode, charset, false);
126 | }
127 |
128 | /**
129 | * @param tagNode
130 | * Node to serialize to string
131 | * @param charset
132 | * Charset of the output - stands in xml declaration part
133 | * @param omitEnvelope
134 | * Tells whether to skip open and close tag of the node.
135 | * @return Output as string
136 | * @throws IOException
137 | */
138 | public String getAsString(final TagNode tagNode, final String charset, final boolean omitEnvelope)
139 | throws IOException {
140 | final StringWriter writer = new StringWriter();
141 | write(tagNode, writer, charset, omitEnvelope);
142 | return writer.getBuffer().toString();
143 | }
144 |
145 | protected boolean isScriptOrStyle(final TagNode tagNode) {
146 | final String tagName = tagNode.getName();
147 | return "script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName);
148 | }
149 |
150 | protected abstract void serialize(TagNode tagNode, Writer writer) throws IOException;
151 |
152 | /**
153 | * Writes specified node using specified writer.
154 | *
155 | * @param tagNode
156 | * Node to serialize.
157 | * @param writer
158 | * Writer instance
159 | * @param charset
160 | * Charset of the output
161 | * @throws IOException
162 | */
163 | public void write(final TagNode tagNode, final Writer writer, final String charset) throws IOException {
164 | write(tagNode, writer, charset, false);
165 | }
166 |
167 | /**
168 | * Writes specified node using specified writer.
169 | *
170 | * @param tagNode
171 | * Node to serialize.
172 | * @param writer
173 | * Writer instance
174 | * @param charset
175 | * Charset of the output
176 | * @param omitEnvelope
177 | * Tells whether to skip open and close tag of the node.
178 | * @throws IOException
179 | */
180 | public void write(TagNode tagNode, Writer writer, final String charset, final boolean omitEnvelope)
181 | throws IOException {
182 | if (omitEnvelope) {
183 | tagNode = new HeadlessTagNode(tagNode);
184 | }
185 | writer = new BufferedWriter(writer);
186 | if (!props.isOmitXmlDeclaration()) {
187 | String declaration = "";
192 | writer.write(declaration + "\n");
193 | }
194 |
195 | if (!props.isOmitDoctypeDeclaration()) {
196 | final DoctypeToken doctypeToken = tagNode.getDocType();
197 | if (doctypeToken != null) {
198 | doctypeToken.serialize(this, writer);
199 | }
200 | }
201 |
202 | serialize(tagNode, writer);
203 |
204 | writer.flush();
205 | writer.close();
206 | }
207 |
208 | /**
209 | * Writes specified TagNode to the file, using system default charset.
210 | *
211 | * @param tagNode
212 | * Node to be written
213 | * @param fileName
214 | * Output file name
215 | * @throws IOException
216 | */
217 | public void writeToFile(final TagNode tagNode, final String fileName) throws IOException {
218 | writeToFile(tagNode, fileName, false);
219 | }
220 |
221 | /**
222 | * Writes specified TagNode to the file, using specified charset and
223 | * optionally omits node envelope (skips open and close tags of the node).
224 | *
225 | * @param tagNode
226 | * Node to be written
227 | * @param fileName
228 | * Output file name
229 | * @param omitEnvelope
230 | * Tells whether to skip open and close tag of the node.
231 | * @throws IOException
232 | */
233 | public void writeToFile(final TagNode tagNode, final String fileName, final boolean omitEnvelope)
234 | throws IOException {
235 | writeToFile(tagNode, fileName, HtmlCleaner.DEFAULT_CHARSET, omitEnvelope);
236 | }
237 |
238 | /**
239 | * Writes specified TagNode to the file, using specified charset.
240 | *
241 | * @param tagNode
242 | * Node to be written
243 | * @param fileName
244 | * Output file name
245 | * @param charset
246 | * Charset of the output
247 | * @throws IOException
248 | */
249 | public void writeToFile(final TagNode tagNode, final String fileName, final String charset) throws IOException {
250 | writeToFile(tagNode, fileName, charset, false);
251 | }
252 |
253 | /**
254 | * Writes specified TagNode to the file, using specified charset and
255 | * optionally omits node envelope (skips open and close tags of the node).
256 | *
257 | * @param tagNode
258 | * Node to be written
259 | * @param fileName
260 | * Output file name
261 | * @param charset
262 | * Charset of the output
263 | * @param omitEnvelope
264 | * Tells whether to skip open and close tag of the node.
265 | * @throws IOException
266 | */
267 | public void writeToFile(final TagNode tagNode, final String fileName, final String charset,
268 | final boolean omitEnvelope) throws IOException {
269 | writeToStream(tagNode, new FileOutputStream(fileName), charset, omitEnvelope);
270 | }
271 |
272 | /**
273 | * Writes specified TagNode to the output stream, using system default
274 | * charset.
275 | *
276 | * @param tagNode
277 | * Node to be written
278 | * @param out
279 | * Output stream
280 | * @throws IOException
281 | */
282 | public void writeToStream(final TagNode tagNode, final OutputStream out) throws IOException {
283 | writeToStream(tagNode, out, false);
284 | }
285 |
286 | /**
287 | * Writes specified TagNode to the output stream, using system default
288 | * charset and optionally omits node envelope (skips open and close tags of
289 | * the node).
290 | *
291 | * @param tagNode
292 | * Node to be written
293 | * @param out
294 | * Output stream
295 | * @param omitEnvelope
296 | * Tells whether to skip open and close tag of the node.
297 | * @throws IOException
298 | */
299 | public void writeToStream(final TagNode tagNode, final OutputStream out, final boolean omitEnvelope)
300 | throws IOException {
301 | writeToStream(tagNode, out, HtmlCleaner.DEFAULT_CHARSET, omitEnvelope);
302 | }
303 |
304 | /**
305 | * Writes specified TagNode to the output stream, using specified charset.
306 | *
307 | * @param tagNode
308 | * Node to be written
309 | * @param out
310 | * Output stream
311 | * @param charset
312 | * Charset of the output
313 | * @throws IOException
314 | */
315 | public void writeToStream(final TagNode tagNode, final OutputStream out, final String charset) throws IOException {
316 | writeToStream(tagNode, out, charset, false);
317 | }
318 |
319 | /**
320 | * Writes specified TagNode to the output stream, using specified charset
321 | * and optionally omits node envelope (skips open and close tags of the
322 | * node).
323 | *
324 | * @param tagNode
325 | * Node to be written
326 | * @param out
327 | * Output stream
328 | * @param charset
329 | * Charset of the output
330 | * @param omitEnvelope
331 | * Tells whether to skip open and close tag of the node.
332 | * @throws IOException
333 | */
334 | public void writeToStream(final TagNode tagNode, final OutputStream out, final String charset,
335 | final boolean omitEnvelope) throws IOException {
336 | write(tagNode, new OutputStreamWriter(out, charset), charset, omitEnvelope);
337 | }
338 | }
339 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/TagInfo.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011-2013 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | package org.htmlcleaner;
18 |
19 | import java.util.HashSet;
20 | import java.util.Set;
21 | import java.util.StringTokenizer;
22 |
23 | /**
24 | *
25 | * Class contains information about single HTML tag.
26 | * It also contains rules for tag balancing. For each tag, list of dependant
27 | * tags may be defined. There are several kinds of dependancies used to reorder
28 | * tags:
29 | *
30 | *
31 | * fatal tags - required outer tag - the tag will be ignored during parsing
32 | * (will be skipped) if this fatal tag is missing. For example, most web
33 | * browsers ignore elements TD, TR, TBODY if they are not in the context of
34 | * TABLE tag.
35 | *
36 | * required enclosing tags - if there is no such, it is implicitely created. For
37 | * example if TD is out of TR - open TR is created before.
38 | *
39 | * forbidden tags - it is not allowed to occure inside - for example FORM cannot
40 | * be inside other FORM and it will be ignored during cleanup.
41 | *
42 | * allowed children tags - for example TR allowes TD and TH. If there are some
43 | * dependant allowed tags defined then cleaner ignores other tags, treating them
44 | * as unallowed, unless they are in some other relationship with this tag.
45 | *
46 | * higher level tags - for example for TR higher tags are THEAD, TBODY, TFOOT.
47 | *
48 | * tags that must be closed and copied - for example, in
49 | * <a href="#"><div>.... tag A must be closed before
50 | * DIV but copied again inside DIV.
51 | *
52 | * tags that must be closed before closing this tag and copied again after - for
53 | * example, in <i><b>at</i> first</b> text
54 | * tag B must be closed before closing I, but it must be copied again after
55 | * resulting finally in sequence:
56 | * <i><b>at</b></i><b> first</b> text
57 | * .
58 | *
59 | *
60 | *
61 | *
62 | * Tag TR for instance (table row) may define the following dependancies:
63 | *
64 | * fatal tag is table
65 | * required enclosing tag is tbody
66 | * allowed children tags are td,th
67 | * higher level tags are thead,tfoot
68 | * tags that muste be closed before are
69 | * tr,td,th,caption,colgroup
70 | *
71 | * meaning the following:
72 | *
73 | * tr must be in context of table, otherwise it
74 | * will be ignored,
75 | * tr may can be directly inside tbody,
76 | * tfoot and thead, otherwise tbody will
77 | * be implicitely created in front of it.
78 | * tr can contain td and th, all
79 | * other tags and content will be pushed out of current limiting context, in the
80 | * case of html tables, in front of enclosing table tag.
81 | * if previous open tag is one of tr, caption or
82 | * colgroup, it will be implicitely closed.
83 | *
84 | *
85 | */
86 | public class TagInfo {
87 |
88 | protected static final int BODY = 2;
89 | protected static final int CONTENT_ALL = 0;
90 | protected static final int CONTENT_NONE = 1;
91 |
92 | protected static final int CONTENT_TEXT = 2;
93 | protected static final int HEAD = 1;
94 | protected static final int HEAD_AND_BODY = 0;
95 |
96 | private int belongsTo = BODY;
97 | private Set childTags = new HashSet();
98 | final private int contentType;
99 | private Set continueAfterTags = new HashSet();
100 | private Set copyTags = new HashSet();
101 | private boolean deprecated = false;
102 | private String fatalTag = null;
103 | private Set higherTags = new HashSet();
104 | private boolean ignorePermitted = false;
105 | private Set mustCloseTags = new HashSet();
106 | private String name;
107 | private Set permittedTags = new HashSet();
108 | private String requiredParent = null;
109 | private boolean unique = false;
110 |
111 | public TagInfo(final String name, final int contentType, final int belongsTo, final boolean depricated,
112 | final boolean unique, final boolean ignorePermitted) {
113 | this.name = name;
114 | this.contentType = contentType;
115 | this.belongsTo = belongsTo;
116 | this.deprecated = depricated;
117 | this.unique = unique;
118 | this.ignorePermitted = ignorePermitted;
119 | }
120 |
121 | public boolean allowsAnything() {
122 | return CONTENT_ALL == contentType && childTags.isEmpty();
123 | }
124 |
125 | public boolean allowsBody() {
126 | return CONTENT_NONE != contentType;
127 | }
128 |
129 | public boolean allowsItem(final BaseToken token) {
130 | if (contentType != CONTENT_NONE && token instanceof TagToken) {
131 | final TagToken tagToken = (TagToken) token;
132 | final String tagName = tagToken.getName();
133 | if ("script".equals(tagName)) {
134 | return true;
135 | }
136 | }
137 |
138 | if (CONTENT_ALL == contentType) {
139 | if (!childTags.isEmpty()) {
140 | return token instanceof TagToken ? childTags.contains(((TagToken) token).getName()) : false;
141 | } else if (!permittedTags.isEmpty()) {
142 | return token instanceof TagToken ? !permittedTags.contains(((TagToken) token).getName()) : true;
143 | }
144 | return true;
145 | } else if (CONTENT_TEXT == contentType) {
146 | return !(token instanceof TagToken);
147 | }
148 |
149 | return false;
150 | }
151 |
152 | public void defineAllowedChildrenTags(final String commaSeparatedListOfTags) {
153 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
154 | while (tokenizer.hasMoreTokens()) {
155 | final String currTag = tokenizer.nextToken();
156 | this.childTags.add(currTag);
157 | }
158 | }
159 |
160 | public void defineCloseBeforeCopyInsideTags(final String commaSeparatedListOfTags) {
161 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
162 | while (tokenizer.hasMoreTokens()) {
163 | final String currTag = tokenizer.nextToken();
164 | this.copyTags.add(currTag);
165 | this.mustCloseTags.add(currTag);
166 | }
167 | }
168 |
169 | public void defineCloseBeforeTags(final String commaSeparatedListOfTags) {
170 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
171 | while (tokenizer.hasMoreTokens()) {
172 | final String currTag = tokenizer.nextToken();
173 | this.mustCloseTags.add(currTag);
174 | }
175 | }
176 |
177 | public void defineCloseInsideCopyAfterTags(final String commaSeparatedListOfTags) {
178 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
179 | while (tokenizer.hasMoreTokens()) {
180 | final String currTag = tokenizer.nextToken();
181 | this.continueAfterTags.add(currTag);
182 | }
183 | }
184 |
185 | public void defineFatalTags(final String commaSeparatedListOfTags) {
186 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
187 | while (tokenizer.hasMoreTokens()) {
188 | final String currTag = tokenizer.nextToken();
189 | this.fatalTag = currTag;
190 | this.higherTags.add(currTag);
191 | }
192 | }
193 |
194 | public void defineForbiddenTags(final String commaSeparatedListOfTags) {
195 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
196 | while (tokenizer.hasMoreTokens()) {
197 | final String currTag = tokenizer.nextToken();
198 | this.permittedTags.add(currTag);
199 | }
200 | }
201 |
202 | public void defineHigherLevelTags(final String commaSeparatedListOfTags) {
203 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
204 | while (tokenizer.hasMoreTokens()) {
205 | final String currTag = tokenizer.nextToken();
206 | this.higherTags.add(currTag);
207 | }
208 | }
209 |
210 | public void defineRequiredEnclosingTags(final String commaSeparatedListOfTags) {
211 | final StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
212 | while (tokenizer.hasMoreTokens()) {
213 | final String currTag = tokenizer.nextToken();
214 | this.requiredParent = currTag;
215 | this.higherTags.add(currTag);
216 | }
217 | }
218 |
219 | public int getBelongsTo() {
220 | return belongsTo;
221 | }
222 |
223 | public Set getChildTags() {
224 | return childTags;
225 | }
226 |
227 | public int getContentType() {
228 | return contentType;
229 | }
230 |
231 | public Set getContinueAfterTags() {
232 | return continueAfterTags;
233 | }
234 |
235 | public Set getCopyTags() {
236 | return copyTags;
237 | }
238 |
239 | public String getFatalTag() {
240 | return fatalTag;
241 | }
242 |
243 | public Set getHigherTags() {
244 | return higherTags;
245 | }
246 |
247 | public Set getMustCloseTags() {
248 | return mustCloseTags;
249 | }
250 |
251 | public String getName() {
252 | return name;
253 | }
254 |
255 | public Set getPermittedTags() {
256 | return permittedTags;
257 | }
258 |
259 | public String getRequiredParent() {
260 | return requiredParent;
261 | }
262 |
263 | public boolean hasCopyTags() {
264 | return !copyTags.isEmpty();
265 | }
266 |
267 | public boolean hasPermittedTags() {
268 | return !permittedTags.isEmpty();
269 | }
270 |
271 | public boolean isContinueAfter(final String tagName) {
272 | return continueAfterTags.contains(tagName);
273 | }
274 |
275 | public boolean isCopy(final String tagName) {
276 | return copyTags.contains(tagName);
277 | }
278 |
279 | public boolean isDeprecated() {
280 | return deprecated;
281 | }
282 |
283 | public boolean isEmptyTag() {
284 | return CONTENT_NONE == contentType;
285 | }
286 |
287 | public boolean isHeadAndBodyTag() {
288 | return belongsTo == HEAD || belongsTo == HEAD_AND_BODY;
289 | }
290 |
291 | public boolean isHeadTag() {
292 | return belongsTo == HEAD;
293 | }
294 |
295 | public boolean isHigher(final String tagName) {
296 | return higherTags.contains(tagName);
297 | }
298 |
299 | public boolean isIgnorePermitted() {
300 | return ignorePermitted;
301 | }
302 |
303 | public boolean isMustCloseTag(final TagInfo tagInfo) {
304 | if (tagInfo != null) {
305 | return mustCloseTags.contains(tagInfo.getName()) || tagInfo.contentType == CONTENT_TEXT;
306 | }
307 |
308 | return false;
309 | }
310 |
311 | public boolean isUnique() {
312 | return unique;
313 | }
314 |
315 | public void setBelongsTo(final int belongsTo) {
316 | this.belongsTo = belongsTo;
317 | }
318 |
319 | public void setChildTags(final Set childTags) {
320 | this.childTags = childTags;
321 | }
322 |
323 | // other functionality
324 |
325 | public void setContinueAfterTags(final Set continueAfterTags) {
326 | this.continueAfterTags = continueAfterTags;
327 | }
328 |
329 | public void setCopyTags(final Set copyTags) {
330 | this.copyTags = copyTags;
331 | }
332 |
333 | public void setDeprecated(final boolean deprecated) {
334 | this.deprecated = deprecated;
335 | }
336 |
337 | public void setFatalTag(final String fatalTag) {
338 | this.fatalTag = fatalTag;
339 | }
340 |
341 | public void setHigherTags(final Set higherTags) {
342 | this.higherTags = higherTags;
343 | }
344 |
345 | public void setIgnorePermitted(final boolean ignorePermitted) {
346 | this.ignorePermitted = ignorePermitted;
347 | }
348 |
349 | public void setMustCloseTags(final Set mustCloseTags) {
350 | this.mustCloseTags = mustCloseTags;
351 | }
352 |
353 | public void setName(final String name) {
354 | this.name = name;
355 | }
356 |
357 | public void setPermittedTags(final Set permittedTags) {
358 | this.permittedTags = permittedTags;
359 | }
360 |
361 | public void setRequiredParent(final String requiredParent) {
362 | this.requiredParent = requiredParent;
363 | }
364 |
365 | public void setUnique(final boolean unique) {
366 | this.unique = unique;
367 | }
368 | }
369 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/SpecialEntity.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.util.HashMap;
57 | import java.util.Map;
58 |
59 | /**
60 | *
61 | * This class contains map with special entities used in HTML and their
62 | * unicodes.
63 | *
64 | */
65 | final public class SpecialEntity {
66 |
67 | private static Map entities = new HashMap();
68 |
69 | private static int maxEntityLength = 0;
70 |
71 | static {
72 | addEntity("nbsp", 160);
73 | addEntity("iexcl", 161);
74 | addEntity("cent", 162);
75 | addEntity("pound", 163);
76 | addEntity("curren", 164);
77 | addEntity("yen", 165);
78 | addEntity("brvbar", 166);
79 | addEntity("sect", 167);
80 | addEntity("uml", 168);
81 | addEntity("copy", 169);
82 | addEntity("ordf", 170);
83 | addEntity("laquo", 171);
84 | addEntity("not", 172);
85 | addEntity("shy", 173);
86 | addEntity("reg", 174);
87 | addEntity("macr", 175);
88 | addEntity("deg", 176);
89 | addEntity("plusmn", 177);
90 | addEntity("sup2", 178);
91 | addEntity("sup3", 179);
92 | addEntity("acute", 180);
93 | addEntity("micro", 181);
94 | addEntity("para", 182);
95 | addEntity("middot", 183);
96 | addEntity("cedil", 184);
97 | addEntity("sup1", 185);
98 | addEntity("ordm", 186);
99 | addEntity("raquo", 187);
100 | addEntity("frac14", 188);
101 | addEntity("frac12", 189);
102 | addEntity("frac34", 190);
103 | addEntity("iquest", 191);
104 | addEntity("Agrave", 192);
105 | addEntity("Aacute", 193);
106 | addEntity("Acirc", 194);
107 | addEntity("Atilde", 195);
108 | addEntity("Auml", 196);
109 | addEntity("Aring", 197);
110 | addEntity("AElig", 198);
111 | addEntity("Ccedil", 199);
112 | addEntity("Egrave", 200);
113 | addEntity("Eacute", 201);
114 | addEntity("Ecirc", 202);
115 | addEntity("Euml", 203);
116 | addEntity("Igrave", 204);
117 | addEntity("Iacute", 205);
118 | addEntity("Icirc", 206);
119 | addEntity("Iuml", 207);
120 | addEntity("ETH", 208);
121 | addEntity("Ntilde", 209);
122 | addEntity("Ograve", 210);
123 | addEntity("Oacute", 211);
124 | addEntity("Ocirc", 212);
125 | addEntity("Otilde", 213);
126 | addEntity("Ouml", 214);
127 | addEntity("times", 215);
128 | addEntity("Oslash", 216);
129 | addEntity("Ugrave", 217);
130 | addEntity("Uacute", 218);
131 | addEntity("Ucirc", 219);
132 | addEntity("Uuml", 220);
133 | addEntity("Yacute", 221);
134 | addEntity("THORN", 222);
135 | addEntity("szlig", 223);
136 | addEntity("agrave", 224);
137 | addEntity("aacute", 225);
138 | addEntity("acirc", 226);
139 | addEntity("atilde", 227);
140 | addEntity("auml", 228);
141 | addEntity("aring", 229);
142 | addEntity("aelig", 230);
143 | addEntity("ccedil", 231);
144 | addEntity("egrave", 232);
145 | addEntity("eacute", 233);
146 | addEntity("ecirc", 234);
147 | addEntity("euml", 235);
148 | addEntity("igrave", 236);
149 | addEntity("iacute", 237);
150 | addEntity("icirc", 238);
151 | addEntity("iuml", 239);
152 | addEntity("eth", 240);
153 | addEntity("ntilde", 241);
154 | addEntity("ograve", 242);
155 | addEntity("oacute", 243);
156 | addEntity("ocirc", 244);
157 | addEntity("otilde", 245);
158 | addEntity("ouml", 246);
159 | addEntity("divide", 247);
160 | addEntity("oslash", 248);
161 | addEntity("ugrave", 249);
162 | addEntity("uacute", 250);
163 | addEntity("ucirc", 251);
164 | addEntity("uuml", 252);
165 | addEntity("yacute", 253);
166 | addEntity("thorn", 254);
167 | addEntity("yuml", 255);
168 | addEntity("OElig", 338);
169 | addEntity("oelig", 339);
170 | addEntity("Scaron", 352);
171 | addEntity("scaron", 353);
172 | addEntity("Yuml", 376);
173 | addEntity("fnof", 402);
174 | addEntity("circ", 710);
175 | addEntity("tilde", 732);
176 |
177 | // Greek letters
178 | addEntity("Alpha", 913);
179 | addEntity("Beta", 914);
180 | addEntity("Gamma", 915);
181 | addEntity("Delta", 916);
182 | addEntity("Epsilon", 917);
183 | addEntity("Zeta", 918);
184 | addEntity("Eta", 919);
185 | addEntity("Theta", 920);
186 | addEntity("Iota", 921);
187 | addEntity("Kappa", 922);
188 | addEntity("Lambda", 923);
189 | addEntity("Mu", 924);
190 | addEntity("Nu", 925);
191 | addEntity("Xi", 926);
192 | addEntity("Omicron", 927);
193 | addEntity("Pi", 928);
194 | addEntity("Rho", 929);
195 | addEntity("Sigma", 931);
196 | addEntity("Tau", 932);
197 | addEntity("Upsilon", 933);
198 | addEntity("Phi", 934);
199 | addEntity("Chi", 935);
200 | addEntity("Psi", 936);
201 | addEntity("Omega", 937);
202 | addEntity("alpha", 945);
203 | addEntity("beta", 946);
204 | addEntity("gamma", 947);
205 | addEntity("delta", 948);
206 | addEntity("epsilon", 949);
207 | addEntity("zeta", 950);
208 | addEntity("eta", 951);
209 | addEntity("theta", 952);
210 | addEntity("iota", 953);
211 | addEntity("kappa", 954);
212 | addEntity("lambda", 955);
213 | addEntity("mu", 956);
214 | addEntity("nu", 957);
215 | addEntity("xi", 958);
216 | addEntity("omicron", 959);
217 | addEntity("pi", 960);
218 | addEntity("rho", 961);
219 | addEntity("sigmaf", 962);
220 | addEntity("sigma", 963);
221 | addEntity("tau", 964);
222 | addEntity("upsilon", 965);
223 | addEntity("phi", 966);
224 | addEntity("chi", 967);
225 | addEntity("psi", 968);
226 | addEntity("omega", 969);
227 | addEntity("thetasym", 977);
228 | addEntity("upsih", 978);
229 | addEntity("piv", 982);
230 |
231 | addEntity("ensp", 8194);
232 | addEntity("emsp", 8195);
233 | addEntity("thinsp", 8201);
234 | addEntity("zwnj", 8204);
235 | addEntity("zwj", 8205);
236 | addEntity("lrm", 8206);
237 | addEntity("rlm", 8207);
238 | addEntity("ndash", 8211);
239 | addEntity("mdash", 8212);
240 | addEntity("lsquo", 8216);
241 | addEntity("rsquo", 8217);
242 | addEntity("sbquo", 8218);
243 | addEntity("ldquo", 8220);
244 | addEntity("rdquo", 8221);
245 | addEntity("bdquo", 8222);
246 | addEntity("dagger", 8224);
247 | addEntity("Dagger", 8225);
248 | addEntity("bull", 8226);
249 |
250 | addEntity("hellip", 8230);
251 | addEntity("permil", 8240);
252 | addEntity("prime", 8242);
253 | addEntity("Prime", 8243);
254 | addEntity("lsaquo", 8249);
255 | addEntity("rsaquo", 8250);
256 | addEntity("oline", 8254);
257 | addEntity("frasl", 8260);
258 | addEntity("euro", 8364);
259 | addEntity("image", 8465);
260 | addEntity("weierp", 8472);
261 | addEntity("real", 8476);
262 | addEntity("trade", 8482);
263 | addEntity("alefsym", 8501);
264 | addEntity("larr", 8592);
265 | addEntity("uarr", 8593);
266 | addEntity("rarr", 8594);
267 | addEntity("darr", 8595);
268 | addEntity("harr", 8596);
269 | addEntity("crarr", 8629);
270 | addEntity("lArr", 8656);
271 | addEntity("uArr", 8657);
272 | addEntity("rArr", 8658);
273 | addEntity("dArr", 8659);
274 | addEntity("hArr", 8660);
275 |
276 | // math symbols
277 | addEntity("forall", 8704);
278 | addEntity("part", 8706);
279 | addEntity("exist", 8707);
280 | addEntity("empty", 8709);
281 | addEntity("nabla", 8711);
282 | addEntity("isin", 8712);
283 | addEntity("notin", 8713);
284 | addEntity("ni", 8715);
285 | addEntity("prod", 8719);
286 | addEntity("sum", 8721);
287 | addEntity("minus", 8722);
288 | addEntity("lowast", 8727);
289 | addEntity("radic", 8730);
290 | addEntity("prop", 8733);
291 | addEntity("infin", 8734);
292 | addEntity("ang", 8736);
293 | addEntity("and", 8743);
294 | addEntity("or", 8744);
295 | addEntity("cap", 8745);
296 | addEntity("cup", 8746);
297 | addEntity("int", 8747);
298 | addEntity("there4", 8756);
299 | addEntity("sim", 8764);
300 | addEntity("cong", 8773);
301 | addEntity("asymp", 8776);
302 | addEntity("ne", 8800);
303 | addEntity("equiv", 8801);
304 | addEntity("le", 8804);
305 | addEntity("ge", 8805);
306 | addEntity("sub", 8834);
307 | addEntity("sup", 8835);
308 | addEntity("nsub", 8836);
309 | addEntity("sube", 8838);
310 | addEntity("supe", 8839);
311 | addEntity("oplus", 8853);
312 | addEntity("otimes", 8855);
313 | addEntity("perp", 8869);
314 | addEntity("sdot", 8901);
315 | addEntity("lceil", 8968);
316 | addEntity("rceil", 8969);
317 | addEntity("lfloor", 8970);
318 | addEntity("rfloor", 8971);
319 | addEntity("lang", 9001);
320 | addEntity("rang", 9002);
321 | addEntity("loz", 9674);
322 | addEntity("spades", 9824);
323 | addEntity("clubs", 9827);
324 | addEntity("hearts", 9829);
325 | addEntity("diams", 9830);
326 | }
327 |
328 | /**
329 | * Add new entity to the set.
330 | *
331 | * @param entityName
332 | * Entity name, for example "pound"
333 | * @param intCode
334 | * Unicode of the entity, for example 163
335 | *
336 | * @throws org.htmlcleaner.HtmlCleanerException
337 | */
338 | public static void addEntity(final String entityName, final int intCode) throws HtmlCleanerException {
339 | if (entities.containsKey(entityName)) {
340 | throw new HtmlCleanerException("Entity \"" + entityName + "\" is already defined!");
341 | }
342 | entities.put(entityName, new SpecialEntity(entityName, intCode));
343 | final int entityNameLen = entityName.length();
344 | if (entityNameLen > maxEntityLength) {
345 | maxEntityLength = entityNameLen;
346 | }
347 | }
348 |
349 | public static SpecialEntity getEntity(final String key) {
350 | return entities.get(key);
351 | }
352 |
353 | public static int getMaxEntityLength() {
354 | return maxEntityLength;
355 | }
356 |
357 | final private String key;
358 | final private int intCode;
359 |
360 | private SpecialEntity(final String key, final int intCode) {
361 | this.key = key;
362 | this.intCode = intCode;
363 | }
364 |
365 | public char getCharacter() {
366 | return (char) intCode;
367 | }
368 |
369 | /**
370 | * @return Numeric Character Reference in decimal format
371 | */
372 | public String getDecimalNCR() {
373 | return "" + intCode + ";";
374 | }
375 |
376 | /**
377 | * @return Escaped value of the entity
378 | */
379 | public String getEscapedValue() {
380 | return "&" + key + ";";
381 | }
382 |
383 | /**
384 | * @return Numeric Character Reference in hex format
385 | */
386 | public String getHexNCR() {
387 | return "" + Integer.toHexString(intCode) + ";";
388 | }
389 |
390 | public int getIntCode() {
391 | return intCode;
392 | }
393 |
394 | public String getKey() {
395 | return key;
396 | }
397 |
398 | }
399 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/Utils.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
import java.io.*;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Locale;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
63 |
64 | /**
65 | *
66 | * Common utilities.
67 | *
68 | */
69 | final public class Utils {
70 | final public static int RESERVED_XML_CHARS_SIZE = 128;
71 | final public static String VAR_START = "${";
72 | final public static String VAR_END = "}";
73 |
74 | public static final String RESERVED_XML_CHARS[] = new String[RESERVED_XML_CHARS_SIZE];
75 | public static final char RESERVED_XML_CHARS_LIST[] = { '&', '<', '>', '\"', '\'' };
76 |
77 | static {
78 | RESERVED_XML_CHARS['&'] = "&";
79 | RESERVED_XML_CHARS['<'] = "<";
80 | RESERVED_XML_CHARS['>'] = ">";
81 | RESERVED_XML_CHARS['\"'] = """;
82 | RESERVED_XML_CHARS['\''] = "'";
83 | }
84 |
85 | /**
86 | * Trims specified string from left.
87 | *
88 | * @param s
89 | */
90 | public static String ltrim(final String s) {
91 | if (s == null) {
92 | return null;
93 | }
94 |
95 | int index = 0;
96 | final int len = s.length();
97 |
98 | while (index < len && Character.isWhitespace(s.charAt(index))) {
99 | index++;
100 | }
101 |
102 | return (index >= len) ? "" : s.substring(index);
103 | }
104 |
105 | /**
106 | * Trims specified string from right.
107 | *
108 | * @param s
109 | */
110 | public static String rtrim(final String s) {
111 | if (s == null) {
112 | return null;
113 | }
114 |
115 | final int len = s.length();
116 | int index = len;
117 |
118 | while (index > 0 && Character.isWhitespace(s.charAt(index - 1))) {
119 | index--;
120 | }
121 |
122 | return (index <= 0) ? "" : s.substring(0, index);
123 | }
124 |
125 | public static String getCharsetFromContentTypeString(final String contentType) {
126 | if (contentType != null) {
127 | final String pattern = "charset=([a-z\\d\\-]*)";
128 | final Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(contentType);
129 | if (matcher.find()) {
130 | final String charset = matcher.group(1);
131 | if (Charset.isSupported(charset)) {
132 | return charset;
133 | }
134 | }
135 | }
136 |
137 | return null;
138 | }
139 |
140 | public static String getCharsetFromContent(final URL url) throws IOException {
141 | final InputStream stream = url.openStream();
142 | final byte chunk[] = new byte[2048];
143 | final int bytesRead = stream.read(chunk);
144 | if (bytesRead > 0) {
145 | final String startContent = new String(chunk);
146 | final String pattern = "\\ ]";
147 | final Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(startContent);
148 | if (matcher.find()) {
149 | final String charset = matcher.group(1);
150 | if (Charset.isSupported(charset)) {
151 | return charset;
152 | }
153 | }
154 | }
155 |
156 | return null;
157 | }
158 |
159 | public static boolean isHexadecimalDigit(final char ch) {
160 | return Character.isDigit(ch) || ch == 'A' || ch == 'a' || ch == 'B' || ch == 'b' || ch == 'C' || ch == 'c'
161 | || ch == 'D' || ch == 'd' || ch == 'E' || ch == 'e' || ch == 'F' || ch == 'f';
162 | }
163 |
164 | public static boolean isValidXmlChar(final char ch) {
165 | return ((ch >= 0x20) && (ch <= 0xD7FF)) || (ch == 0x9) || (ch == 0xA) || (ch == 0xD)
166 | || ((ch >= 0xE000) && (ch <= 0xFFFD)) || ((ch >= 0x10000) && (ch <= 0x10FFFF));
167 | }
168 |
169 | public static boolean isReservedXmlChar(final char ch) {
170 | return (ch < RESERVED_XML_CHARS_SIZE && RESERVED_XML_CHARS[ch] != null);
171 | }
172 |
173 | public static boolean isValidInt(final String s, final int radix) {
174 | try {
175 | Integer.parseInt(s, radix);
176 | return true;
177 | } catch (NumberFormatException e) {
178 | return false;
179 | }
180 | }
181 |
182 | /**
183 | * Escapes XML string.
184 | *
185 | * @param s
186 | * String to be escaped
187 | * @param props
188 | * Cleaner properties gover affect escaping behaviour
189 | * @param isDomCreation
190 | * Tells if escaped content will be part of the DOM
191 | */
192 | public static String escapeXml(final String s, final CleanerProperties props, final boolean isDomCreation) {
193 | final boolean advanced = props.isAdvancedXmlEscape();
194 | final boolean recognizeUnicodeChars = props.isRecognizeUnicodeChars();
195 | final boolean translateSpecialEntities = props.isTranslateSpecialEntities();
196 |
197 | if (s != null) {
198 | final int len = s.length();
199 | final StringBuilder result = new StringBuilder(len);
200 |
201 | for (int i = 0; i < len; i++) {
202 | final char ch = s.charAt(i);
203 |
204 | if (ch == '&') {
205 | if ((advanced || recognizeUnicodeChars) && (i < len - 2) && (s.charAt(i + 1) == '#')) {
206 | final boolean isHex = Character.toLowerCase(s.charAt(i + 2)) == 'x';
207 | int charIndex = i + (isHex ? 3 : 2);
208 | final int radix = isHex ? 16 : 10;
209 | String unicode = "";
210 | while (charIndex < len) {
211 | final char currCh = s.charAt(charIndex);
212 | if (currCh == ';') {
213 | break;
214 | } else if (isValidInt(unicode + currCh, radix)) {
215 | unicode += currCh;
216 | charIndex++;
217 | } else {
218 | charIndex--;
219 | break;
220 | }
221 | }
222 |
223 | if (isValidInt(unicode, radix)) {
224 | final char unicodeChar = (char) Integer.parseInt(unicode, radix);
225 | if (!isValidXmlChar(unicodeChar)) {
226 | i = charIndex;
227 | } else if (!isReservedXmlChar(unicodeChar)) {
228 | result.append(recognizeUnicodeChars ? String.valueOf(unicodeChar) : "" + unicode
229 | + ";");
230 | i = charIndex;
231 | } else {
232 | i = charIndex;
233 | result.append("" + (isHex ? "x" : "") + unicode + ";");
234 | }
235 | } else {
236 | result.append("&");
237 | }
238 | } else {
239 | if (translateSpecialEntities) {
240 | // get minimal following sequence required to
241 | // recognize some special entitiy
242 | final String seq = s.substring(i,
243 | i + Math.min(SpecialEntity.getMaxEntityLength() + 2, len - i));
244 | final int semiIndex = seq.indexOf(';');
245 | if (semiIndex > 0) {
246 | final String entityKey = seq.substring(1, semiIndex);
247 | final SpecialEntity entity = SpecialEntity.getEntity(entityKey);
248 | if (entity != null) {
249 | result.append(props.isTransSpecialEntitiesToNCR() ? entity.getDecimalNCR() : entity
250 | .getCharacter());
251 | i += entityKey.length() + 1;
252 | continue;
253 | }
254 | }
255 | }
256 |
257 | if (advanced) {
258 | final String sub = s.substring(i);
259 | boolean isReservedSeq = false;
260 | for (int j = 0; j < RESERVED_XML_CHARS_LIST.length; j++) {
261 | final char currentChar = RESERVED_XML_CHARS_LIST[j];
262 | final String seq = RESERVED_XML_CHARS[currentChar];
263 | if (sub.startsWith(seq)) {
264 | result.append(isDomCreation ? currentChar : (props.isTransResCharsToNCR() ? ""
265 | + (int) currentChar + ";" : seq));
266 | i += seq.length() - 1;
267 | isReservedSeq = true;
268 | break;
269 | }
270 | }
271 | if (!isReservedSeq) {
272 | result.append(isDomCreation ? "&" : (props.isTransResCharsToNCR() ? "" + (int) '&'
273 | + ";" : RESERVED_XML_CHARS['&']));
274 | }
275 | continue;
276 | }
277 |
278 | result.append("&");
279 | }
280 | } else if (isReservedXmlChar(ch)) {
281 | result.append(props.isTransResCharsToNCR() ? "" + (int) ch + ";" : (isDomCreation ? ch
282 | : RESERVED_XML_CHARS[ch]));
283 | } else {
284 | result.append(ch);
285 | }
286 | }
287 |
288 | return result.toString();
289 | }
290 |
291 | return null;
292 | }
293 |
294 | /**
295 | * Checks whether specified object's string representation is empty string
296 | * (containing of only whitespaces).
297 | *
298 | * @param object
299 | * Object whose string representation is checked
300 | * @return true, if empty string, false otherwise
301 | */
302 | public static boolean isWhitespaceString(final Object object) {
303 | if (object != null) {
304 | final String s = object.toString();
305 | return s != null && "".equals(s.trim());
306 | }
307 | return false;
308 | }
309 |
310 | /**
311 | * Checks if specified character can be part of xml identifier (tag name of
312 | * attribute name) and is not standard identifier character.
313 | *
314 | * @param ch
315 | * Character to be checked
316 | * @return True if it can be part of xml identifier
317 | */
318 | public static boolean isIdentifierHelperChar(final char ch) {
319 | return ':' == ch || '.' == ch || '-' == ch || '_' == ch;
320 | }
321 |
322 | /**
323 | * Chacks whether specified string can be valid tag name or attribute name
324 | * in xml.
325 | *
326 | * @param s
327 | * String to be checked
328 | * @return True if string is valid xml identifier, false otherwise
329 | */
330 | public static boolean isValidXmlIdentifier(final String s) {
331 | if (s != null) {
332 | final int len = s.length();
333 | if (len == 0) {
334 | return false;
335 | }
336 | for (int i = 0; i < len; i++) {
337 | final char ch = s.charAt(i);
338 | if ((i == 0 && !Character.isUnicodeIdentifierStart(ch) && ch != '_')
339 | || (!Character.isUnicodeIdentifierStart(ch) && !Character.isDigit(ch) && !Utils
340 | .isIdentifierHelperChar(ch))) {
341 | return false;
342 | }
343 | }
344 | return true;
345 | }
346 |
347 | return false;
348 | }
349 |
350 | /**
351 | * @param o
352 | * @return True if specified string is null of contains only whitespace
353 | * characters
354 | */
355 | public static boolean isEmptyString(final Object o) {
356 | return o == null || "".equals(o.toString().trim());
357 | }
358 |
359 | /**
360 | * Evaluates string template for specified map of variables. Template string
361 | * can contain dynamic parts in the form of ${VARNAME}. Each such part is
362 | * replaced with value of the variable if such exists in the map, or with
363 | * empty string otherwise.
364 | *
365 | * @param template
366 | * Template string
367 | * @param variables
368 | * Map of variables (can be null)
369 | * @return Evaluated string
370 | */
371 | public static String evaluateTemplate(final String template, final Map variables) {
372 | if (template == null) {
373 | return template;
374 | }
375 |
376 | final StringBuilder result = new StringBuilder();
377 |
378 | int startIndex = template.indexOf(VAR_START);
379 | int endIndex = -1;
380 |
381 | while (startIndex >= 0 && startIndex < template.length()) {
382 | result.append(template.substring(endIndex + 1, startIndex));
383 | endIndex = template.indexOf(VAR_END, startIndex);
384 |
385 | if (endIndex > startIndex) {
386 | final String varName = template.substring(startIndex + VAR_START.length(), endIndex);
387 | final Object resultObj = variables != null ? variables.get(varName.toLowerCase()) : "";
388 | result.append(resultObj == null ? "" : resultObj.toString());
389 | }
390 |
391 | startIndex = template.indexOf(VAR_START, Math.max(endIndex + VAR_END.length(), startIndex + 1));
392 | }
393 |
394 | result.append(template.substring(endIndex + 1));
395 |
396 | return result.toString();
397 | }
398 |
399 | public static String[] tokenize(final String s, final String delimiters) {
400 | if (s == null) {
401 | return new String[] {};
402 | }
403 |
404 | final StringTokenizer tokenizer = new StringTokenizer(s, delimiters);
405 | final String result[] = new String[tokenizer.countTokens()];
406 | int index = 0;
407 | while (tokenizer.hasMoreTokens()) {
408 | result[index++] = tokenizer.nextToken();
409 | }
410 |
411 | return result;
412 | }
413 |
414 | public static void updateTagTransformations(final CleanerTransformations transformations, final String key,
415 | final String value) {
416 | final int index = key.indexOf('.');
417 |
418 | // new tag transformation case (tagname[=destname[,preserveatts]])
419 | if (index <= 0) {
420 | String destTag = null;
421 | boolean preserveSourceAtts = true;
422 | if (value != null) {
423 | final String[] tokens = tokenize(value, ",;");
424 | if (tokens.length > 0) {
425 | destTag = tokens[0];
426 | }
427 | if (tokens.length > 1) {
428 | preserveSourceAtts = "true".equalsIgnoreCase(tokens[1]) || "yes".equalsIgnoreCase(tokens[1])
429 | || "1".equals(tokens[1]);
430 | }
431 | }
432 | final TagTransformation newTagTrans = new TagTransformation(key, destTag, preserveSourceAtts);
433 | transformations.addTransformation(newTagTrans);
434 | } else { // attribute transformation description
435 | final String[] parts = tokenize(key, ".");
436 | final String tagName = parts[0];
437 | final TagTransformation trans = transformations.getTransformation(tagName);
438 | if (trans != null) {
439 | trans.addAttributeTransformation(parts[1], value);
440 | }
441 | }
442 | }
443 |
444 | /**
445 | * Checks if specified link is full URL.
446 | *
447 | * @param link
448 | * @return True, if full URl, false otherwise.
449 | */
450 | public static boolean isFullUrl(String link) {
451 | if (link == null) {
452 | return false;
453 | }
454 | link = link.trim().toLowerCase();
455 | return link.startsWith("http://") || link.startsWith("https://") || link.startsWith("file://");
456 | }
457 |
458 | /**
459 | * Calculates full URL for specified page URL and link which could be full,
460 | * absolute or relative like there can be found in A or IMG tags.
461 | */
462 | public static String fullUrl(String pageUrl, final String link) {
463 | if (isFullUrl(link)) {
464 | return link;
465 | } else if (link != null && link.charAt(0) == '?') {
466 | final int qindex = pageUrl.indexOf('?');
467 | final int len = pageUrl.length();
468 | if (qindex < 0) {
469 | return pageUrl + link;
470 | } else if (qindex == len - 1) {
471 | return pageUrl.substring(0, len - 1) + link;
472 | } else {
473 | return pageUrl + "&" + link.substring(1);
474 | }
475 | }
476 |
477 | final boolean isLinkAbsolute = (link.charAt(0) == '/');
478 |
479 | if (!isFullUrl(pageUrl)) {
480 | pageUrl = "http://" + pageUrl;
481 | }
482 |
483 | final int slashIndex = isLinkAbsolute ? pageUrl.indexOf('/', 8) : pageUrl.lastIndexOf('/');
484 | if (slashIndex <= 8) {
485 | pageUrl += "/";
486 | } else {
487 | pageUrl = pageUrl.substring(0, slashIndex + 1);
488 | }
489 |
490 | return isLinkAbsolute ? pageUrl + link.substring(1) : pageUrl + link;
491 | }
492 |
493 | /**
494 | * @param name
495 | * @return For xml element name or attribute name returns prefix (part
496 | * before :) or null if there is no prefix
497 | */
498 | public static String getXmlNSPrefix(final String name) {
499 | final int colIndex = name.indexOf(':');
500 | if (colIndex > 0) {
501 | return name.substring(0, colIndex);
502 | }
503 |
504 | return null;
505 | }
506 |
507 | /**
508 | * @param name
509 | * @return For xml element name or attribute name returns name after prefix
510 | * (part after :)
511 | */
512 | public static String getXmlName(final String name) {
513 | final int colIndex = name.indexOf(':');
514 | if (colIndex > 0 && colIndex < name.length() - 1) {
515 | return name.substring(colIndex + 1);
516 | }
517 |
518 | return name;
519 | }
520 |
521 | }
522 |
--------------------------------------------------------------------------------
/src/org/htmlcleaner/XPather.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2011 Zheng Sun
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | /* Copyright (c) 2006-2007, Vladimir Nikic
18 | All rights reserved.
19 |
20 | Redistribution and use of this software in source and binary forms,
21 | with or without modification, are permitted provided that the following
22 | conditions are met:
23 |
24 | * Redistributions of source code must retain the above
25 | copyright notice, this list of conditions and the
26 | following disclaimer.
27 |
28 | * Redistributions in binary form must reproduce the above
29 | copyright notice, this list of conditions and the
30 | following disclaimer in the documentation and/or other
31 | materials provided with the distribution.
32 |
33 | * The name of HtmlCleaner may not be used to endorse or promote
34 | products derived from this software without specific prior
35 | written permission.
36 |
37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
40 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
41 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
45 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47 | POSSIBILITY OF SUCH DAMAGE.
48 |
49 | You can contact Vladimir Nikic by sending e-mail to
50 | nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
51 | subject line.
52 | */
53 |
54 | package org.htmlcleaner;
55 |
56 | import java.util.*;
57 |
58 | /**
59 | *
60 | * Utility for searching cleaned document tree with XPath expressions.
61 | *
62 | * Examples of supported axes:
63 | *
64 | * //div//a
65 | * //div//a[@id][@class]
66 | * /body/*[1]/@type
67 | * //div[3]//a[@id][@href='r/n4']
68 | * //div[last() >= 4]//./div[position() = last()])[position() > 22]//li[2]//a
69 | * //div[2]/@*[2]
70 | * data(//div//a[@id][@class])
71 | * //p/last()
 * //body//div[3][@class]//span[12.2<position()]/@id
73 | * data(//a['v' < @id])
74 | *
75 | *
76 | */
77 | public class XPather {
78 |
79 | // array of basic tokens of which XPath expression is made
80 | final private String tokenArray[];
81 |
82 | /**
83 | * Constructor - creates XPather instance with specified XPath expression.
84 | *
85 | * @param expression
86 | */
87 | public XPather(final String expression) {
88 | final StringTokenizer tokenizer = new StringTokenizer(expression, "/()[]\"'=<>", true);
89 | final int tokenCount = tokenizer.countTokens();
90 | tokenArray = new String[tokenCount];
91 |
92 | int index = 0;
93 |
94 | // this is not real XPath compiler, rather simple way to recognize basic
95 | // XPaths expressions
96 | // and interpret them against some TagNode instance.
97 | while (tokenizer.hasMoreTokens()) {
98 | tokenArray[index++] = tokenizer.nextToken();
99 | }
100 | }
101 |
102 | private Collection evaluateAgainst(final Collection object, int from, final int to, final boolean isRecursive,
103 | final int position, final int last, final boolean isFilterContext, final Collection filterSource)
104 | throws XPatherException {
105 | if (from >= 0 && to < tokenArray.length && from <= to) {
106 | if (tokenArray[from].trim().length() == 0) {
107 | return evaluateAgainst(object, from + 1, to, isRecursive, position, last, isFilterContext, filterSource);
108 | } else if (isToken("(", from)) {
109 | final int closingBracket = findClosingIndex(from, to);
110 | if (closingBracket > 0) {
111 | final Collection value = evaluateAgainst(object, from + 1, closingBracket - 1, false, position,
112 | last, isFilterContext, filterSource);
113 | return evaluateAgainst(value, closingBracket + 1, to, false, position, last, isFilterContext,
114 | filterSource);
115 | } else {
116 | throwStandardException();
117 | }
118 | } else if (isToken("[", from)) {
119 | final int closingBracket = findClosingIndex(from, to);
120 | if (closingBracket > 0) {
121 | final Collection value = filterByCondition(object, from + 1, closingBracket - 1);
122 | return evaluateAgainst(value, closingBracket + 1, to, false, position, last, isFilterContext,
123 | filterSource);
124 | } else {
125 | throwStandardException();
126 | }
127 | } else if (isToken("\"", from) || isToken("'", from)) {
128 | // string constant
129 | final int closingQuote = findClosingIndex(from, to);
130 | if (closingQuote > from) {
131 | final Collection value = singleton(flatten(from + 1, closingQuote - 1));
132 | return evaluateAgainst(value, closingQuote + 1, to, false, position, last, isFilterContext,
133 | filterSource);
134 | } else {
135 | throwStandardException();
136 | }
137 | } else if ((isToken("=", from) || isToken("<", from) || isToken(">", from)) && isFilterContext) {
138 | // operator inside filter
139 | final boolean logicValue;
140 | if (isToken("=", from + 1) && (isToken("<", from) || isToken(">", from))) {
141 | final Collection secondObject = evaluateAgainst(filterSource, from + 2, to, false, position, last,
142 | isFilterContext, filterSource);
143 | logicValue = evaluateLogic(object, secondObject, tokenArray[from] + tokenArray[from + 1]);
144 | } else {
145 | final Collection secondObject = evaluateAgainst(filterSource, from + 1, to, false, position, last,
146 | isFilterContext, filterSource);
147 | logicValue = evaluateLogic(object, secondObject, tokenArray[from]);
148 | }
149 | return singleton(Boolean.valueOf(logicValue));
150 | } else if (isToken("/", from)) { // children of the node
151 | final boolean goRecursive = isToken("/", from + 1);
152 | if (goRecursive) {
153 | from++;
154 | }
155 | if (from < to) {
156 | int toIndex = findClosingIndex(from, to) - 1;
157 | if (toIndex <= from) {
158 | toIndex = to;
159 | }
160 | final Collection value = evaluateAgainst(object, from + 1, toIndex, goRecursive, 1, last,
161 | isFilterContext, filterSource);
162 | return evaluateAgainst(value, toIndex + 1, to, false, 1, last, isFilterContext, filterSource);
163 | } else {
164 | throwStandardException();
165 | }
166 | } else if (isFunctionCall(from, to)) {
167 | final int closingBracketIndex = findClosingIndex(from + 1, to);
168 | final Collection funcValue = evaluateFunction(object, from, to, position, last, isFilterContext);
169 | return evaluateAgainst(funcValue, closingBracketIndex + 1, to, false, 1, last, isFilterContext,
170 | filterSource);
171 | } else if (isValidInteger(tokenArray[from])) {
172 | final Collection value = singleton(Integer.valueOf(tokenArray[from]));
173 | return evaluateAgainst(value, from + 1, to, false, position, last, isFilterContext, filterSource);
174 | } else if (isValidDouble(tokenArray[from])) {
175 | final Collection value = singleton(new Double(tokenArray[from]));
176 | return evaluateAgainst(value, from + 1, to, false, position, last, isFilterContext, filterSource);
177 | } else {
178 | return getElementsByName(object, from, to, isRecursive, isFilterContext);
179 | }
180 | } else {
181 | return object;
182 | }
183 |
184 | throw new XPatherException();
185 | }
186 |
187 | /**
188 | * Main public method for this class - a way to execute XPath expression
189 | * against specified TagNode instance.
190 | *
191 | * @param node
192 | */
193 | public Object[] evaluateAgainstNode(final TagNode node) throws XPatherException {
194 | if (node == null) {
195 | throw new XPatherException("Cannot evaluate XPath expression against null value!");
196 | }
197 |
198 | final Collection collectionResult = evaluateAgainst(singleton(node), 0, tokenArray.length - 1, false, 1, 0,
199 | false, null);
200 | final Object[] array = new Object[collectionResult.size()];
201 |
202 | final Iterator iterator = collectionResult.iterator();
203 | int index = 0;
204 | while (iterator.hasNext()) {
205 | array[index++] = iterator.next();
206 | }
207 |
208 | return array;
209 | }
210 |
211 | /**
212 | * Evaluates specified function. Currently, following XPath functions are
213 | * supported: last, position, text, count, data
214 | *
215 | * @param source
216 | * @param from
217 | * @param to
218 | * @param position
219 | * @param last
220 | * @return Collection as the result of evaluation.
221 | */
222 | private Collection evaluateFunction(final Collection source, final int from, final int to, final int position,
223 | final int last, final boolean isFilterContext) throws XPatherException {
224 | final String name = tokenArray[from].trim();
225 | final ArrayList result = new ArrayList();
226 |
227 | final int size = source.size();
228 | final Iterator iterator = source.iterator();
229 | int index = 0;
230 | while (iterator.hasNext()) {
231 | final Object curr = iterator.next();
232 | index++;
233 | if ("last".equals(name)) {
234 | result.add(Integer.valueOf(isFilterContext ? last : size));
235 | } else if ("position".equals(name)) {
236 | result.add(Integer.valueOf(isFilterContext ? position : index));
237 | } else if ("text".equals(name)) {
238 | if (curr instanceof TagNode) {
239 | result.add(((TagNode) curr).getText());
240 | } else if (curr instanceof String) {
241 | result.add(curr.toString());
242 | }
243 | } else if ("count".equals(name)) {
244 | final Collection argumentEvaluated = evaluateAgainst(source, from + 2, to - 1, false, position, 0,
245 | isFilterContext, null);
246 | result.add(Integer.valueOf(argumentEvaluated.size()));
247 | } else if ("data".equals(name)) {
248 | final Collection argumentEvaluated = evaluateAgainst(source, from + 2, to - 1, false, position, 0,
249 | isFilterContext, null);
250 | final Iterator it = argumentEvaluated.iterator();
251 | while (it.hasNext()) {
252 | final Object elem = it.next();
253 | if (elem instanceof TagNode) {
254 | result.add(((TagNode) elem).getText());
255 | } else if (elem instanceof String) {
256 | result.add(elem.toString());
257 | }
258 | }
259 | } else {
260 | throw new XPatherException("Unknown function " + name + "!");
261 | }
262 | }
263 |
264 | return result;
265 | }
266 |
267 | /**
268 | * Evaluates logic operation on two collections.
269 | *
270 | * @param first
271 | * @param second
272 | * @param logicOperator
273 | * @return Result of logic operation
274 | */
275 | private boolean evaluateLogic(final Collection first, final Collection second, final String logicOperator) {
276 | if (first == null || first.isEmpty() || second == null || second.isEmpty()) {
277 | return false;
278 | }
279 | final Object elem1 = first.iterator().next();
280 | final Object elem2 = second.iterator().next();
281 | if (elem1 instanceof Number && elem2 instanceof Number) {
282 | final double d1 = ((Number) elem1).doubleValue();
283 | final double d2 = ((Number) elem2).doubleValue();
284 | if ("=".equals(logicOperator)) {
285 | return d1 == d2;
286 | } else if ("<".equals(logicOperator)) {
287 | return d1 < d2;
288 | } else if (">".equals(logicOperator)) {
289 | return d1 > d2;
290 | } else if ("<=".equals(logicOperator)) {
291 | return d1 <= d2;
292 | } else if (">=".equals(logicOperator)) {
293 | return d1 >= d2;
294 | }
295 | } else {
296 | final String s1 = toText(elem1);
297 | final String s2 = toText(elem2);
298 | final int result = s1.compareTo(s2);
299 | if ("=".equals(logicOperator)) {
300 | return result == 0;
301 | } else if ("<".equals(logicOperator)) {
302 | return result < 0;
303 | } else if (">".equals(logicOperator)) {
304 | return result > 0;
305 | } else if ("<=".equals(logicOperator)) {
306 | return result <= 0;
307 | } else if (">=".equals(logicOperator)) {
308 | return result >= 0;
309 | }
310 | }
311 |
312 | return false;
313 | }
314 |
315 | /**
316 | * Filter nodes satisfying the condition
317 | *
318 | * @param source
319 | * @param from
320 | * @param to
321 | */
322 | private final Collection filterByCondition(final Collection source, final int from, final int to)
323 | throws XPatherException {
324 | final ArrayList result = new ArrayList();
325 | final Iterator iterator = source.iterator();
326 | int index = 0;
327 | final int size = source.size();
328 | while (iterator.hasNext()) {
329 | final Object curr = iterator.next();
330 | index++;
331 |
332 | final ArrayList logicValueList = new ArrayList(evaluateAgainst(singleton(curr), from, to, false, index,
333 | size, true, singleton(curr)));
334 | if (logicValueList.size() >= 1) {
335 | final Object first = logicValueList.get(0);
336 | if (first instanceof Boolean) {
337 | if (((Boolean) first).booleanValue()) {
338 | result.add(curr);
339 | }
340 | } else if (first instanceof Integer) {
341 | if (((Integer) first).intValue() == index) {
342 | result.add(curr);
343 | }
344 | } else {
345 | result.add(curr);
346 | }
347 | }
348 | }
349 | return result;
350 | }
351 |
352 | /**
353 | * @param from
354 | * @param to
355 | * @return matching closing index in the token array for the current token,
356 | * or -1 if there is no closing token within expected bounds.
357 | */
358 | private int findClosingIndex(final int from, final int to) {
359 | if (from < to) {
360 | final String currToken = tokenArray[from];
361 |
362 | if ("\"".equals(currToken)) {
363 | for (int i = from + 1; i <= to; i++) {
364 | if ("\"".equals(tokenArray[i])) {
365 | return i;
366 | }
367 | }
368 | } else if ("'".equals(currToken)) {
369 | for (int i = from + 1; i <= to; i++) {
370 | if ("'".equals(tokenArray[i])) {
371 | return i;
372 | }
373 | }
374 | } else if ("(".equals(currToken) || "[".equals(currToken) || "/".equals(currToken)) {
375 | boolean isQuoteClosed = true;
376 | boolean isAposClosed = true;
377 | int brackets = "(".equals(currToken) ? 1 : 0;
378 | int angleBrackets = "[".equals(currToken) ? 1 : 0;
379 | int slashes = "/".equals(currToken) ? 1 : 0;
380 | for (int i = from + 1; i <= to; i++) {
381 | if ("\"".equals(tokenArray[i])) {
382 | isQuoteClosed = !isQuoteClosed;
383 | } else if ("'".equals(tokenArray[i])) {
384 | isAposClosed = !isAposClosed;
385 | } else if ("(".equals(tokenArray[i]) && isQuoteClosed && isAposClosed) {
386 | brackets++;
387 | } else if (")".equals(tokenArray[i]) && isQuoteClosed && isAposClosed) {
388 | brackets--;
389 | } else if ("[".equals(tokenArray[i]) && isQuoteClosed && isAposClosed) {
390 | angleBrackets++;
391 | } else if ("]".equals(tokenArray[i]) && isQuoteClosed && isAposClosed) {
392 | angleBrackets--;
393 | } else if ("/".equals(tokenArray[i]) && isQuoteClosed && isAposClosed && brackets == 0
394 | && angleBrackets == 0) {
395 | slashes--;
396 | }
397 |
398 | if (isQuoteClosed && isAposClosed && brackets == 0 && angleBrackets == 0 && slashes == 0) {
399 | return i;
400 | }
401 | }
402 | }
403 |
404 | }
405 |
406 | return -1;
407 | }
408 |
409 | private String flatten(final int from, final int to) {
410 | if (from <= to) {
411 | final StringBuffer result = new StringBuffer();
412 | for (int i = from; i <= to; i++) {
413 | result.append(tokenArray[i]);
414 | }
415 |
416 | return result.toString();
417 | }
418 |
419 | return "";
420 | }
421 |
422 | /**
423 | * For the given source collection and specified name, returns collection of
424 | * subnodes or attribute values.
425 | *
426 | * @param source
427 | * @param from
428 | * @param to
429 | * @param isRecursive
430 | * @return Colection of TagNode instances or collection of String instances.
431 | */
432 | private Collection getElementsByName(final Collection source, final int from, final int to,
433 | final boolean isRecursive, final boolean isFilterContext) throws XPatherException {
434 | String name = tokenArray[from].trim();
435 |
436 | if (isAtt(name)) {
437 | name = name.substring(1);
438 | final Collection result = new ArrayList();
439 | Collection nodes;
440 | if (isRecursive) {
441 | nodes = new LinkedHashSet();
442 | final Iterator iterator = source.iterator();
443 | while (iterator.hasNext()) {
444 | final Object next = iterator.next();
445 | if (next instanceof TagNode) {
446 | final TagNode node = (TagNode) next;
447 | nodes.addAll(node.getAllElementsList(true));
448 | }
449 | }
450 | } else {
451 | nodes = source;
452 | }
453 |
454 | final Iterator iterator = nodes.iterator();
455 | while (iterator.hasNext()) {
456 | final Object next = iterator.next();
457 | if (next instanceof TagNode) {
458 | final TagNode node = (TagNode) next;
459 | if ("*".equals(name)) {
460 | result.addAll(evaluateAgainst(node.getAttributes().values(), from + 1, to, false, 1, 1,
461 | isFilterContext, null));
462 | } else {
463 | final String attValue = node.getAttributeByName(name);
464 | if (attValue != null) {
465 | result.addAll(evaluateAgainst(singleton(attValue), from + 1, to, false, 1, 1,
466 | isFilterContext, null));
467 | }
468 | }
469 | } else {
470 | throwStandardException();
471 | }
472 | }
473 | return result;
474 | } else {
475 | final Collection result = new LinkedHashSet();
476 | final Iterator iterator = source.iterator();
477 | int index = 0;
478 | while (iterator.hasNext()) {
479 | final Object next = iterator.next();
480 | if (next instanceof TagNode) {
481 | final TagNode node = (TagNode) next;
482 | index++;
483 | final boolean isSelf = ".".equals(name);
484 | final boolean isParent = "..".equals(name);
485 | final boolean isAll = "*".equals(name);
486 |
487 | final Collection subnodes;
488 | if (isSelf) {
489 | subnodes = singleton(node);
490 | } else if (isParent) {
491 | final TagNode parent = node.getParent();
492 | subnodes = parent != null ? singleton(parent) : new ArrayList();
493 | } else {
494 | subnodes = isAll ? node.getChildTagList() : node.getElementListByName(name, false);
495 | }
496 |
497 | final LinkedHashSet nodeSet = new LinkedHashSet(subnodes);
498 | final Collection refinedSubnodes = evaluateAgainst(nodeSet, from + 1, to, false, index, nodeSet
499 | .size(), isFilterContext, null);
500 |
501 | if (isRecursive) {
502 | final List childTags = node.getChildTagList();
503 | if (isSelf || isParent || isAll) {
504 | result.addAll(refinedSubnodes);
505 | }
506 | final Iterator childIterator = childTags.iterator();
507 | while (childIterator.hasNext()) {
508 | final TagNode childTag = (TagNode) childIterator.next();
509 | final Collection childrenByName = getElementsByName(singleton(childTag), from, to,
510 | isRecursive, isFilterContext);
511 | if (!isSelf && !isParent && !isAll && refinedSubnodes.contains(childTag)) {
512 | result.add(childTag);
513 | }
514 | result.addAll(childrenByName);
515 | }
516 | } else {
517 | result.addAll(refinedSubnodes);
518 | }
519 | } else {
520 | throwStandardException();
521 | }
522 | }
523 | return result;
524 | }
525 | }
526 |
527 | /**
528 | * Checks if token is attribute (starts with @)
529 | *
530 | * @param token
531 | */
532 | private boolean isAtt(final String token) {
533 | return token != null && token.length() > 1 && token.charAt(0) == '@';
534 | }
535 |
536 | /**
537 | * Checks if tokens in specified range represents valid function call.
538 | *
539 | * @param from
540 | * @param to
541 | * @return True if it is valid function call, false otherwise.
542 | */
543 | private boolean isFunctionCall(final int from, final int to) {
544 | if (!isIdentifier(tokenArray[from]) && !isToken("(", from + 1)) {
545 | return false;
546 | }
547 |
548 | return findClosingIndex(from + 1, to) > from + 1;
549 | }
550 |
551 | /**
552 | * Checks if given string is valid identifier.
553 | *
554 | * @param str
555 | */
556 | private boolean isIdentifier(String str) {
557 | if (str == null) {
558 | return false;
559 | }
560 |
561 | str = str.trim();
562 | if (str.length() > 0) {
563 | if (!Character.isLetter(str.charAt(0))) {
564 | return false;
565 | }
566 | for (int i = 1; i < str.length(); i++) {
567 | final char ch = str.charAt(i);
568 | if (ch != '_' && ch != '-' && !Character.isLetterOrDigit(ch)) {
569 | return false;
570 | }
571 | }
572 | }
573 |
574 | return false;
575 | }
576 |
577 | private boolean isToken(final String token, final int index) {
578 | final int len = tokenArray.length;
579 | return index >= 0 && index < len && tokenArray[index].trim().equals(token.trim());
580 | }
581 |
582 | private boolean isValidDouble(final String s) {
583 | try {
584 | Double.parseDouble(s);
585 | return true;
586 | } catch (NumberFormatException e) {
587 | return false;
588 | }
589 | }
590 |
591 | private boolean isValidInteger(final String s) {
592 | try {
593 | Integer.parseInt(s);
594 | return true;
595 | } catch (NumberFormatException e) {
596 | return false;
597 | }
598 | }
599 |
600 | /**
601 | * Creates one-element collection for the specified object.
602 | *
603 | * @param element
604 | */
605 | private Collection singleton(final Object element) {
606 | final ArrayList result = new ArrayList();
607 | result.add(element);
608 | return result;
609 | }
610 |
611 | private void throwStandardException() throws XPatherException {
612 | throw new XPatherException();
613 | }
614 |
615 | private String toText(final Object o) {
616 | if (o == null) {
617 | return "";
618 | }
619 | if (o instanceof TagNode) {
620 | return ((TagNode) o).getText().toString();
621 | } else {
622 | return o.toString();
623 | }
624 | }
625 |
626 | }
627 |
--------------------------------------------------------------------------------