Commit 3dfab5bf authored by Michael Hamann
Browse files

XCOMMONS-901: Support both XHTML 1.0 and XHTML 5 in HtmlCleaner

* Cleanup some comments and introduce a helper to check if the configuration is HTML5.
* Allow b, i, u and s tags in HTML 5 as they are not obsolete in HTML 5 (and i is in fact frequently used for icons).
* Add XWikiHTML5TagProvider to fix svg and style tag handling.
* Fix class fan-out complexity in DefaultHTMLCleaner by not using the CleanerTransformations class but only our subclass.
parent dcb19ae0
......@@ -56,7 +56,7 @@
String TRANSLATE_SPECIAL_ENTITIES = "translateSpecialEntities";
/**
* The HTML (major) version. Should be "5" for HTML5 and "4" otherwise for the default implementation.
* The HTML (major) version. Should be "5" for HTML5, or "4" (the default) for the default implementation.
* @since 14.0RC1
*/
@Unstable
......
......@@ -384,6 +384,12 @@
*/
String TAG_TEMPLATE = "template";
/**
* HTML &lt;svg&gt; tag.
* @since 14.0RC1
*/
String TAG_SVG = "svg";
/**
* HTML id attribute name.
*/
......
/*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package org.xwiki.xml.html;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.htmlcleaner.BelongsTo;
import org.htmlcleaner.Html5TagProvider;
import org.htmlcleaner.TagInfo;
import org.xwiki.stability.Unstable;
/**
 * List the tags allowed in HTML5 with custom bug fixes for &lt;style&gt; and &lt;svg&gt;-tags.
 * <p>
 * See https://sourceforge.net/p/htmlcleaner/bugs/228/ and https://sourceforge.net/p/htmlcleaner/bugs/229/
 * <p>
 * This class should be removed once these bugs have been fixed.
 *
 * @version $Id$
 * @since 14.0RC1
 */
@Unstable
public class XWikiHTML5TagProvider extends Html5TagProvider
{
    /**
     * Names of the tags whose allowed children are extended with the &lt;svg&gt;-tag by the constructor. This is a
     * hand-maintained copy of the tags for which the parent provider explicitly allows phrasing content, see the
     * note in the constructor.
     */
    private static final List<String> TAGS_WITH_EXPLICIT_PHRASING_CHILDREN = Collections.unmodifiableList(
        Arrays.asList(HTMLConstants.TAG_EM, HTMLConstants.TAG_STRONG, "small", HTMLConstants.TAG_S, "wbr", "mark",
            "bdi", "time", "data", HTMLConstants.TAG_CITE, HTMLConstants.TAG_Q, HTMLConstants.TAG_CODE, "bdo",
            "dfn", HTMLConstants.TAG_KBD, HTMLConstants.TAG_ABBR, HTMLConstants.TAG_VAR, "samp", "sub", "sup",
            HTMLConstants.TAG_B, HTMLConstants.TAG_I, HTMLConstants.TAG_U, "rtc", "rt", "rp", "meter", "legend",
            "progress"));

    /**
     * Default constructor, applies our bug fixes.
     */
    public XWikiHTML5TagProvider()
    {
        super();

        // Fix https://sourceforge.net/p/htmlcleaner/bugs/229/.
        this.getTagInfo(HTMLConstants.TAG_STYLE).setBelongsTo(BelongsTo.HEAD);

        // Fix https://sourceforge.net/p/htmlcleaner/bugs/228/.
        TagInfo svgTag = this.getTagInfo(HTMLConstants.TAG_SVG);

        // Do not close other tags before SVG apart from svg.
        svgTag.setMustCloseTags(Collections.singleton(HTMLConstants.TAG_SVG));

        // Do not copy other tags inside SVG.
        svgTag.setCopyTags(Collections.emptySet());

        // Allow the SVG tag as child everywhere, where HTML5TagProvider explicitly allows phrasing content.
        // Note: unfortunately, we cannot iterate over the tags, otherwise we could have avoided copying this list.
        TAGS_WITH_EXPLICIT_PHRASING_CHILDREN.forEach(this::allowSVGChild);
    }

    /**
     * Adds the &lt;svg&gt;-tag to the allowed children of the given tag. Tags that the provider does not define are
     * skipped.
     *
     * @param tagName Tag for which the &lt;svg&gt;-tag shall be added to the allowed children.
     */
    private void allowSVGChild(String tagName)
    {
        TagInfo tagInfo = this.getTagInfo(tagName);

        // The tag names come from a hand-copied list, so one of them may not be defined by the provider (e.g. after
        // a library upgrade). Skip such a tag instead of failing the constructor with a NullPointerException.
        if (tagInfo != null) {
            tagInfo.getChildTags().add(HTMLConstants.TAG_SVG);
        }
    }
}
......@@ -30,7 +30,6 @@
import javax.xml.parsers.ParserConfigurationException;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.CleanerTransformations;
import org.htmlcleaner.DoctypeToken;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
......@@ -44,6 +43,7 @@
import org.xwiki.xml.html.HTMLCleaner;
import org.xwiki.xml.html.HTMLCleanerConfiguration;
import org.xwiki.xml.html.HTMLConstants;
import org.xwiki.xml.html.XWikiHTML5TagProvider;
import org.xwiki.xml.html.filter.HTMLFilter;
/**
......@@ -144,7 +144,13 @@ public Document clean(Reader originalHtmlContent, HTMLCleanerConfiguration confi
// especially since this makes it extra safe with regards to multithreading (even though HTML Cleaner is
// already supposed to be thread safe).
CleanerProperties cleanerProperties = getDefaultCleanerProperties(configuration);
HtmlCleaner cleaner = new HtmlCleaner(cleanerProperties);
HtmlCleaner cleaner;
if (isHTML5(configuration)) {
// Use our custom provider to fix bugs, should be checked on each upgrade if still necessary.
cleaner = new HtmlCleaner(new XWikiHTML5TagProvider(), cleanerProperties);
} else {
cleaner = new HtmlCleaner(cleanerProperties);
}
TagNode cleanedNode;
try {
......@@ -161,7 +167,7 @@ public Document clean(Reader originalHtmlContent, HTMLCleanerConfiguration confi
// Replace by the following when fixed:
// result = new DomSerializer(cleanerProperties, false).createDOM(cleanedNode);
if (getHTMLVersion(configuration) == 5) {
if (isHTML5(configuration)) {
cleanedNode.setDocType(new DoctypeToken(HTMLConstants.TAG_HTML, null, null, null));
} else {
cleanedNode.setDocType(
......@@ -237,7 +243,7 @@ private CleanerProperties getDefaultCleanerProperties(HTMLCleanerConfiguration c
boolean useCharacterReferences = (param != null) && Boolean.parseBoolean(param);
defaultProperties.setTransResCharsToNCR(useCharacterReferences);
// By default, we are cleaning XHTML 1.0 code, not HTML 5.
// Sets the HTML version from the configuration (by default 4).
defaultProperties.setHtmlVersion(getHTMLVersion(configuration));
// We trim values by default for all attributes but the input value attribute.
......@@ -262,23 +268,27 @@ private CleanerProperties getDefaultCleanerProperties(HTMLCleanerConfiguration c
* @return the default cleaning transformations to perform on tags, in addition to the base transformations done by
* HTML Cleaner
*/
private CleanerTransformations getDefaultCleanerTransformations(HTMLCleanerConfiguration configuration)
private TrimAttributeCleanerTransformations getDefaultCleanerTransformations(HTMLCleanerConfiguration configuration)
{
CleanerTransformations defaultTransformations = new TrimAttributeCleanerTransformations();
TrimAttributeCleanerTransformations defaultTransformations = new TrimAttributeCleanerTransformations();
TagTransformation tt;
// Note that we do not need to use a TrimAttributeTagTransformation here, since the attributes are not preserved.
TagTransformation tt = new TagTransformation(HTMLConstants.TAG_B,
HTMLConstants.TAG_STRONG, false);
defaultTransformations.addTransformation(tt);
if (!isHTML5(configuration)) {
// These tags are not obsolete in HTML5.
tt = new TagTransformation(HTMLConstants.TAG_B, HTMLConstants.TAG_STRONG, false);
defaultTransformations.addTransformation(tt);
tt = new TagTransformation(HTMLConstants.TAG_I, HTMLConstants.TAG_EM, false);
defaultTransformations.addTransformation(tt);
tt = new TagTransformation(HTMLConstants.TAG_I, HTMLConstants.TAG_EM, false);
defaultTransformations.addTransformation(tt);
tt = new TagTransformation(HTMLConstants.TAG_U, HTMLConstants.TAG_INS, false);
defaultTransformations.addTransformation(tt);
tt = new TagTransformation(HTMLConstants.TAG_U, HTMLConstants.TAG_INS, false);
defaultTransformations.addTransformation(tt);
tt = new TagTransformation(HTMLConstants.TAG_S, HTMLConstants.TAG_DEL, false);
defaultTransformations.addTransformation(tt);
tt = new TagTransformation(HTMLConstants.TAG_S, HTMLConstants.TAG_DEL, false);
defaultTransformations.addTransformation(tt);
}
tt = new TagTransformation(HTMLConstants.TAG_STRIKE, HTMLConstants.TAG_DEL, false);
defaultTransformations.addTransformation(tt);
......@@ -287,12 +297,12 @@ private CleanerTransformations getDefaultCleanerTransformations(HTMLCleanerConfi
tt.addAttributeTransformation(HTMLConstants.ATTRIBUTE_STYLE, "text-align:center");
defaultTransformations.addTransformation(tt);
if (getHTMLVersion(configuration) == 5) {
if (isHTML5(configuration)) {
// Font tags are removed before the filters are applied in HTML5, we thus need a transformation here.
defaultTransformations.addTransformation(new FontTagTransformation());
tt = new TrimAttributeTagTransformation(HTMLConstants.TAG_TT,
HTMLConstants.TAG_SPAN);
// The tt-tag is obsolete in HTML5
tt = new TrimAttributeTagTransformation(HTMLConstants.TAG_TT, HTMLConstants.TAG_SPAN);
tt.addAttributeTransformation(HTMLConstants.ATTRIBUTE_CLASS, "${class} monospace");
defaultTransformations.addTransformation(tt);
}
......@@ -310,10 +320,18 @@ private CleanerTransformations getDefaultCleanerTransformations(HTMLCleanerConfi
return defaultTransformations;
}
/**
 * Check whether the given configuration selects HTML 5.
 *
 * @param configuration the cleaner configuration to inspect
 * @return {@code true} if the HTML version read from the configuration is 5, {@code false} otherwise
 */
private boolean isHTML5(HTMLCleanerConfiguration configuration)
{
    int htmlVersion = getHTMLVersion(configuration);
    return htmlVersion == 5;
}
/**
* @param configuration The configuration to parse.
* @return The HTML version specified in the configuration.
* @since 14.0RC1
*/
private int getHTMLVersion(HTMLCleanerConfiguration configuration)
{
......
......@@ -32,7 +32,7 @@
* @version $Id$
* @since 14.0RC1
*/
public class HTML5HTMLCleanerTest extends DefaultHTMLCleanerTest
class HTML5HTMLCleanerTest extends DefaultHTMLCleanerTest
{
public static final String HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+ "<!DOCTYPE html>\n";
......@@ -67,46 +67,6 @@ void setUpCleaner()
this.cleanerConfiguration.setParameters(parameters);
}
/**
 * Override of the parent SVG test, kept disabled until https://sourceforge.net/p/htmlcleaner/bugs/228/ is fixed.
 *
 * Remove this override once the bug has been fixed, so that the parent test is enabled again.
 */
@Test
@Override
@Disabled("See https://sourceforge.net/p/htmlcleaner/bugs/228/")
void cleanSVGTags() throws Exception
{
    // One source line per line of markup; the resulting string is unchanged.
    String svgMarkup = "<p>before</p>\n"
        + "<p><svg xmlns=\"http://www.w3.org/2000/svg\" version=\"1.1\">\n"
        + "<circle cx=\"100\" cy=\"50\" fill=\"red\" r=\"40\" stroke=\"black\" stroke-width=\"2\"></circle>\n"
        + "</svg></p>\n"
        + "<p>after</p>\n";

    // The same markup, wrapped in the full header and the footer.
    String wrappedMarkup = getHeaderFull() + svgMarkup + FOOTER;

    assertHTML(svgMarkup, wrappedMarkup);
}
/**
 * Override of the parent style test, kept disabled until https://sourceforge.net/p/htmlcleaner/bugs/229/ is fixed.
 *
 * Remove this override once the bug has been fixed, so that the parent test is enabled again.
 */
@Test
@Override
@Disabled("See https://sourceforge.net/p/htmlcleaner/bugs/229/")
void styleAndCData()
{
    // Style blocks that are passed as first argument to more than one assertion below.
    String redLinkStyle = "<style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style>";
    String angleBracketStyle = "<style>/*<![CDATA[*/\n<>\n/*]]>*/</style>";

    assertHTMLWithHeadContent(redLinkStyle, "<style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style>");
    assertHTMLWithHeadContent(redLinkStyle, redLinkStyle);

    assertHTMLWithHeadContent("<style type=\"text/css\">/*<![CDATA[*/\na>span { color: blue;}\n/*]]>*/</style>",
        "<style type=\"text/css\">a&gt;span { color: blue;}</style>");

    assertHTMLWithHeadContent(angleBracketStyle, "<style>&lt;&gt;</style>");
    assertHTMLWithHeadContent(angleBracketStyle, "<style><></style>");
}
/**
* This tests various invalid list usages. With HTML5, the lists are cleaned by HTMLCleaner sometimes and thus the
* list-style-type is not set to none as in the custom filter in XWiki. Further, HTML comments are not moved
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment