Commit df4b0a91 authored by Michael Hamann's avatar Michael Hamann
Browse files

XCOMMONS-901: Support both XHTML 1.0 and XHTML 5 in HtmlCleaner

* Add a new HTML_VERSION parameter to HTMLCleanerConfiguration.
* Pass the HTML version in DefaultHTMLCleaner to HTMLCleaner and set the document type accordingly.
* Add a tag transformation to transform `<tt>` to `<span class="monospace">`.
* Add a tag transformation to replace the font filter in HTML5 as the cleaner removes the font tags too early otherwise.
* Extend BodyFilter to allow more HTML5 tags directly in the body.
* Add constants for many HTML5 tags in HTMLConstants.
* Add a new unit test HTML5HTMLCleanerTest to test the behavior of HTMLCleaner with the HTML5 configuration.
* Modify DefaultHTMLCleanerTest to allow re-using most test methods in HTML5HTMLCleanerTest.
parent bbbc6217
......@@ -22,6 +22,7 @@
import java.util.List;
import java.util.Map;
import org.xwiki.stability.Unstable;
import org.xwiki.xml.html.filter.HTMLFilter;
/**
......@@ -54,6 +55,13 @@
*/
String TRANSLATE_SPECIAL_ENTITIES = "translateSpecialEntities";
/**
* The HTML (major) version. Should be "5" for HTML5 and "4" otherwise for the default implementation.
* @since 14.0RC1
*/
@Unstable
String HTML_VERSION = "htmlVersion";
/**
* @return the ordered list of filters to use for cleaning the HTML content
*/
......
......@@ -175,6 +175,12 @@
*/
String TAG_STRONG = "strong";
/**
* HTML &lt;tt&gt; tag name.
* @since 14.0RC1
*/
String TAG_TT = "tt";
/**
* HTML &lt;p&gt; tag name.
*/
......@@ -300,6 +306,84 @@
*/
String TAG_DL = "dl";
/**
* HTML &lt;article&gt; tag.
* @since 14.0RC1
*/
String TAG_ARTICLE = "article";
/**
* HTML &lt;aside&gt; tag.
* @since 14.0RC1
*/
String TAG_ASIDE = "aside";
/**
* HTML &lt;details&gt; tag.
* @since 14.0RC1
*/
String TAG_DETAILS = "details";
/**
* HTML &lt;figure&gt; tag.
* @since 14.0RC1
*/
String TAG_FIGURE = "figure";
/**
* HTML &lt;figcaption&gt; tag.
* @since 14.0RC1
*/
String TAG_FIGCAPTION = "figcaption";
/**
* HTML &lt;footer&gt; tag.
* @since 14.0RC1
*/
String TAG_FOOTER = "footer";
/**
* HTML &lt;header&gt; tag.
* @since 14.0RC1
*/
String TAG_HEADER = "header";
/**
* HTML &lt;hgroup&gt; tag.
* @since 14.0RC1
*/
String TAG_HGROUP = "hgroup";
/**
* HTML &lt;main&gt; tag.
* @since 14.0RC1
*/
String TAG_MAIN = "main";
/**
* HTML &lt;menu&gt; tag.
* @since 14.0RC1
*/
String TAG_MENU = "menu";
/**
* HTML &lt;nav&gt; tag.
* @since 14.0RC1
*/
String TAG_NAV = "nav";
/**
* HTML &lt;section&gt; tag.
* @since 14.0RC1
*/
String TAG_SECTION = "section";
/**
* HTML &lt;template&gt; tag.
* @since 14.0RC1
*/
String TAG_TEMPLATE = "template";
/**
* HTML id attribute name.
*/
......
......@@ -35,6 +35,7 @@
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.TagTransformation;
import org.htmlcleaner.TrimAttributeTagTransformation;
import org.htmlcleaner.XWikiDOMSerializer;
import org.w3c.dom.Document;
import org.xwiki.component.annotation.Component;
......@@ -160,8 +161,12 @@ public Document clean(Reader originalHtmlContent, HTMLCleanerConfiguration confi
// Replace by the following when fixed:
// result = new DomSerializer(cleanerProperties, false).createDOM(cleanedNode);
cleanedNode.setDocType(new DoctypeToken("html", "PUBLIC", "-//W3C//DTD XHTML 1.0 Strict//EN",
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
if (getHTMLVersion(configuration) == 5) {
cleanedNode.setDocType(new DoctypeToken("HTML", null, null, null));
} else {
cleanedNode.setDocType(new DoctypeToken("html", "PUBLIC", "-//W3C//DTD XHTML 1.0 Strict//EN",
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
}
result =
new XWikiDOMSerializer(cleanerProperties).createDOM(getAvailableDocumentBuilder(), cleanedNode);
} catch (ParserConfigurationException ex) {
......@@ -232,9 +237,7 @@ private CleanerProperties getDefaultCleanerProperties(HTMLCleanerConfiguration c
defaultProperties.setTransResCharsToNCR(useCharacterReferences);
// By default, we are cleaning XHTML 1.0 code, not HTML 5.
// Note: Tests are broken if we don't set the version 4, meaning that supporting HTML5 requires some work.
// TODO: handle HTML5 correctly (see: https://jira.xwiki.org/browse/XCOMMONS-901)
defaultProperties.setHtmlVersion(4);
defaultProperties.setHtmlVersion(getHTMLVersion(configuration));
// We trim values by default for all attributes but the input value attribute.
// The only way to currently do that is to switch off this flag, and to create a dedicated TagTransformation.
......@@ -283,6 +286,16 @@ private CleanerTransformations getDefaultCleanerTransformations(HTMLCleanerConfi
tt.addAttributeTransformation(HTMLConstants.ATTRIBUTE_STYLE, "text-align:center");
defaultTransformations.addTransformation(tt);
if (getHTMLVersion(configuration) == 5) {
// Font tags are removed before the filters are applied in HTML5, we thus need a transformation here.
defaultTransformations.addTransformation(new FontTagTransformation());
tt = new TrimAttributeTagTransformation(HTMLConstants.TAG_TT,
HTMLConstants.TAG_SPAN);
tt.addAttributeTransformation(HTMLConstants.ATTRIBUTE_CLASS, "${class} monospace");
defaultTransformations.addTransformation(tt);
}
String restricted = configuration.getParameters().get(HTMLCleanerConfiguration.RESTRICTED);
if ("true".equalsIgnoreCase(restricted)) {
......@@ -295,4 +308,19 @@ private CleanerTransformations getDefaultCleanerTransformations(HTMLCleanerConfi
return defaultTransformations;
}
/**
* @param configuration The configuration to parse.
* @return The HTML version specified in the configuration.
* @since 14.0RC1
*/
private int getHTMLVersion(HTMLCleanerConfiguration configuration)
{
String param = configuration.getParameters().get(HTMLCleanerConfiguration.HTML_VERSION);
int htmlVersion = 4;
if (param != null) {
htmlVersion = Integer.parseInt(param);
}
return htmlVersion;
}
}
/*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package org.xwiki.xml.internal.html;
import java.util.HashMap;
import java.util.Map;
import org.htmlcleaner.TagTransformation;
import org.xwiki.xml.html.HTMLConstants;
/**
* Replaces invalid &lt;font&gt; tags with equivalent &lt;span&gt; tags using inline css rules.
*
* @version $Id$
* @since 14.0RC1
*/
public class FontTagTransformation extends TagTransformation
{
/**
* A map holding the translation from 'size' attribute of html font tag to 'font-size' css property.
*/
private static final Map<String, String> FONT_SIZE_MAP;
static {
FONT_SIZE_MAP = new HashMap<>();
FONT_SIZE_MAP.put("1", "0.6em");
FONT_SIZE_MAP.put("2", "0.8em");
FONT_SIZE_MAP.put("3", "1.0em");
FONT_SIZE_MAP.put("4", "1.2em");
FONT_SIZE_MAP.put("5", "1.4em");
FONT_SIZE_MAP.put("6", "1.6em");
FONT_SIZE_MAP.put("7", "1.8em");
FONT_SIZE_MAP.put("-3", "0.4em");
FONT_SIZE_MAP.put("-2", FONT_SIZE_MAP.get("1"));
FONT_SIZE_MAP.put("-1", FONT_SIZE_MAP.get("2"));
FONT_SIZE_MAP.put("+1", FONT_SIZE_MAP.get("4"));
FONT_SIZE_MAP.put("+2", FONT_SIZE_MAP.get("5"));
FONT_SIZE_MAP.put("+3", FONT_SIZE_MAP.get("6"));
}
/**
* Create a transformation from the &lt;font&gt;-tag to the &lt;span&gt;-tag.
*/
public FontTagTransformation()
{
super(HTMLConstants.TAG_FONT, HTMLConstants.TAG_SPAN, false);
}
@Override
public Map<String, String> applyTagTransformations(Map<String, String> attributes)
{
Map<String, String> result = super.applyTagTransformations(attributes);
StringBuilder builder = new StringBuilder();
if (attributes.containsKey(HTMLConstants.ATTRIBUTE_FONTCOLOR)) {
builder.append(String.format("color:%s;", attributes.get(HTMLConstants.ATTRIBUTE_FONTCOLOR)));
}
if (attributes.containsKey(HTMLConstants.ATTRIBUTE_FONTFACE)) {
builder.append(String.format("font-family:%s;", attributes.get(HTMLConstants.ATTRIBUTE_FONTFACE)));
}
if (attributes.containsKey(HTMLConstants.ATTRIBUTE_FONTSIZE)) {
String fontSize = attributes.get(HTMLConstants.ATTRIBUTE_FONTSIZE);
String fontSizeCss = FONT_SIZE_MAP.get(fontSize);
fontSizeCss = (fontSizeCss != null) ? fontSizeCss : fontSize;
builder.append(String.format("font-size:%s;", fontSizeCss));
}
if (attributes.containsKey(HTMLConstants.ATTRIBUTE_STYLE)
&& attributes.get(HTMLConstants.ATTRIBUTE_STYLE).trim().length() == 0)
{
builder.append(attributes.get(HTMLConstants.ATTRIBUTE_STYLE));
}
if (builder.length() > 0) {
result.put(HTMLConstants.ATTRIBUTE_STYLE, builder.toString());
}
return result;
}
}
......@@ -55,11 +55,17 @@ public class BodyFilter extends AbstractHTMLFilter
* "P | %heading; | %list; | %preformatted; | DL | DIV | NOSCRIPT |
* BLOCKQUOTE | FORM | HR | TABLE | FIELDSET | ADDRESS">
* }</pre>
*
* We also use this list for HTML5 where in theory everything is allowed, but we instead only allow flow content
* that is not also phrasing content except for {@code <ins>} and {@code <del>} that were already allowed in HTML 4.
*/
private static final List<String> ALLOWED_BODY_TAGS = Arrays.asList(HTMLConstants.TAG_ADDRESS,
HTMLConstants.TAG_BLOCKQUOTE, HTMLConstants.TAG_DEL, HTMLConstants.TAG_DIV, HTMLConstants.TAG_FIELDSET,
HTMLConstants.TAG_FORM, HTMLConstants.TAG_HR, HTMLConstants.TAG_INS, HTMLConstants.TAG_NOSCRIPT,
HTMLConstants.TAG_P, HTMLConstants.TAG_PRE, HTMLConstants.TAG_SCRIPT, HTMLConstants.TAG_TABLE,
HTMLConstants.TAG_ARTICLE, HTMLConstants.TAG_ASIDE, HTMLConstants.TAG_BLOCKQUOTE, HTMLConstants.TAG_DEL,
HTMLConstants.TAG_DETAILS, HTMLConstants.TAG_DIV, HTMLConstants.TAG_FIELDSET, HTMLConstants.TAG_FIGURE,
HTMLConstants.TAG_FOOTER, HTMLConstants.TAG_FORM, HTMLConstants.TAG_HEADER, HTMLConstants.TAG_HGROUP,
HTMLConstants.TAG_HR, HTMLConstants.TAG_INS, HTMLConstants.TAG_MAIN, HTMLConstants.TAG_MENU,
HTMLConstants.TAG_NAV, HTMLConstants.TAG_NOSCRIPT, HTMLConstants.TAG_P, HTMLConstants.TAG_PRE,
HTMLConstants.TAG_SCRIPT, HTMLConstants.TAG_SECTION, HTMLConstants.TAG_TABLE, HTMLConstants.TAG_TEMPLATE,
HTMLConstants.TAG_H1, HTMLConstants.TAG_H2, HTMLConstants.TAG_H3, HTMLConstants.TAG_H4, HTMLConstants.TAG_H5,
HTMLConstants.TAG_H6, HTMLConstants.TAG_DL, HTMLConstants.TAG_OL, HTMLConstants.TAG_UL);
......
......@@ -19,17 +19,21 @@
*/
package org.xwiki.xml.internal.html.filter;
import java.util.HashMap;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import javax.inject.Named;
import javax.inject.Singleton;
import org.apache.xerces.util.DOMUtil;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xwiki.component.annotation.Component;
import org.xwiki.xml.html.filter.AbstractHTMLFilter;
import org.xwiki.xml.internal.html.FontTagTransformation;
/**
* Replaces invalid &lt;font&gt; tags with equivalent &lt;span&gt; tags using inline css rules.
......@@ -42,28 +46,6 @@
@Singleton
public class FontFilter extends AbstractHTMLFilter
{
/**
* A map holding the translation from 'size' attribute of html font tag to 'font-size' css property.
*/
private static final Map<String, String> FONT_SIZE_MAP;
static {
FONT_SIZE_MAP = new HashMap<>();
FONT_SIZE_MAP.put("1", "0.6em");
FONT_SIZE_MAP.put("2", "0.8em");
FONT_SIZE_MAP.put("3", "1.0em");
FONT_SIZE_MAP.put("4", "1.2em");
FONT_SIZE_MAP.put("5", "1.4em");
FONT_SIZE_MAP.put("6", "1.6em");
FONT_SIZE_MAP.put("7", "1.8em");
FONT_SIZE_MAP.put("-3", "0.4em");
FONT_SIZE_MAP.put("-2", FONT_SIZE_MAP.get("1"));
FONT_SIZE_MAP.put("-1", FONT_SIZE_MAP.get("2"));
FONT_SIZE_MAP.put("+1", FONT_SIZE_MAP.get("4"));
FONT_SIZE_MAP.put("+2", FONT_SIZE_MAP.get("5"));
FONT_SIZE_MAP.put("+3", FONT_SIZE_MAP.get("6"));
}
/**
* {@inheritDoc}
*
......@@ -76,25 +58,13 @@ public void filter(Document document, Map<String, String> cleaningParameters)
for (Element fontTag : fontTags) {
Element span = document.createElement(TAG_SPAN);
moveChildren(fontTag, span);
StringBuilder builder = new StringBuilder();
if (fontTag.hasAttribute(ATTRIBUTE_FONTCOLOR)) {
builder.append(String.format("color:%s;", fontTag.getAttribute(ATTRIBUTE_FONTCOLOR)));
}
if (fontTag.hasAttribute(ATTRIBUTE_FONTFACE)) {
builder.append(String.format("font-family:%s;", fontTag.getAttribute(ATTRIBUTE_FONTFACE)));
}
if (fontTag.hasAttribute(ATTRIBUTE_FONTSIZE)) {
String fontSize = fontTag.getAttribute(ATTRIBUTE_FONTSIZE);
String fontSizeCss = FONT_SIZE_MAP.get(fontSize);
fontSizeCss = (fontSizeCss != null) ? fontSizeCss : fontSize;
builder.append(String.format("font-size:%s;", fontSizeCss));
}
if (fontTag.hasAttribute(ATTRIBUTE_STYLE) && fontTag.getAttribute(ATTRIBUTE_STYLE).trim().length() == 0) {
builder.append(fontTag.getAttribute(ATTRIBUTE_STYLE));
}
if (builder.length() > 0) {
span.setAttribute(ATTRIBUTE_STYLE, builder.toString());
}
Map<String, String> attributes =
Arrays.stream(DOMUtil.getAttrs(fontTag)).collect(Collectors.toMap(Attr::getName,
Attr::getValue));
(new FontTagTransformation()).applyTagTransformations(attributes).forEach(span::setAttribute);
fontTag.getParentNode().insertBefore(span, fontTag);
fontTag.getParentNode().removeChild(fontTag);
}
......
......@@ -30,6 +30,7 @@
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.w3c.dom.Document;
......@@ -83,7 +84,44 @@
private static final String FOOTER = "</body></html>\n";
@InjectMockComponents
private DefaultHTMLCleaner cleaner;
protected DefaultHTMLCleaner cleaner;
protected HTMLCleanerConfiguration cleanerConfiguration;
/**
* @return The expected XHTML 1.0 header.
*/
public String getHeader()
{
return HEADER;
}
/**
* @return The expected full XHTML 1.0 header up to &lt;body&gt;.
*/
public String getHeaderFull()
{
return HEADER_FULL;
}
/**
* Cleans using the cleaner configuration {@link DefaultHTMLCleanerTest#cleanerConfiguration}.
*
* Ensures that always the correct configuration is used and allows executing the same tests for HTML 4 and HTML 5.
*
* @param originalHtmlContent The content to clean as string.
* @return The cleaned document.
*/
protected Document clean(String originalHtmlContent)
{
return this.cleaner.clean(new StringReader(originalHtmlContent), cleanerConfiguration);
}
@BeforeEach
void setUpCleaner()
{
this.cleanerConfiguration = this.cleaner.getDefaultConfiguration();
}
@Test
void elementExpansion()
......@@ -248,12 +286,11 @@ void styleAndCData()
@Test
void explicitFilterList()
{
HTMLCleanerConfiguration configuration = this.cleaner.getDefaultConfiguration();
configuration.setFilters(Collections.emptyList());
String result = HTMLUtils.toString(this.cleaner.clean(new StringReader("something"), configuration));
this.cleanerConfiguration.setFilters(Collections.emptyList());
String result = HTMLUtils.toString(clean("something"));
// Note that if the default Body filter had been executed the result would have been:
// <p>something</p>.
assertEquals(HEADER_FULL + "something" + FOOTER, result);
assertEquals(getHeaderFull() + "something" + FOOTER, result);
}
/**
......@@ -262,28 +299,26 @@ void explicitFilterList()
@Test
void restrictedHtml()
{
HTMLCleanerConfiguration configuration = this.cleaner.getDefaultConfiguration();
Map<String, String> parameters = new HashMap<>();
parameters.putAll(configuration.getParameters());
Map<String, String> parameters = new HashMap<>(this.cleanerConfiguration.getParameters());
parameters.put("restricted", "true");
configuration.setParameters(parameters);
Document document = this.cleaner.clean(new StringReader("<script>alert(\"foo\")</script>"), configuration);
this.cleanerConfiguration.setParameters(parameters);
Document document = clean("<script>alert(\"foo\")</script>");
String textContent =
document.getElementsByTagName("pre").item(0).getTextContent();
assertEquals("alert(\"foo\")", textContent);
String result = HTMLUtils.toString(document);
assertEquals(HEADER_FULL + "<pre>alert(\"foo\")</pre>" + FOOTER, result);
assertEquals(getHeaderFull() + "<pre>alert(\"foo\")</pre>" + FOOTER, result);
document = this.cleaner.clean(new StringReader("<style>p {color:white;}</style>"), configuration);
document = clean("<style>p {color:white;}</style>");
textContent =
document.getElementsByTagName("pre").item(0).getTextContent();
assertEquals("p {color:white;}", textContent);
result = HTMLUtils.toString(document);
assertEquals(HEADER_FULL + "<pre>p {color:white;}</pre>" + FOOTER, result);
assertEquals(getHeaderFull() + "<pre>p {color:white;}</pre>" + FOOTER, result);
}
/**
......@@ -292,7 +327,7 @@ void restrictedHtml()
@Test
void fullXHTMLHeader()
{
assertHTML("<p>test</p>", HEADER_FULL + "<p>test</p>" + FOOTER);
assertHTML("<p>test</p>", getHeaderFull() + "<p>test</p>" + FOOTER);
}
/**
......@@ -303,12 +338,11 @@ public void duplicateIds(ComponentManager componentManager) throws Exception
{
String actual = "<p id=\"x\">1</p><p id=\"xy\">2</p><p id=\"x\">3</p>";
String expected = "<p id=\"x\">1</p><p id=\"xy\">2</p><p id=\"x0\">3</p>";
HTMLCleanerConfiguration config = this.cleaner.getDefaultConfiguration();
List<HTMLFilter> filters = new ArrayList<>(config.getFilters());
List<HTMLFilter> filters = new ArrayList<>(this.cleanerConfiguration.getFilters());
filters.add(componentManager.getInstance(HTMLFilter.class, "uniqueId"));
config.setFilters(filters);
assertEquals(HEADER_FULL + expected + FOOTER,
HTMLUtils.toString(this.cleaner.clean(new StringReader(actual), config)));
this.cleanerConfiguration.setFilters(filters);
assertEquals(getHeaderFull() + expected + FOOTER,
HTMLUtils.toString(clean(actual)));
}
/**
......@@ -322,7 +356,7 @@ void cleanSVGTags() throws Exception
"<p>before</p>\n" + "<p><svg xmlns=\"http://www.w3.org/2000/svg\" version=\"1.1\">\n"
+ "<circle cx=\"100\" cy=\"50\" fill=\"red\" r=\"40\" stroke=\"black\" stroke-width=\"2\"></circle>\n"
+ "</svg></p>\n" + "<p>after</p>\n";
assertHTML(input, HEADER_FULL + input + FOOTER);
assertHTML(input, getHeaderFull() + input + FOOTER);
}
/**
......@@ -348,8 +382,8 @@ void cleanTitleWithNamespace()
+ " <title>SVG Title Demo example</title>\n"
+ " <rect height=\"50\" style=\"fill:none; stroke:blue; stroke-width:1px\" width=\"200\" x=\"10\" "
+ "y=\"10\"></rect>\n" + " </g>\n" + " </svg>\n" + " <p>after</p>\n";
assertEquals(HEADER + input + FOOTER,
HTMLUtils.toString(this.cleaner.clean(new StringReader(input))));
assertEquals(getHeader() + input + FOOTER,
HTMLUtils.toString(clean(input)));
}
/**
......@@ -362,14 +396,15 @@ void cleanHTMLTagWithNamespace()
String input = "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head></head><body>";
// Default
assertEquals(HEADER + input + FOOTER,
HTMLUtils.toString(this.cleaner.clean(new StringReader(input))));
assertEquals(getHeader() + input + FOOTER,
HTMLUtils.toString(clean(input)));
// Configured for namespace awareness being false
HTMLCleanerConfiguration config = this.cleaner.getDefaultConfiguration();
config.setParameters(Collections.singletonMap(HTMLCleanerConfiguration.NAMESPACES_AWARE, "false"));
assertEquals(HEADER + "<html><head></head><body>" + FOOTER,
HTMLUtils.toString(this.cleaner.clean(new StringReader(input), config)));
Map<String, String> parameters = new HashMap<>(this.cleanerConfiguration.getParameters());
parameters.put(HTMLCleanerConfiguration.NAMESPACES_AWARE, "false");
this.cleanerConfiguration.setParameters(parameters);
assertEquals(getHeader() + "<html><head></head><body>" + FOOTER,
HTMLUtils.toString(clean(input)));
}