Commit 2e577282 authored by Rémi GAUTHIER's avatar Rémi GAUTHIER

!3 Add space after, not before, to avoid PDF display problem

parent 1d170cbc
Pipeline #1907 passed with stages
in 3 minutes and 8 seconds
......@@ -137,8 +137,8 @@ public class HTMLCleaner {
final List<Element> pElementList = divElement.select("p");
if (pElementList.size() > 1) {
final List<Element> pElementButLastList = pElementList.subList(1, pElementList.size());
HTMLCleaner.addJumpLine(pElementButLastList);
final List<Element> pElementButLastList = pElementList.subList(0, pElementList.size() - 1);
HTMLCleaner.addEndSpace(pElementButLastList);
}
}
document.html(document.html().replaceAll("<br />", "&nbsp;<br />"));
......@@ -147,12 +147,18 @@ public class HTMLCleaner {
}
private static void addJumpLine(final List<Element> pElementButLastList) {
/*
* FIXME Should it be done here?
* Useful when no space is added between two lines that hold two separate words.
* The case "...de Defensa.El papel..." should be treated in the NLP, not here.
* It is problematic in some cases, for example in French when a dash splits a long word and it continues on a new line.
*/
private static void addEndSpace(final List<Element> pElementButLastList) {
for (final Element el : pElementButLastList) {
if (StringUtils.isNotBlank(el.html().replaceAll("&nbsp;", " "))) {
el.html("&nbsp;" + el.html());
el.html(el.html() + "&nbsp;");
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment