Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve WET text extraction, address #45 and #46 (#47)

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 73 additions & 14 deletions src/main/java/org/archive/format/text/html/CDATALexer.java
Original file line number Diff line number Diff line change
@@ -1,37 +1,96 @@
package org.archive.format.text.html;

import org.htmlparser.Node;
import org.htmlparser.Text;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.util.ParserException;

import static org.archive.format.text.html.NodeUtils.SCRIPT_TAG_NAME;
import static org.archive.format.text.html.NodeUtils.STYLE_TAG_NAME;

public class CDATALexer extends Lexer {
private static final long serialVersionUID = -8513653556979405106L;
private Node cached;
private boolean inCSS;
private boolean inJS;
private boolean cachedJS = false;
private boolean inCSS;

private static enum STATE { DEFAULT, START_JS, START_CSS };
private STATE state = STATE.DEFAULT;

private int start = -1;
private int end = -1;

@Override
public Node nextNode() throws ParserException {
inJS = false;
inCSS = false;
if(cached != null) {
if (cached != null) {
inJS = inCSS = false;
Node tmp = cached;
cached = null;
inJS = cachedJS;
inCSS = !cachedJS;
return tmp;
}
Node got = super.nextNode();
if(NodeUtils.isNonEmptyOpenTagNodeNamed(got, "SCRIPT")) {
cached = super.parseCDATA(true);
cachedJS = true;
} else if (NodeUtils.isNonEmptyOpenTagNodeNamed(got, "STYLE")) {
cached = super.parseCDATA(true);
cachedJS = false;
Node got = null;
switch (state) {
case START_JS:
got = super.parseCDATA(false);
if (got != null) {
inJS = true;
}
break;
case START_CSS:
got = super.parseCDATA(false);
if (got != null) {
inCSS = true;
}
break;
default:
break;
}
if (got != null) {
Text t = (Text) got;
start = t.getStartPosition();
end = t.getEndPosition();
while ((t = (Text) super.parseCDATA(false)) != null) {
end = t.getEndPosition();
}
while ((got = super.nextNode()) != null) {
if (inJS) {
if (NodeUtils.isCloseTagNodeNamed(got, SCRIPT_TAG_NAME)) {
cached = got;
state = STATE.DEFAULT;
return createStringNode(getPage(), start, end);
} else {
end = got.getEndPosition();
}
} else if (inCSS) {
if (NodeUtils.isCloseTagNodeNamed(got, STYLE_TAG_NAME)) {
cached = got;
state = STATE.DEFAULT;
return createStringNode(getPage(), start, end);
} else {
end = got.getEndPosition();
}
}
}
t = createStringNode(getPage(), start, end);
state = STATE.DEFAULT;
start = end = -1;
return t;
}
got = super.nextNode();
if (NodeUtils.isNonEmptyOpenTagNodeNamed(got, SCRIPT_TAG_NAME)) {
state = STATE.START_JS;
} else if (NodeUtils.isNonEmptyOpenTagNodeNamed(got, STYLE_TAG_NAME)) {
state = STATE.START_CSS;
} else if (NodeUtils.isCloseTagNodeNamed(got, SCRIPT_TAG_NAME)) {
state = STATE.DEFAULT;
inJS = false;
} else if (NodeUtils.isCloseTagNodeNamed(got, STYLE_TAG_NAME)) {
state = STATE.DEFAULT;
inCSS = false;
}
return got;
}

/**
 * Returns the current value of the {@code inJS} flag that
 * {@link #nextNode()} updates while it walks the document.
 * NOTE(review): presumably true only when the node just returned was the
 * content of a SCRIPT element - confirm against nextNode().
 *
 * @return the current {@code inJS} flag
 */
public boolean inJS() {
    return this.inJS;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@ public class ExtractingParseObserver implements ParseObserver {
private final static int MAX_TEXT_LEN = 128;

private final static String[] BLOCK_ELEMENTS = { "address", "article", "aside", "blockquote", "body", "br",
"button", "canvas", "caption", "col", "colgroup", "dd", "div", "dl", "dt", "embed", "fieldset",
"button", "canvas", "caption", "center", "col", "colgroup", "dd", "div", "dl", "dt", "embed", "fieldset",
"figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
"li", "map", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table", "tbody",
"textarea", "tfoot", "th", "thead", "title", "tr", "ul", "video" };
"li", "map", "noframes", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table",
"tbody", "textarea", "tfoot", "th", "thead", "title", "tr", "ul", "video" };
private static final Set<String> blockElements;
/* inline elements whose content is not merged with the surrounding words */
private final static String[] INLINE_ELEMENTS_SPACING = { "address", "cite", "details", "datalist", "iframe", "img",
Expand Down
62 changes: 45 additions & 17 deletions src/test/java/org/archive/format/text/html/CDATALexerTest.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
package org.archive.format.text.html;

import org.archive.format.text.html.CDATALexer;
import org.archive.format.text.html.NodeUtils;
import org.htmlparser.Node;
import org.htmlparser.lexer.Page;
//import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.ParserException;
Expand Down Expand Up @@ -72,20 +69,38 @@ public void testInCSS() throws ParserException {
assertFalse(l.inJS());
assertTrue(NodeUtils.isCloseTagNodeNamed(n, "STYLE"));
}

/**
 * An empty {@code <style></style>} pair must produce exactly the opening
 * tag followed by the closing tag, and the lexer must not report CSS or JS
 * content before, between, or after those two nodes.
 */
public void testInCSSEmpty() throws ParserException {
    l = makeLexer("<style></style>");

    // Nothing consumed yet: both content flags are clear.
    assertFalse(l.inJS());
    assertFalse(l.inCSS());

    // First node is the opening STYLE tag; it is markup, not CSS content.
    n = l.nextNode();
    assertTrue(NodeUtils.isNonEmptyOpenTagNodeNamed(n, "STYLE"));
    assertFalse(l.inJS());
    assertFalse(l.inCSS());

    // With no text in between, the very next node is the closing tag.
    n = l.nextNode();
    assertTrue(NodeUtils.isCloseTagNodeNamed(n, "STYLE"));
    assertFalse(l.inJS());
    assertFalse(l.inCSS());
}

/**
 * A self-closing {@code <style />} tag must come back as a single empty XML
 * tag named STYLE, followed by end of input ({@code null}), without the
 * lexer ever reporting CSS or JS content.
 */
public void testInCSSBachelorTag() throws ParserException {
    l = makeLexer("<style />");
    assertFalse(l.inCSS());
    assertFalse(l.inJS());

    n = l.nextNode();
    assertFalse(l.inCSS());
    assertFalse(l.inJS());
    assertTrue(NodeUtils.isTagNode(n));
    // Cast once, after the type check above has passed.
    TagNode tag = (TagNode) n;
    assertTrue(tag.isEmptyXmlTag());
    // Expected value goes first so a failure reports
    // "expected:<STYLE> but was:<...>" rather than the reverse.
    assertEquals("STYLE", tag.getTagName());

    // The self-closing tag is the only node; the lexer is now exhausted.
    n = l.nextNode();
    assertFalse(l.inCSS());
    assertFalse(l.inJS());
    assertNull(n);
}

public void testInJSComment() throws ParserException {

// dumpParse("<script>//<!--\n foo bar baz\n //--></script>");
// dumpParse("<script><!-- foo bar baz --></script>");
// dumpParse("<script>//<!-- foo bar baz --></script>");
// dumpParse("<script><!-- foo bar baz //--></script>");
// dumpParse("<script>\n//<!-- foo bar baz\n //--></script>");
// dumpParse("<script> if(1 < 2) { foo(); } </script>");
// dumpParse("<script> if(1 <n) { foo(); } </script>");
// dumpParse("<script> document.write(\"<b>bold</b>\"); </script>");
// dumpParse("<script> document.write(\"<script>bold</script>\"); </script>");
// dumpParse("<script> <![CDATA[\n if(i<n) { foo() } // content of your Javascript goes here \n ]]> </script>");

assertJSContentWorks("//<!--\n foo bar baz\n //-->");
assertJSContentWorks("<!-- foo bar baz -->");
assertJSContentWorks("//<!-- foo bar baz -->");
Expand All @@ -94,9 +109,22 @@ public void testInJSComment() throws ParserException {
assertJSContentWorks("if(1 < 2) { foo(); } ");
assertJSContentWorks("if(1 <n) { foo(); } ");
assertJSContentWorks("document.write(\"<b>bold</b>\"); ");
assertJSContentWorks("document.write(\"<script>bold</script>\"); ");
assertJSContentWorks("document.write(\"<script>bold<\\/script>\"); ");
assertJSContentWorks("<![CDATA[\n if(i<n) { foo() } // a comment \n ]]> ");

assertJSContentWorks("var script = '<script>alert(\"hello, world!\")<\\/script>'; console.log(script); ");
assertJSContentWorks("\n"
+ " var _hmt = _hmt || [];\n"
+ " (function() {\n"
+ " var hm = document.createElement(\"script\");\n"
+ " hm.src = \"https://#/hm.js?aba99f7fd4116f6c8c3d1650e8f8ec17\";\n"
+ " var s = document.getElementsByTagName(\"script\")[0]; \n"
+ " s.parentNode.insertBefore(hm, s);\n"
+ " })();\n"
+ " ");
/*
* The parser fails on unfinished HTML comments inside script or style.
*/
// assertJSContentWorks("<!-- foo bar baz ");
}

private void assertJSContentWorks(String js) throws ParserException {
Expand Down
Loading