Skip to content

Commit ed57699

Browse files
Merge pull request #47 from commoncrawl/ia-web-commons-45-46
Improve WET text extraction, address #45 and #46
2 parents 1446d35 + 01052bc commit ed57699

File tree

3 files changed: 121 additions and 34 deletions

src/main/java/org/archive/format/text/html/CDATALexer.java

Lines changed: 73 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,96 @@
11
package org.archive.format.text.html;
22

33
import org.htmlparser.Node;
4+
import org.htmlparser.Text;
45
import org.htmlparser.lexer.Lexer;
56
import org.htmlparser.util.ParserException;
67

8+
import static org.archive.format.text.html.NodeUtils.SCRIPT_TAG_NAME;
9+
import static org.archive.format.text.html.NodeUtils.STYLE_TAG_NAME;
10+
711
public class CDATALexer extends Lexer {
812
private static final long serialVersionUID = -8513653556979405106L;
913
private Node cached;
10-
private boolean inCSS;
1114
private boolean inJS;
12-
private boolean cachedJS = false;
15+
private boolean inCSS;
16+
17+
private static enum STATE { DEFAULT, START_JS, START_CSS };
18+
private STATE state = STATE.DEFAULT;
19+
20+
private int start = -1;
21+
private int end = -1;
1322

1423
@Override
1524
public Node nextNode() throws ParserException {
16-
inJS = false;
17-
inCSS = false;
18-
if(cached != null) {
25+
if (cached != null) {
26+
inJS = inCSS = false;
1927
Node tmp = cached;
2028
cached = null;
21-
inJS = cachedJS;
22-
inCSS = !cachedJS;
2329
return tmp;
2430
}
25-
Node got = super.nextNode();
26-
if(NodeUtils.isNonEmptyOpenTagNodeNamed(got, "SCRIPT")) {
27-
cached = super.parseCDATA(true);
28-
cachedJS = true;
29-
} else if (NodeUtils.isNonEmptyOpenTagNodeNamed(got, "STYLE")) {
30-
cached = super.parseCDATA(true);
31-
cachedJS = false;
31+
Node got = null;
32+
switch (state) {
33+
case START_JS:
34+
got = super.parseCDATA(false);
35+
if (got != null) {
36+
inJS = true;
37+
}
38+
break;
39+
case START_CSS:
40+
got = super.parseCDATA(false);
41+
if (got != null) {
42+
inCSS = true;
43+
}
44+
break;
45+
default:
46+
break;
47+
}
48+
if (got != null) {
49+
Text t = (Text) got;
50+
start = t.getStartPosition();
51+
end = t.getEndPosition();
52+
while ((t = (Text) super.parseCDATA(false)) != null) {
53+
end = t.getEndPosition();
54+
}
55+
while ((got = super.nextNode()) != null) {
56+
if (inJS) {
57+
if (NodeUtils.isCloseTagNodeNamed(got, SCRIPT_TAG_NAME)) {
58+
cached = got;
59+
state = STATE.DEFAULT;
60+
return createStringNode(getPage(), start, end);
61+
} else {
62+
end = got.getEndPosition();
63+
}
64+
} else if (inCSS) {
65+
if (NodeUtils.isCloseTagNodeNamed(got, STYLE_TAG_NAME)) {
66+
cached = got;
67+
state = STATE.DEFAULT;
68+
return createStringNode(getPage(), start, end);
69+
} else {
70+
end = got.getEndPosition();
71+
}
72+
}
73+
}
74+
t = createStringNode(getPage(), start, end);
75+
state = STATE.DEFAULT;
76+
start = end = -1;
77+
return t;
78+
}
79+
got = super.nextNode();
80+
if (NodeUtils.isNonEmptyOpenTagNodeNamed(got, SCRIPT_TAG_NAME)) {
81+
state = STATE.START_JS;
82+
} else if (NodeUtils.isNonEmptyOpenTagNodeNamed(got, STYLE_TAG_NAME)) {
83+
state = STATE.START_CSS;
84+
} else if (NodeUtils.isCloseTagNodeNamed(got, SCRIPT_TAG_NAME)) {
85+
state = STATE.DEFAULT;
86+
inJS = false;
87+
} else if (NodeUtils.isCloseTagNodeNamed(got, STYLE_TAG_NAME)) {
88+
state = STATE.DEFAULT;
89+
inCSS = false;
3290
}
3391
return got;
3492
}
93+
3594
public boolean inJS() {
3695
return inJS;
3796
}

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,10 @@ public class ExtractingParseObserver implements ParseObserver {
5959
private final static int MAX_TEXT_LEN = 128;
6060

6161
private final static String[] BLOCK_ELEMENTS = { "address", "article", "aside", "blockquote", "body", "br",
62-
"button", "canvas", "caption", "col", "colgroup", "dd", "div", "dl", "dt", "embed", "fieldset",
62+
"button", "canvas", "caption", "center", "col", "colgroup", "dd", "div", "dl", "dt", "embed", "fieldset",
6363
"figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
64-
"li", "map", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table", "tbody",
65-
"textarea", "tfoot", "th", "thead", "title", "tr", "ul", "video" };
64+
"li", "map", "noframes", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table",
65+
"tbody", "textarea", "tfoot", "th", "thead", "title", "tr", "ul", "video" };
6666
private static final Set<String> blockElements;
6767
/* inline elements whose content is not merged with surrounding words */
6868
private final static String[] INLINE_ELEMENTS_SPACING = { "address", "cite", "details", "datalist", "iframe", "img",

src/test/java/org/archive/format/text/html/CDATALexerTest.java

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
package org.archive.format.text.html;
22

3-
import org.archive.format.text.html.CDATALexer;
4-
import org.archive.format.text.html.NodeUtils;
53
import org.htmlparser.Node;
64
import org.htmlparser.lexer.Page;
7-
//import org.htmlparser.nodes.RemarkNode;
85
import org.htmlparser.nodes.TagNode;
96
import org.htmlparser.nodes.TextNode;
107
import org.htmlparser.util.ParserException;
@@ -72,20 +69,38 @@ public void testInCSS() throws ParserException {
7269
assertFalse(l.inJS());
7370
assertTrue(NodeUtils.isCloseTagNodeNamed(n, "STYLE"));
7471
}
72+
73+
public void testInCSSEmpty() throws ParserException {
74+
l = makeLexer("<style></style>");
75+
assertFalse(l.inCSS());
76+
assertFalse(l.inJS());
77+
n = l.nextNode();
78+
assertFalse(l.inCSS());
79+
assertFalse(l.inJS());
80+
assertTrue(NodeUtils.isNonEmptyOpenTagNodeNamed(n, "STYLE"));
81+
n = l.nextNode();
82+
assertFalse(l.inCSS());
83+
assertFalse(l.inJS());
84+
assertTrue(NodeUtils.isCloseTagNodeNamed(n, "STYLE"));
85+
}
86+
87+
public void testInCSSBachelorTag() throws ParserException {
88+
l = makeLexer("<style />");
89+
assertFalse(l.inCSS());
90+
assertFalse(l.inJS());
91+
n = l.nextNode();
92+
assertFalse(l.inCSS());
93+
assertFalse(l.inJS());
94+
assertTrue(NodeUtils.isTagNode(n));
95+
assertTrue(((TagNode) n).isEmptyXmlTag());
96+
assertEquals(((TagNode) n).getTagName(), "STYLE");
97+
n = l.nextNode();
98+
assertFalse(l.inCSS());
99+
assertFalse(l.inJS());
100+
assertNull(n);
101+
}
75102

76103
public void testInJSComment() throws ParserException {
77-
78-
// dumpParse("<script>//<!--\n foo bar baz\n //--></script>");
79-
// dumpParse("<script><!-- foo bar baz --></script>");
80-
// dumpParse("<script>//<!-- foo bar baz --></script>");
81-
// dumpParse("<script><!-- foo bar baz //--></script>");
82-
// dumpParse("<script>\n//<!-- foo bar baz\n //--></script>");
83-
// dumpParse("<script> if(1 < 2) { foo(); } </script>");
84-
// dumpParse("<script> if(1 <n) { foo(); } </script>");
85-
// dumpParse("<script> document.write(\"<b>bold</b>\"); </script>");
86-
// dumpParse("<script> document.write(\"<script>bold</script>\"); </script>");
87-
// dumpParse("<script> <![CDATA[\n if(i<n) { foo() } // content of your Javascript goes here \n ]]> </script>");
88-
89104
assertJSContentWorks("//<!--\n foo bar baz\n //-->");
90105
assertJSContentWorks("<!-- foo bar baz -->");
91106
assertJSContentWorks("//<!-- foo bar baz -->");
@@ -94,9 +109,22 @@ public void testInJSComment() throws ParserException {
94109
assertJSContentWorks("if(1 < 2) { foo(); } ");
95110
assertJSContentWorks("if(1 <n) { foo(); } ");
96111
assertJSContentWorks("document.write(\"<b>bold</b>\"); ");
97-
assertJSContentWorks("document.write(\"<script>bold</script>\"); ");
112+
assertJSContentWorks("document.write(\"<script>bold<\\/script>\"); ");
98113
assertJSContentWorks("<![CDATA[\n if(i<n) { foo() } // a comment \n ]]> ");
99-
114+
assertJSContentWorks("var script = '<script>alert(\"hello, world!\")<\\/script>'; console.log(script); ");
115+
assertJSContentWorks("\n"
116+
+ " var _hmt = _hmt || [];\n"
117+
+ " (function() {\n"
118+
+ " var hm = document.createElement(\"script\");\n"
119+
+ " hm.src = \"https://#/hm.js?aba99f7fd4116f6c8c3d1650e8f8ec17\";\n"
120+
+ " var s = document.getElementsByTagName(\"script\")[0]; \n"
121+
+ " s.parentNode.insertBefore(hm, s);\n"
122+
+ " })();\n"
123+
+ " ");
124+
/*
125+
* The parser fails on unfinished HTML comments inside script or style.
126+
*/
127+
// assertJSContentWorks("<!-- foo bar baz ");
100128
}
101129

102130
private void assertJSContentWorks(String js) throws ParserException {

0 commit comments

Comments
 (0)