Skip to content

Commit e36c876

Browse files
WAT extractor: do not extract page title from embedded SVG images
- do not use <title> elements embedded in <svg> as page/document title
- use the first non-empty <title> element to set the page/document title. This is required for documents where the <title> is not enclosed in the <head> element. Note: HTML5 allows the <head> element to be omitted, see https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#optional-tags
- overwrite the page/document title with the content of a <title> element inside the <head> element
- for text extraction: define the title element as a block element
1 parent fc11441 commit e36c876

File tree

2 files changed

+25
-4
lines changed

2 files changed

+25
-4
lines changed

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ public class ExtractingParseObserver implements ParseObserver {
2424
Stack<StringBuilder> openAnchorTexts;
2525
StringBuilder textExtract;
2626
String title = null;
27+
boolean inHead = false;
2728
boolean inTitle = false;
2829
boolean inPre = false;
30+
boolean inSVG = false;
2931

3032
protected static String cssUrlPatString =
3133
"url\\s*\\(\\s*([^)\\s]{1,8000}?)\\s*\\)";
@@ -59,7 +61,7 @@ public class ExtractingParseObserver implements ParseObserver {
5961
"button", "canvas", "caption", "col", "colgroup", "dd", "div", "dl", "dt", "embed", "fieldset",
6062
"figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
6163
"li", "map", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table", "tbody",
62-
"textarea", "tfoot", "th", "thead", "tr", "ul", "video" };
64+
"textarea", "tfoot", "th", "thead", "title", "tr", "ul", "video" };
6365
private static final Set<String> blockElements;
6466
/* inline elements whose content is not merged with surrounding words */
6567
private final static String[] INLINE_ELEMENTS_SPACING = { "address", "cite", "details", "datalist", "iframe", "img",
@@ -144,11 +146,17 @@ public void handleTagEmpty(TagNode tag) {
144146
@Override
145147
public void handleTagOpen(TagNode tag) {
146148
String name = tag.getTagName();
147-
if(name.equals("TITLE")) {
149+
if (name.equals("HEAD")) {
150+
inHead = true;
151+
} else if (name.equals("TITLE")) {
148152
inTitle = !tag.isEmptyXmlTag();
149153
return;
150154
} else if (name.equals("PRE")) {
151155
inPre = true;
156+
} else if (name.equals("SVG")) {
157+
inSVG = true;
158+
} else if (name.equals("BODY")) {
159+
inHead = false;
152160
}
153161

154162
if (blockElements.contains(name)) {
@@ -183,9 +191,11 @@ public void handleTagOpen(TagNode tag) {
183191
public void handleTagClose(TagNode tag) {
184192
String name = tag.getTagName();
185193

186-
if(inTitle) {
194+
if (inTitle) {
187195
inTitle = false;
188-
data.setTitle(title);
196+
if (!inSVG && (inHead || !data.hasTitle())) {
197+
data.setTitle(title);
198+
}
189199
title = null;
190200
}
191201

@@ -222,8 +232,12 @@ public void handleTagClose(TagNode tag) {
222232
data.addHref(vals);
223233
}
224234
}
235+
} else if (tag.getTagName().equals("HEAD")) {
236+
inHead = false;
225237
} else if (tag.getTagName().equals("PRE")) {
226238
inPre = false;
239+
} else if (tag.getTagName().equals("SVG")) {
240+
inSVG = false;
227241
}
228242
}
229243

src/main/java/org/archive/resource/html/HTMLMetaData.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,15 @@ private JSONObject getHeader() {
3131
public void setBaseHref(String href) {
3232
putUnlessNull(getHeader(),HTML_BASE, href);
3333
}
34+
3435
public void setTitle(String title) {
3536
putUnlessNull(getHeader(),HTML_TITLE, title);
3637
}
38+
39+
public boolean hasTitle() {
40+
return header != null && header.has(HTML_TITLE);
41+
}
42+
3743
private void putUnlessNull(JSONObject o, String k, String v) {
3844
if(o != null) {
3945
try {
@@ -43,6 +49,7 @@ private void putUnlessNull(JSONObject o, String k, String v) {
4349
}
4450
}
4551
}
52+
4653
public String[] LtoA(List<String> l) {
4754
String[] a = new String[l.size()];
4855
l.toArray(a);

0 commit comments

Comments
 (0)