Skip to content

Commit 48e46d6

Browse files
Merge pull request #42 from commoncrawl/40-ignore-metadata-in-body
WAT extractor: do not add <meta itemprop="..." > from body as metadata
2 parents 1d94164 + b474f5d commit 48e46d6

File tree

3 files changed

+86
-19
lines changed

3 files changed

+86
-19
lines changed

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,24 @@ private static class MetaTagExtractor implements TagExtractor {
669669
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
670670
ArrayList<String> l = getAttrList(node,"name","rel","content","http-equiv","property");
671671
if(l != null) {
672+
if (l.size() == 2) {
673+
if (l.get(0).equals("content")) {
674+
/*
675+
* drop single "content" attributes very likely stemming
676+
* from <meta itemprop="..." content="..."> schema.org
677+
* annotations embedded in the HTML body, see
678+
* https://github.com/commoncrawl/ia-web-commons/issues/40
679+
*/
680+
return;
681+
} else {
682+
/*
683+
* Single key-value metadata pair, e.g. <meta
684+
* name="..."/> (no "content") - no value or something
685+
* went wrong with attribute parsing.
686+
*/
687+
return;
688+
}
689+
}
672690
data.addMeta(l);
673691
}
674692
}

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,24 @@ private void checkTitle(Resource resource, String title) {
166166
}
167167
}
168168

169+
private void checkExtractedAttributes(Resource resource, int metaElements, int metaElementIndex,
170+
String... attributes) throws JSONException {
171+
assertNotNull(resource);
172+
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
173+
JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
174+
assertNotNull(metas);
175+
if (metaElements > -1) {
176+
assertEquals(metaElements, metas.length());
177+
}
178+
JSONObject meta = metas.getJSONObject(metaElementIndex);
179+
assertEquals(attributes.length / 2, meta.length());
180+
for (int i = 0; i < attributes.length; i += 2) {
181+
String key = attributes[i];
182+
assertNotNull(meta.get(key));
183+
assertEquals(attributes[i + 1], meta.get(key));
184+
}
185+
}
186+
169187
private void checkLinks(Resource resource, String[][] expectedLinks) {
170188
assertNotNull(resource);
171189
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
@@ -241,20 +259,6 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
241259
}
242260
}
243261

244-
private void checkExtractHtmlLangAttribute(Resource resource, String... langAttributes)
245-
throws JSONException {
246-
assertNotNull(resource);
247-
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
248-
JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
249-
assertNotNull(metas);
250-
JSONObject meta = metas.getJSONObject(0);
251-
for (int i = 0; i < langAttributes.length; i += 2) {
252-
String key = langAttributes[i];
253-
assertNotNull(meta.get(key));
254-
assertEquals(meta.get(key), langAttributes[i+1]);
255-
}
256-
}
257-
258262
public void testLinkExtraction() throws ResourceParseException, IOException {
259263
String testFileName = "link-extraction-test.warc";
260264
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
@@ -434,11 +438,21 @@ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException,
434438
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
435439
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
436440
ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
437-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
438-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "zh-CN");
439-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "cs-cz");
440-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
441-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/xml:lang", "content", "es-MX");
441+
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "en");
442+
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "zh-CN");
443+
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "cs-cz");
444+
checkExtractedAttributes(extractor.getNext(), 2, 0, "name", "HTML@/lang", "content", "en");
445+
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/xml:lang", "content", "es-MX");
446+
}
447+
448+
public void testBodyMetaElements() throws ResourceParseException, IOException {
449+
String testFileName = "meta-itemprop.warc";
450+
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
451+
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
452+
ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
453+
Resource resource = extractor.getNext();
454+
checkExtractedAttributes(resource, 2, 0, "name", "HTML@/lang", "content", "en");
455+
checkExtractedAttributes(resource, 2, 1, "name", "robots", "content", "index,follow");
442456
}
443457

444458
public void testHtmlParserEntityDecoding() {
meta-itemprop.warc (new test resource; directory path not shown in this view)

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
WARC/1.0
2+
WARC-Type: response
3+
WARC-Date: 2024-12-05T10:47:02Z
4+
Content-Length: 710
5+
Content-Type: application/http; msgtype=response
6+
WARC-Target-URI: https://www.example.org/
7+
WARC-Identified-Payload-Type: text/html
8+
9+
HTTP/1.1 200
10+
content-type: text/html; charset=UTF-8
11+
12+
<!DOCTYPE html>
13+
<html lang="en">
14+
<head>
15+
<meta charset="UTF-8">
16+
<meta name=robots content="index,follow">
17+
<title>Test</title>
18+
</head>
19+
<body>
20+
<!-- from https://schema.org/docs/gs.html#advanced_missing -->
21+
<div itemscope itemtype="https://schema.org/Offer">
22+
<span itemprop="name">Blend-O-Matic</span>
23+
<span itemprop="price">$19.95</span>
24+
<div itemprop="reviews" itemscope itemtype="https://schema.org/AggregateRating">
25+
<img src="four-stars.jpg" />
26+
<meta itemprop="ratingValue" content="4" />
27+
<meta itemprop="bestRating" content="5" />
28+
Based on <span itemprop="ratingCount">25</span> user ratings
29+
</div>
30+
</div>
31+
</body>
32+
</html>
33+
34+
35+

0 commit comments

Comments
 (0)