Skip to content

Commit

Permalink
Merge pull request #44 from commoncrawl/43-duplicated-payload-metadata
Browse files: browse the repository at this point in the history
WAT: Duplicated payload metadata values for "Actual-Content-Length" and "Trailing-Slop-Length"
  • Loading branch information
sebastian-nagel authored Dec 13, 2024
2 parents bf9a9e0 + 8f4c43f commit 3628dda
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 8 deletions.
2 changes: 2 additions & 0 deletions src/main/java/org/archive/resource/arc/ARCResource.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,12 @@ public ARCResource(MetaData metaData, ResourceContainer container,
}
}

/**
 * Provides the stream through which this resource's content is read.
 *
 * @return a new {@code EOFNotifyingInputStream} built from {@code digIS}
 *         and this resource
 */
@Override
public InputStream getInputStream() {
    final InputStream notifyingStream = new EOFNotifyingInputStream(digIS, this);
    return notifyingStream;
}

@Override
public void notifyEOF() throws IOException {
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
String digString = Base32.encode(digIS.getMessageDigest().digest());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public HTTPHeadersResourceFactory(String name, String type) {
parser = new HttpHeaderParser();
}

@Override
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException,
IOException {
Expand All @@ -40,9 +41,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
parentMetaData.putBoolean(HTTP_HEADERS_CORRUPT, true);
}
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);

parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
}
long trailingSlopBytes = StreamCopy.readToEOF(is);
if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
}
if(type != null) {
parentMetaData.putString(PAYLOAD_CONTENT_TYPE, type);
}
Expand Down
14 changes: 11 additions & 3 deletions src/main/java/org/archive/resource/warc/WARCResource.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public WARCResource(MetaData metaData, ResourceContainer container,
countingIS = new CountingInputStream(
ByteStreams.limit(response, length));
} else {
throw new ResourceParseException(null);
throw new ResourceParseException(new Exception("Zero or negative length: " + length));
}
try {
digIS = new DigestInputStream(countingIS,
Expand All @@ -63,14 +63,18 @@ public WARCResource(MetaData metaData, ResourceContainer container,
}
}

/**
 * Provides the stream through which this resource's content is read.
 *
 * @return a new {@code EOFNotifyingInputStream} built from {@code digIS}
 *         and this resource
 */
@Override
public InputStream getInputStream() {
    final InputStream notifyingStream = new EOFNotifyingInputStream(digIS, this);
    return notifyingStream;
}

@Override
public void notifyEOF() throws IOException {
String digString = Base32.encode(digIS.getMessageDigest().digest());
if(container.isCompressed()) {
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
if (!metaData.has(PAYLOAD_LENGTH) || countingIS.getCount() != metaData.getLong(PAYLOAD_LENGTH)) {
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
}
metaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(response));
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
} else {
Expand All @@ -81,13 +85,17 @@ public void notifyEOF() throws IOException {
(PushBackOneByteInputStream) raw;
long numNewlines = StreamCopy.skipChars(pb1bis, CR_NL_CHARS);
if(numNewlines > 0) {
metaData.putLong(PAYLOAD_LENGTH, countingIS.getCount());
long payloadLength = countingIS.getCount();
if (!metaData.has(PAYLOAD_LENGTH) || payloadLength != metaData.getLong(PAYLOAD_LENGTH)) {
metaData.putLong(PAYLOAD_LENGTH, payloadLength);
}
metaData.putLong(PAYLOAD_SLOP_BYTES, numNewlines);
metaData.putString(PAYLOAD_DIGEST, "sha1:"+digString);
}
}
}
}

/**
 * Accessor for the {@code envelope} metadata of this resource.
 *
 * @return the current value of {@code envelope}
 */
public MetaData getEnvelopeMetaData() {
    final MetaData envelopeMetaData = envelope;
    return envelopeMetaData;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public WARCMetaDataResourceFactory() {
parser = new HttpHeaderParser();
}

@Override
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException,
IOException {
Expand All @@ -33,8 +34,13 @@ public Resource getResource(InputStream is, MetaData parentMetaData,
if(headers.isCorrupt()) {
md.putBoolean(WARC_META_FIELDS_CORRUPT, true);
}
parentMetaData.putLong(PAYLOAD_SLOP_BYTES, StreamCopy.readToEOF(is));
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
long trailingSlopBytes = StreamCopy.readToEOF(is);
if (!parentMetaData.has(PAYLOAD_SLOP_BYTES) || trailingSlopBytes > 0) {
parentMetaData.putLong(PAYLOAD_SLOP_BYTES, trailingSlopBytes);
}
if (!parentMetaData.has(PAYLOAD_LENGTH) || bytes != parentMetaData.getLong(PAYLOAD_LENGTH)) {
parentMetaData.putLong(PAYLOAD_LENGTH, bytes);
}
return new WARCMetaDataResource(md,container, headers);

} catch (HttpParseException e) {
Expand Down
48 changes: 48 additions & 0 deletions src/test/java/org/archive/resource/arc/ARCResourceTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package org.archive.resource.arc;


import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;

import java.io.IOException;

import org.archive.extract.ExtractingResourceFactoryMapper;
import org.archive.extract.ExtractingResourceProducer;
import org.archive.extract.ProducerUtils;
import org.archive.extract.ResourceFactoryMapper;
import org.archive.resource.Resource;
import org.archive.resource.ResourceParseException;
import org.archive.resource.ResourceProducer;
import org.archive.util.StreamCopy;

import com.github.openjson.JSONObject;

import junit.framework.TestCase;

/**
 * Checks the payload metadata of every resource extracted from the
 * truncated ARC test file.
 */
public class ARCResourceTest extends TestCase {

    /**
     * Iterates over all resources produced for the ARC test file and, for
     * each one, inspects the {@code Envelope} / {@code Payload-Metadata}
     * object: a payload length, when present, must not be {@code -1}, and a
     * trailing-slop value, when present, must be readable as a long.
     * Every resource stream is read to EOF before the next resource is
     * requested.
     *
     * @throws ResourceParseException if a resource cannot be parsed
     * @throws IOException if reading the test file fails
     */
    public void testARCResource() throws ResourceParseException, IOException {
        String testFileName = "../../format/arc/IAH-20080430204825-00000-blackbook-truncated.arc";
        // Fail with a readable message instead of a bare NullPointerException
        // when the test resource is missing from the classpath.
        java.net.URL testFile = getClass().getResource(testFileName);
        assertNotNull("Test resource not found: " + testFileName, testFile);

        ResourceProducer producer = ProducerUtils.getProducer(testFile.getPath());
        ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
        ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);

        Resource resource = extractor.getNext();

        while (resource != null) {
            JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope")
                    .getJSONObject("Payload-Metadata");

            if (payloadMD.has(PAYLOAD_LENGTH)) {
                assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1);
            }
            if (payloadMD.has(PAYLOAD_SLOP_BYTES)) {
                // Does not occur with the tested ARC file. Should it ever
                // appear, getLong() raises an exception unless the value can
                // be read as a single long, which fails the test.
                payloadMD.getLong(PAYLOAD_SLOP_BYTES);
            }

            // Consume the resource completely before asking for the next one.
            StreamCopy.readToEOF(resource.getInputStream());
            resource = extractor.getNext();
        }
    }
}
46 changes: 46 additions & 0 deletions src/test/java/org/archive/resource/warc/WARCResourceTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package org.archive.resource.warc;

import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;

import java.io.IOException;

import org.archive.extract.ExtractingResourceFactoryMapper;
import org.archive.extract.ExtractingResourceProducer;
import org.archive.extract.ProducerUtils;
import org.archive.extract.ResourceFactoryMapper;
import org.archive.resource.Resource;
import org.archive.resource.ResourceParseException;
import org.archive.resource.ResourceProducer;
import org.archive.util.StreamCopy;

import com.github.openjson.JSONObject;

import junit.framework.TestCase;

/**
 * Checks the payload metadata of every resource extracted from the
 * {@code IAH-urls-wget.warc} test file.
 */
public class WARCResourceTest extends TestCase {

    /**
     * For each extracted resource, looks up the {@code Envelope} /
     * {@code Payload-Metadata} object: a payload length, when present, must
     * not be {@code -1}; a trailing-slop value, when present, must equal 4.
     * Each resource stream is read to EOF before the next resource is
     * requested.
     *
     * @throws ResourceParseException if a resource cannot be parsed
     * @throws IOException if reading the test file fails
     */
    public void testWARCResource() throws ResourceParseException, IOException {
        final String warcPath = getClass().getResource("../../format/warc/IAH-urls-wget.warc").getPath();
        final ResourceProducer source = ProducerUtils.getProducer(warcPath);
        final ResourceFactoryMapper factoryMapper = new ExtractingResourceFactoryMapper();
        final ExtractingResourceProducer extractor = new ExtractingResourceProducer(source, factoryMapper);

        for (Resource current = extractor.getNext(); current != null; current = extractor.getNext()) {
            final JSONObject envelope = current.getMetaData().getTopMetaData().getJSONObject("Envelope");
            final JSONObject payloadMetaData = envelope.getJSONObject("Payload-Metadata");

            if (payloadMetaData.has(PAYLOAD_LENGTH)) {
                final long payloadLength = payloadMetaData.getLong(PAYLOAD_LENGTH);
                assertFalse(payloadLength == -1);
            }
            if (payloadMetaData.has(PAYLOAD_SLOP_BYTES)) {
                final long slopBytes = payloadMetaData.getLong(PAYLOAD_SLOP_BYTES);
                assertEquals(4, slopBytes);
            }

            // Drain the current resource before the loop header fetches the next one.
            StreamCopy.readToEOF(current.getInputStream());
        }
    }
}

0 comments on commit 3628dda

Please sign in to comment.