Skip to content

Commit c1bcdd9

Browse files
authored
Merge pull request #341 from internetarchive/noplaylist-ydl
youtube-dl --no-playlist
2 parents b2a07b2 + 1b8e7f7 commit c1bcdd9

File tree

2 files changed, with 7 additions and 1 deletion.

contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@
44 44
import java.util.logging.Logger;
45 45

46 46
import org.apache.commons.httpclient.URIException;
47+
import org.archive.crawler.frontier.AMQPUrlReceiver;
47 48
import org.archive.crawler.reporting.CrawlerLoggerModule;
48 49
import org.archive.format.warc.WARCConstants.WARCRecordType;
49 50
import org.archive.io.warc.WARCRecordInfo;
@@ -543,6 +544,11 @@ protected boolean shouldExtract(CrawlURI uri) {
543 544
return false;
544 545
}
545 546

547+
// skip crawl uris received from umbra
548+
if (uri.getAnnotations().contains(AMQPUrlReceiver.A_RECEIVED_FROM_AMQP)) {
549+
return false;
550+
}
551+
546 552
String mime = uri.getContentType().toLowerCase();
547 553
if (mime.startsWith("text/html")
548 554
|| mime.startsWith("application/xhtml")

modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOExce
32 32
recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO,
33 33
'<' + concurrentTo.toString() + '>');
34 34
}
35-
recordInfo.setType(WARCRecordType.resource);
35+
recordInfo.setType(WARCRecordType.response);
36 36
recordInfo.setUrl(curi.toString());
37 37
recordInfo.setCreate14DigitDate(timestamp);
38 38
recordInfo.setMimetype(curi.getContentType());

0 commit comments

Comments
 (0)