Skip to content

Commit b2a07b2

Browse files
authored
Merge pull request #348 from internetarchive/fixes-leaky-file-handles
Fixes leaky file handles
2 parents c7b1dfa + a9c0c65 commit b2a07b2

File tree

2 files changed

+71
-16
lines changed

2 files changed

+71
-16
lines changed

contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java

Lines changed: 58 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -117,17 +117,52 @@ public class ExtractorYoutubeDL extends Extractor
117117
// unnamed toethread-local temporary file
118118
protected transient ThreadLocal<RandomAccessFile> tempfile = new ThreadLocal<RandomAccessFile>() {
119119
protected RandomAccessFile initialValue() {
120-
File t;
121-
try {
122-
t = File.createTempFile("ydl", ".json");
123-
RandomAccessFile f = new RandomAccessFile(t, "rw");
124-
t.delete();
125-
return f;
126-
} catch (IOException e) {
127-
throw new RuntimeException(e);
128-
}
120+
return null;
129121
}
130122
};
123+
protected void closeLocalTempFile() {
124+
RandomAccessFile localTemp = tempfile.get();
125+
if(localTemp == null || !isOpen(localTemp))
126+
return; // avoid making a new temp file just to close it immediately
127+
try {
128+
getLocalTempFile().close();
129+
tempfile.set(null);
130+
}
131+
catch (Exception e) {
132+
logger.log(Level.WARNING, "problem closing ydl temp file " + e);
133+
}
134+
}
135+
protected RandomAccessFile getLocalTempFile() {
136+
RandomAccessFile localTemp = tempfile.get();
137+
if(localTemp == null || !isOpen(localTemp)) {
138+
localTemp = openNewTempFile();
139+
tempfile.set(localTemp);
140+
}
141+
logger.info("Getting youtube-dl temp file ");
142+
return localTemp;
143+
}
144+
protected boolean isOpen(RandomAccessFile f) {
145+
try {
146+
f.length();
147+
return true;
148+
}
149+
catch (IOException e) {
150+
logger.info("youtube-dl temp file is not open");
151+
return false ;
152+
}
153+
}
154+
protected RandomAccessFile openNewTempFile() {
155+
logger.info("Opening New youtube-dl temp file ");
156+
File t;
157+
try {
158+
t = File.createTempFile("ydl", ".json");
159+
RandomAccessFile f = new RandomAccessFile(t, "rw");
160+
t.delete();
161+
return f;
162+
} catch (IOException e) {
163+
throw new RuntimeException(e);
164+
}
165+
}
131166

132167
protected CrawlerLoggerModule crawlerLoggerModule;
133168
public CrawlerLoggerModule getCrawlerLoggerModule() {
@@ -447,7 +482,7 @@ public String call() throws IOException {
447482
}
448483
});
449484

450-
YoutubeDLResults results = new YoutubeDLResults(tempfile.get());
485+
YoutubeDLResults results = new YoutubeDLResults(getLocalTempFile());
451486

452487
try {
453488
try {
@@ -525,7 +560,14 @@ public boolean shouldBuildRecord(CrawlURI uri) {
525560
// should build record for containing page, which has an
526561
// annotation like "youtube-dl:3" (no slash)
527562
String annotation = findYdlAnnotation(uri);
528-
return annotation != null && !annotation.contains("/");
563+
boolean shouldBuild = (annotation != null && !annotation.contains("/"));
564+
565+
// If we processed this uri, then we have an open temp file that won't get closed
566+
// for us by the warc writer
567+
if(!shouldBuild)
568+
closeLocalTempFile();
569+
570+
return shouldBuild;
529571
}
530572

531573
@Override
@@ -546,10 +588,10 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)
546588
recordInfo.setMimetype("application/vnd.youtube-dl_formats+json;charset=utf-8");
547589
recordInfo.setEnforceLength(true);
548590

549-
tempfile.get().seek(0);
550-
InputStream inputStream = Channels.newInputStream(tempfile.get().getChannel());
591+
getLocalTempFile().seek(0);
592+
InputStream inputStream = Channels.newInputStream(getLocalTempFile().getChannel());
551593
recordInfo.setContentStream(inputStream);
552-
recordInfo.setContentLength(tempfile.get().length());
594+
recordInfo.setContentLength(getLocalTempFile().length());
553595

554596
logger.info("built record timestamp=" + timestamp + " url=" + recordInfo.getUrl());
555597

@@ -575,7 +617,7 @@ public static void main(String[] args) throws IOException {
575617
ExtractorYoutubeDL e = new ExtractorYoutubeDL();
576618

577619
FileInputStream in = new FileInputStream("/tmp/ydl-single-video.json");
578-
YoutubeDLResults results = new YoutubeDLResults(e.tempfile.get());
620+
YoutubeDLResults results = new YoutubeDLResults(e.getLocalTempFile());
579621
e.streamYdlOutput(in, results);
580622
System.out.println("video urls: " + results.videoUrls);
581623
System.out.println("page urls: " + results.pageUrls);
@@ -591,7 +633,7 @@ public static void main(String[] args) throws IOException {
591633
}
592634

593635
in = new FileInputStream("/tmp/ydl-uncgreensboro-limited.json");
594-
results = new YoutubeDLResults(e.tempfile.get());
636+
results = new YoutubeDLResults(e.getLocalTempFile());
595637
e.streamYdlOutput(in, results);
596638
System.out.println("video urls: " + results.videoUrls);
597639
System.out.println("page urls: " + results.pageUrls);

modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
package org.archive.modules.writer;
22

3+
import java.io.InputStream;
34
import java.io.IOException;
45
import java.net.URI;
56
import java.util.Arrays;
67
import java.util.List;
78
import java.util.logging.Level;
89
import java.util.logging.Logger;
910

11+
import org.apache.commons.io.IOUtils;
1012
import org.archive.io.warc.WARCRecordInfo;
1113
import org.archive.io.warc.WARCWriter;
1214
import org.archive.modules.CrawlURI;
@@ -159,6 +161,17 @@ protected void writeRecords(CrawlURI curi, WARCWriter writer) throws IOException
159161
WARCRecordInfo record = recordBuilder.buildRecord(curi, concurrentTo);
160162
if (record != null) {
161163
writer.writeRecord(record);
164+
InputStream is = null;
165+
try {
166+
is = record.getContentStream();
167+
is.close();
168+
}
169+
catch (Exception e){
170+
logger.log(Level.WARNING, "problem closing Warc Record Content Stream " + e);
171+
}
172+
finally {
173+
IOUtils.closeQuietly(record.getContentStream()); //Closing one way or the other seems to leave some file handles open. Calling close() and using closeQuietly() handles both FileStreams and FileChannels
174+
}
162175
if (concurrentTo == null) {
163176
concurrentTo = record.getRecordId();
164177
}

0 commit comments

Comments
 (0)