Skip to content

Commit

Permalink
bug fix
Browse files Browse the repository at this point in the history
  • Loading branch information
wowasa committed May 18, 2022
1 parent fbb30c7 commit ab68be1
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 11 deletions.
16 changes: 12 additions & 4 deletions src/main/java/eu/clarin/linkchecker/bolt/MetricsFetcherBolt.java
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,8 @@ public void run() {
// no need to wait next time as we won't request from
// that site
asap = true;

countAndLog();
continue;
}

Expand All @@ -516,12 +518,16 @@ public void run() {
}
else {
// pass the info about crawl delay
metadata.setValue(Constants.STATUS_ERROR_CAUSE, "crawl_delay");
metadata.setValue("fetch.message", "Crawl delay too long.");
metadata.setValue("fetch.category", Category.Blocked_By_Robots_txt.name());

collector.emit(com.digitalpebble.stormcrawler.Constants.StatusStreamName, fit.t,
new Values(fit.url, metadata, Status.ERROR));
new Values(fit.url, metadata, Status.DISCOVERED));
// no need to wait next time as we won't request
// from that site
asap = true;

countAndLog();
continue;
}
}
Expand All @@ -548,8 +554,9 @@ else if (rules.getCrawlDelay() < fetchQueues.crawlDelay && crawlDelayForce) {
metadata.setValue("fetch.category", Category.Restricted_Access.name());

collector.emit(com.digitalpebble.stormcrawler.Constants.StatusStreamName, fit.t,
new Values(fit.url, metadata, Status.ERROR));
new Values(fit.url, metadata, Status.DISCOVERED));

countAndLog();
continue;
}
}
Expand Down Expand Up @@ -647,6 +654,7 @@ else if (Configuration.restrictedAccessStatusCodes.contains(response.getStatusCo
catch (Exception exece) {
// recheck with GET request if the failed check was a HEAD request
if("true".equals(metadata.getFirstValue("http.method.head"))){
metadata.setValue("http.method.head", "false");
collector.emit(eu.clarin.linkchecker.config.Constants.RedirectStreamName, fit.t, new Values(fit.url, metadata));
continue;
}
Expand Down Expand Up @@ -687,7 +695,7 @@ else if (exece.getCause() instanceof java.net.UnknownHostException
// metadata.setValue("fetch.startTime", Long.toString(start));

// send to status stream
collector.emit(Constants.StatusStreamName, fit.t, new Values(fit.url, metadata, Status.FETCH_ERROR));
collector.emit(Constants.StatusStreamName, fit.t, new Values(fit.url, metadata, Status.DISCOVERED));

}
finally {
Expand Down
23 changes: 16 additions & 7 deletions src/main/java/eu/clarin/linkchecker/bolt/StatusUpdaterBolt.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ public class StatusUpdaterBolt extends AbstractStatusUpdaterBolt {

private static final Pattern INT_PATTERN = Pattern.compile("\\d+");


/**
* Does not shard based on the total number of queues
**/
Expand Down Expand Up @@ -81,7 +80,7 @@ public synchronized void store(String url, Status status, Metadata metadata, Opt
CheckedLink checkedLink = new CheckedLink();

Metadata md = (Metadata) t.getValueByField("metadata");

log.debug("metadata:\n" + md.toString());

String str = null;
Expand All @@ -95,8 +94,9 @@ public synchronized void store(String url, Status status, Metadata metadata, Opt
checkedLink.setStatus(Integer.parseInt(md.getFirstValue("fetch.statusCode")));
}
if (md.getFirstValue("fetch.contentType") != null) {
checkedLink.setContentType((md.getFirstValue("fetch.contentType").length() < 256) ? md.getFirstValue("fetch.contentType")
: md.getFirstValue("fetch.contentType").substring(0, 250) + "...");
checkedLink.setContentType(
(md.getFirstValue("fetch.contentType").length() < 256) ? md.getFirstValue("fetch.contentType")
: md.getFirstValue("fetch.contentType").substring(0, 250) + "...");
}

if ((str = md.getFirstValue("fetch.byteLength")) != null && INT_PATTERN.matcher(str).matches()) {
Expand All @@ -111,9 +111,17 @@ public synchronized void store(String url, Status status, Metadata metadata, Opt
: md.getFirstValue("fetch.message").substring(0, 1020) + "...");
}

checkedLink.setCategory(Category.valueOf(md.getFirstValue("fetch.category")));
if (md.getFirstValue("fetch.category") != null) {
checkedLink.setCategory(Category.valueOf(md.getFirstValue("fetch.category")));
}
else {
log.error("category is null for in metadata:\n{})", md);
checkedLink.setCategory(Category.Undetermined);
}

checkedLink.setCheckingDate(new Timestamp(md.getFirstValue("fetch.startTime")!=null?Long.parseLong(md.getFirstValue("fetch.startTime")):System.currentTimeMillis()));
checkedLink.setCheckingDate(new Timestamp(
md.getFirstValue("fetch.startTime") != null ? Long.parseLong(md.getFirstValue("fetch.startTime"))
: System.currentTimeMillis()));

checkedLink.setRedirectCount(md.getFirstValue("fetch.redirectCount") == null ? 0
: Integer.parseInt(md.getFirstValue("fetch.redirectCount")));
Expand All @@ -125,7 +133,8 @@ public synchronized void store(String url, Status status, Metadata metadata, Opt
try {
Configuration.checkedLinkResource.save(checkedLink);
_collector.ack(t);
} catch (SQLException ex) {
}
catch (SQLException ex) {
log.error("can't save checked link \n{}", checkedLink);
_collector.fail(t);
}
Expand Down

0 comments on commit ab68be1

Please sign in to comment.