diff --git a/CHANGES.md b/CHANGES.md
index a7034f6..4b63dea 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,7 @@
+# version 3.4.0
+- logging the number of unchecked links at a configurable interval (issue #84)
+- dependency upgrade for linkchecker-persistence
+
# version 3.3.0
- upgrading dependencies to Storm 2.6.1 and Storm Crawler 2.11
=> requirement to change user agent string (issues #78, #79, #80)
diff --git a/pom.xml b/pom.xml
index 18fec79..acbb322 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,15 +5,15 @@
4.0.0
eu.clarin.cmdi
linkchecker
- 3.3.0
+ 3.4.0
jar
UTF-8
- 2.6.1
+ 2.6.2
2.11
- 1.18.24
- 1.0.1
+ 1.18.32
+ 1.0.2
5.15.0
5.8.0
5.10.1
diff --git a/src/main/java/eu/clarin/linkchecker/config/Configuration.java b/src/main/java/eu/clarin/linkchecker/config/Configuration.java
index 4b92cc3..471c042 100644
--- a/src/main/java/eu/clarin/linkchecker/config/Configuration.java
+++ b/src/main/java/eu/clarin/linkchecker/config/Configuration.java
@@ -62,6 +62,8 @@ public class Configuration {
public static List undeterminedStatusCodes;
public static List restrictedAccessStatusCodes;
+
+ public static long logIntervalUncheckedLinks;
public static final AnnotationConfigApplicationContext ctx = new AnnotationConfigApplicationContext();
@@ -84,6 +86,8 @@ public static synchronized void init(Map conf) {
redirectStatusCodes = getIntegerList(Constants.REDIRECT_STATUS_CODES, conf);
undeterminedStatusCodes = getIntegerList(Constants.UNDETERMINED_STATUS_CODES, conf);
restrictedAccessStatusCodes = getIntegerList(Constants.RESTRICTED_ACCESS_STATUS_CODES, conf);
+
+ logIntervalUncheckedLinks = ConfUtils.getLong(conf, Constants.LOG_INTERVAL_UNCHECKED_LINKS, 86400000l);
ConfigurableEnvironment environment = new StandardEnvironment();
MutablePropertySources propertySources = environment.getPropertySources();
diff --git a/src/main/java/eu/clarin/linkchecker/config/Constants.java b/src/main/java/eu/clarin/linkchecker/config/Constants.java
index 1bbf2df..0abe99c 100644
--- a/src/main/java/eu/clarin/linkchecker/config/Constants.java
+++ b/src/main/java/eu/clarin/linkchecker/config/Constants.java
@@ -27,4 +27,6 @@ public class Constants {
public static final String UNDETERMINED_STATUS_CODES = "undetermined.status.codes";
public static final String RESTRICTED_ACCESS_STATUS_CODES = "restricted.access.status.codes";
+ public static final String LOG_INTERVAL_UNCHECKED_LINKS = "logInterval.uncheckedLinks";
+
}
diff --git a/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java b/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java
index 9a1f881..dce383b 100644
--- a/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java
+++ b/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java
@@ -35,7 +35,9 @@ public class LPASpout extends AbstractQueryingSpout {
protected String logIdprefix = "";
private int counter = 0;
- private long lastCheckpoint = System.currentTimeMillis();
+ private long lastCheckpoint = System.currentTimeMillis();
+
+ private static long lastUncheckedLinks = 0;
public LPASpout(String sql) {
@@ -96,7 +98,8 @@ protected void populateBuffer() {
log.info("{} SQL query returned {} hits, distributed on {} queues in {} msec", logIdprefix, buffer.size(),
buffer.numQueues(), timeTaken);
-
+ //we log the number of unchecked links
+ LPASpout.logUncheckedLinks();
}
@@ -126,4 +129,18 @@ public void close() {
Configuration.setActive(null, false);
}
+
+ private static synchronized void logUncheckedLinks(){ // logs how many rows of table url have no row in table status; static and synchronized, so the check-and-update of the shared timestamp runs under the class lock
+ if((LPASpout.lastUncheckedLinks + Configuration.logIntervalUncheckedLinks) < System.currentTimeMillis()){ // proceed only when more than the configured interval (milliseconds) has elapsed since the last run
+ // lastUncheckedLinks is initialised to 0, so the first call always passes the check above
+ LPASpout.lastUncheckedLinks = System.currentTimeMillis();
+ // the timestamp is updated before the query runs, so a failing query is not retried until the next interval has elapsed
+ GenericRepository gRep = Configuration.ctx.getBean(GenericRepository.class);
+ // NOTE(review): the meaning of findAll's boolean argument is not visible here - presumably it flags the statement as native SQL; confirm against GenericRepository
+ try(Stream stream = gRep.findAll("SELECT COUNT(*) FROM url u WHERE u.id NOT IN (SELECT s.url_id from status s)", true).stream()){
+ // the stream is closed by try-with-resources; the first column of every returned tuple is logged (a COUNT(*) query is expected to return one row)
+ stream.forEach(tuple -> log.info("number of unchecked links: {}", tuple.get(0)));
+ }
+ }
+ }
}
diff --git a/src/main/resources/linkchecker-conf.yaml b/src/main/resources/linkchecker-conf.yaml
index f04b454..bd4f7bc 100644
--- a/src/main/resources/linkchecker-conf.yaml
+++ b/src/main/resources/linkchecker-conf.yaml
@@ -102,3 +102,5 @@ config:
restricted.access.status.codes: [401, 403]
directory.share: /share
+
+ logInterval.uncheckedLinks: 86400000