diff --git a/CHANGES.md b/CHANGES.md index a7034f6..4b63dea 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,7 @@ +# version 3.4.0 +- logging number of unchecked links (issue #84) +- dependency upgrade for linkchecker-persistence + # version 3.3.0 - upgrading dependencies to Storm 2.6.1 and Storm Crawler 2.11 => requirement to change user agent string (issues #78, #79, #80) diff --git a/pom.xml b/pom.xml index 18fec79..acbb322 100644 --- a/pom.xml +++ b/pom.xml @@ -5,15 +5,15 @@ 4.0.0 eu.clarin.cmdi linkchecker - 3.3.0 + 3.4.0 jar UTF-8 - 2.6.1 + 2.6.2 2.11 - 1.18.24 - 1.0.1 + 1.18.32 + 1.0.2 5.15.0 5.8.0 5.10.1 diff --git a/src/main/java/eu/clarin/linkchecker/config/Configuration.java b/src/main/java/eu/clarin/linkchecker/config/Configuration.java index 4b92cc3..471c042 100644 --- a/src/main/java/eu/clarin/linkchecker/config/Configuration.java +++ b/src/main/java/eu/clarin/linkchecker/config/Configuration.java @@ -62,6 +62,8 @@ public class Configuration { public static List undeterminedStatusCodes; public static List restrictedAccessStatusCodes; + + public static long logIntervalUncheckedLinks; public static final AnnotationConfigApplicationContext ctx = new AnnotationConfigApplicationContext(); @@ -84,6 +86,8 @@ public static synchronized void init(Map conf) { redirectStatusCodes = getIntegerList(Constants.REDIRECT_STATUS_CODES, conf); undeterminedStatusCodes = getIntegerList(Constants.UNDETERMINED_STATUS_CODES, conf); restrictedAccessStatusCodes = getIntegerList(Constants.RESTRICTED_ACCESS_STATUS_CODES, conf); + + logIntervalUncheckedLinks = ConfUtils.getLong(conf, Constants.LOG_INTERVAL_UNCHECKED_LINKS, 86400000l); ConfigurableEnvironment environment = new StandardEnvironment(); MutablePropertySources propertySources = environment.getPropertySources(); diff --git a/src/main/java/eu/clarin/linkchecker/config/Constants.java b/src/main/java/eu/clarin/linkchecker/config/Constants.java index 1bbf2df..0abe99c 100644 --- a/src/main/java/eu/clarin/linkchecker/config/Constants.java +++ b/src/main/java/eu/clarin/linkchecker/config/Constants.java @@ -27,4 +27,6 @@ public class Constants { public static final String UNDETERMINED_STATUS_CODES = "undetermined.status.codes"; public static final String RESTRICTED_ACCESS_STATUS_CODES = "restricted.access.status.codes"; + public static final String LOG_INTERVAL_UNCHECKED_LINKS = "logInterval.uncheckedLinks"; + } diff --git a/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java b/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java index 9a1f881..dce383b 100644 --- a/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java +++ b/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java @@ -35,7 +35,9 @@ public class LPASpout extends AbstractQueryingSpout { protected String logIdprefix = ""; private int counter = 0; - private long lastCheckpoint = System.currentTimeMillis(); + private long lastCheckpoint = System.currentTimeMillis(); + + private static long lastUncheckedLinks = 0; public LPASpout(String sql) { @@ -96,7 +98,8 @@ protected void populateBuffer() { log.info("{} SQL query returned {} hits, distributed on {} queues in {} msec", logIdprefix, buffer.size(), buffer.numQueues(), timeTaken); - + //we log the number of unchecked links + LPASpout.logUncheckedLinks(); } @@ -126,4 +129,18 @@ public void close() { Configuration.setActive(null, false); } + + private static synchronized void logUncheckedLinks(){ + if((LPASpout.lastUncheckedLinks + Configuration.logIntervalUncheckedLinks) < System.currentTimeMillis()){ + + LPASpout.lastUncheckedLinks = System.currentTimeMillis(); + + GenericRepository gRep = Configuration.ctx.getBean(GenericRepository.class); + + try(Stream stream = gRep.findAll("SELECT COUNT(*) FROM url u WHERE u.id NOT IN (SELECT s.url_id from status s)", true).stream()){ + + stream.forEach(tuple -> log.info("number of unchecked links: {}", tuple.get(0))); + } + } + } } diff --git a/src/main/resources/linkchecker-conf.yaml b/src/main/resources/linkchecker-conf.yaml index f04b454..bd4f7bc 100644 --- a/src/main/resources/linkchecker-conf.yaml +++ b/src/main/resources/linkchecker-conf.yaml @@ -102,3 +102,5 @@ config: restricted.access.status.codes: [401, 403] directory.share: /share + + logInterval.uncheckedLinks: 86400000