From f5ebcb994aef7b3652df6a48b983350a017b630e Mon Sep 17 00:00:00 2001 From: "WolfgangWalter Sauer (wowasa)" Date: Sun, 14 Apr 2024 21:37:28 +0200 Subject: [PATCH 1/3] issue #84 --- CHANGES.md | 3 +++ pom.xml | 4 ++-- .../linkchecker/config/Configuration.java | 4 ++++ .../clarin/linkchecker/config/Constants.java | 2 ++ .../eu/clarin/linkchecker/spout/LPASpout.java | 21 +++++++++++++++++-- src/main/resources/linkchecker-conf.yaml | 2 ++ 6 files changed, 32 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index a7034f6..84e8fe9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,6 @@ +# version 3.4.0 +- logging number of unchecked links (issue #84) + # version 3.3.0 - upgrading dependencies to Storm 2.6.1 and Storm Crawler 2.11 => requirement to change user agent string (issues #78, #79, #80) diff --git a/pom.xml b/pom.xml index 18fec79..668d204 100644 --- a/pom.xml +++ b/pom.xml @@ -5,14 +5,14 @@ 4.0.0 eu.clarin.cmdi linkchecker - 3.3.0 + 3.4.0-SNAPSHOT jar UTF-8 2.6.1 2.11 - 1.18.24 + 1.18.32 1.0.1 5.15.0 5.8.0 diff --git a/src/main/java/eu/clarin/linkchecker/config/Configuration.java b/src/main/java/eu/clarin/linkchecker/config/Configuration.java index 4b92cc3..471c042 100644 --- a/src/main/java/eu/clarin/linkchecker/config/Configuration.java +++ b/src/main/java/eu/clarin/linkchecker/config/Configuration.java @@ -62,6 +62,8 @@ public class Configuration { public static List undeterminedStatusCodes; public static List restrictedAccessStatusCodes; + + public static long logIntervalUncheckedLinks; public static final AnnotationConfigApplicationContext ctx = new AnnotationConfigApplicationContext(); @@ -84,6 +86,8 @@ public static synchronized void init(Map conf) { redirectStatusCodes = getIntegerList(Constants.REDIRECT_STATUS_CODES, conf); undeterminedStatusCodes = getIntegerList(Constants.UNDETERMINED_STATUS_CODES, conf); restrictedAccessStatusCodes = getIntegerList(Constants.RESTRICTED_ACCESS_STATUS_CODES, conf); + + logIntervalUncheckedLinks = ConfUtils.getLong(conf, Constants.LOG_INTERVAL_UNCHECKED_LINKS, 86400000l); ConfigurableEnvironment environment = new StandardEnvironment(); MutablePropertySources propertySources = environment.getPropertySources(); diff --git a/src/main/java/eu/clarin/linkchecker/config/Constants.java b/src/main/java/eu/clarin/linkchecker/config/Constants.java index 1bbf2df..0abe99c 100644 --- a/src/main/java/eu/clarin/linkchecker/config/Constants.java +++ b/src/main/java/eu/clarin/linkchecker/config/Constants.java @@ -27,4 +27,6 @@ public class Constants { public static final String UNDETERMINED_STATUS_CODES = "undetermined.status.codes"; public static final String RESTRICTED_ACCESS_STATUS_CODES = "restricted.access.status.codes"; + public static final String LOG_INTERVAL_UNCHECKED_LINKS = "logInterval.uncheckedLinks"; + } diff --git a/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java b/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java index 9a1f881..dce383b 100644 --- a/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java +++ b/src/main/java/eu/clarin/linkchecker/spout/LPASpout.java @@ -35,7 +35,9 @@ public class LPASpout extends AbstractQueryingSpout { protected String logIdprefix = ""; private int counter = 0; - private long lastCheckpoint = System.currentTimeMillis(); + private long lastCheckpoint = System.currentTimeMillis(); + + private static long lastUncheckedLinks = 0; public LPASpout(String sql) { @@ -96,7 +98,8 @@ protected void populateBuffer() { log.info("{} SQL query returned {} hits, distributed on {} queues in {} msec", logIdprefix, buffer.size(), buffer.numQueues(), timeTaken); - + //we log the number of unchecked links + LPASpout.logUncheckedLinks(); } @@ -126,4 +129,18 @@ public void close() { Configuration.setActive(null, false); } + + private static synchronized void logUncheckedLinks(){ + if((LPASpout.lastUncheckedLinks + Configuration.logIntervalUncheckedLinks) < System.currentTimeMillis()){ + + LPASpout.lastUncheckedLinks = System.currentTimeMillis(); + + GenericRepository gRep = Configuration.ctx.getBean(GenericRepository.class); + + try(Stream stream = gRep.findAll("SELECT COUNT(*) FROM url u WHERE u.id NOT IN (SELECT s.url_id from status s)", true).stream()){ + + stream.forEach(tuple -> log.info("number of unchecked links: {}", tuple.get(0))); + } + } + } } diff --git a/src/main/resources/linkchecker-conf.yaml b/src/main/resources/linkchecker-conf.yaml index f04b454..bd4f7bc 100644 --- a/src/main/resources/linkchecker-conf.yaml +++ b/src/main/resources/linkchecker-conf.yaml @@ -102,3 +102,5 @@ config: restricted.access.status.codes: [401, 403] directory.share: /share + + logInterval.uncheckedLinks: 86400000 From 4bb652a2ea8108b7fcaf85b16bd34ed03f050445 Mon Sep 17 00:00:00 2001 From: "WolfgangWalter Sauer (wowasa)" Date: Sun, 21 Apr 2024 18:59:59 +0200 Subject: [PATCH 2/3] upgrading storm dependency --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 668d204..6cee712 100644 --- a/pom.xml +++ b/pom.xml @@ -10,7 +10,7 @@ UTF-8 - 2.6.1 + 2.6.2 2.11 1.18.32 1.0.1 From 39c3845ea9f2da5ca9fa5213234937f5fcc1cd38 Mon Sep 17 00:00:00 2001 From: "WolfgangWalter Sauer (wowasa)" Date: Thu, 25 Apr 2024 12:23:17 +0200 Subject: [PATCH 3/3] release 3.4.0 --- CHANGES.md | 1 + pom.xml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 84e8fe9..4b63dea 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ # version 3.4.0 - logging number of unchecked links (issue #84) +- dependency upgrade for linkchecker-persistence # version 3.3.0 - upgrading dependencies to Storm 2.6.1 and Storm Crawler 2.11 diff --git a/pom.xml b/pom.xml index 6cee712..acbb322 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 eu.clarin.cmdi linkchecker - 3.4.0-SNAPSHOT + 3.4.0 jar @@ -13,7 +13,7 @@ 2.6.2 2.11 1.18.32 - 1.0.1 + 1.0.2 5.15.0 5.8.0 5.10.1