Skip to content

Commit

Permalink
Merge pull request #85 from clarin-eric/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
wowasa authored Apr 25, 2024
2 parents 7202eb1 + 39c3845 commit a4bd58c
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 6 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# version 3.4.0
- logging number of unchecked links (issue #84)
- dependency upgrade for linkchecker-persistence

# version 3.3.0
- upgrading dependencies to Storm 2.6.1 and Storm Crawler 2.11
=> requirement to change user agent string (issues #78, #79, #80)
Expand Down
8 changes: 4 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
<modelVersion>4.0.0</modelVersion>
<groupId>eu.clarin.cmdi</groupId>
<artifactId>linkchecker</artifactId>
<version>3.3.0</version>
<version>3.4.0</version>
<packaging>jar</packaging>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<storm.version>2.6.1</storm.version>
<storm.version>2.6.2</storm.version>
<stormcrawler.version>2.11</stormcrawler.version>
<lombok.version>1.18.24</lombok.version>
<linkchecker-persistence.version>1.0.1</linkchecker-persistence.version>
<lombok.version>1.18.32</lombok.version>
<linkchecker-persistence.version>1.0.2</linkchecker-persistence.version>
<mockserver.version>5.15.0</mockserver.version>
<mockito.version>5.8.0</mockito.version>
<junit.version>5.10.1</junit.version>
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/eu/clarin/linkchecker/config/Configuration.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ public class Configuration {
public static List<Integer> undeterminedStatusCodes;

public static List<Integer> restrictedAccessStatusCodes;

public static long logIntervalUncheckedLinks;

public static final AnnotationConfigApplicationContext ctx = new AnnotationConfigApplicationContext();

Expand All @@ -84,6 +86,8 @@ public static synchronized void init(Map<String, Object> conf) {
redirectStatusCodes = getIntegerList(Constants.REDIRECT_STATUS_CODES, conf);
undeterminedStatusCodes = getIntegerList(Constants.UNDETERMINED_STATUS_CODES, conf);
restrictedAccessStatusCodes = getIntegerList(Constants.RESTRICTED_ACCESS_STATUS_CODES, conf);

logIntervalUncheckedLinks = ConfUtils.getLong(conf, Constants.LOG_INTERVAL_UNCHECKED_LINKS, 86400000l);

ConfigurableEnvironment environment = new StandardEnvironment();
MutablePropertySources propertySources = environment.getPropertySources();
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/eu/clarin/linkchecker/config/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,6 @@ public class Constants {
public static final String UNDETERMINED_STATUS_CODES = "undetermined.status.codes";
public static final String RESTRICTED_ACCESS_STATUS_CODES = "restricted.access.status.codes";

public static final String LOG_INTERVAL_UNCHECKED_LINKS = "logInterval.uncheckedLinks";

}
21 changes: 19 additions & 2 deletions src/main/java/eu/clarin/linkchecker/spout/LPASpout.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ public class LPASpout extends AbstractQueryingSpout {
protected String logIdprefix = "";

private int counter = 0;
private long lastCheckpoint = System.currentTimeMillis();
private long lastCheckpoint = System.currentTimeMillis();

private static long lastUncheckedLinks = 0;


public LPASpout(String sql) {
Expand Down Expand Up @@ -96,7 +98,8 @@ protected void populateBuffer() {
log.info("{} SQL query returned {} hits, distributed on {} queues in {} msec", logIdprefix, buffer.size(),
buffer.numQueues(), timeTaken);


//we log the number of unchecked links
LPASpout.logUncheckedLinks();
}


Expand Down Expand Up @@ -126,4 +129,18 @@ public void close() {

Configuration.setActive(null, false);
}

/**
 * Logs how many rows of the {@code url} table have no corresponding row in the
 * {@code status} table, at most once per
 * {@link Configuration#logIntervalUncheckedLinks} milliseconds.
 *
 * <p>The method is {@code static synchronized} and keeps its timestamp in a static
 * field, so the interval is shared by every {@code LPASpout} instance of this class.
 *
 * <p>The count is diagnostic only: a runtime failure while obtaining the repository
 * or running the query is logged as a warning and never propagated to the caller.
 */
private static synchronized void logUncheckedLinks(){
    final long now = System.currentTimeMillis();

    // Compare by subtraction: adding a very large configured interval to the last
    // timestamp would overflow to a negative value and make the check always pass.
    if((now - LPASpout.lastUncheckedLinks) <= Configuration.logIntervalUncheckedLinks){
        return;
    }

    // Advance the timestamp before querying, so a slow or failing query is not
    // retried on every single buffer refill.
    LPASpout.lastUncheckedLinks = now;

    try{
        GenericRepository gRep = Configuration.ctx.getBean(GenericRepository.class);

        try(Stream<Tuple> stream = gRep.findAll("SELECT COUNT(*) FROM url u WHERE u.id NOT IN (SELECT s.url_id from status s)", true).stream()){

            stream.forEach(tuple -> log.info("number of unchecked links: {}", tuple.get(0)));
        }
    }
    catch(RuntimeException ex){
        // Best effort: statistics logging must not break the buffer population.
        log.warn("could not determine number of unchecked links", ex);
    }
}
}
2 changes: 2 additions & 0 deletions src/main/resources/linkchecker-conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,5 @@ config:
restricted.access.status.codes: [401, 403]

directory.share: /share

logInterval.uncheckedLinks: 86400000

0 comments on commit a4bd58c

Please sign in to comment.