Skip to content

Commit f731e0d

Browse files
author
Wolfgang Walter SAUER
authored
Merge pull request #46 from clarin-eric/development
Development
2 parents d82085f + 378a188 commit f731e0d

File tree

32 files changed

+696
-1527
lines changed

32 files changed

+696
-1527
lines changed

curation-module-core/pom.xml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<artifactId>curation-module</artifactId>
55
<groupId>eu.clarin.cmdi</groupId>
6-
<version>3.0</version>
6+
<version>3.1</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99
<artifactId>curation-module-core</artifactId>
@@ -92,8 +92,8 @@
9292
</dependency>
9393
<dependency>
9494
<groupId>eu.clarin.cmdi</groupId>
95-
<artifactId>linkChecker</artifactId>
96-
<version>3.0</version>
95+
<artifactId>linkchecker</artifactId>
96+
<version>3.0.1</version>
9797
</dependency>
9898
<!--<dependency> -->
9999
<!--<groupId>com.github.mfornos</groupId> -->
@@ -143,5 +143,5 @@
143143
</plugin>
144144
</plugins>
145145
</build>
146-
<version>3.0</version>
146+
<version>3.1</version>
147147
</project>

curation-module-core/src/main/java/eu/clarin/cmdi/curation/cr/ProfileCacheFactory.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import eu.clarin.cmdi.curation.cr.profile_parser.ProfileParserFactory;
1212
import eu.clarin.cmdi.curation.main.Configuration;
1313
import eu.clarin.cmdi.curation.xml.SchemaResourceResolver;
14-
import eu.clarin.curation.linkchecker.httpLinkChecker.HTTPLinkChecker;
14+
import eu.clarin.cmdi.linkchecker.httpLinkChecker.HTTPLinkChecker;
1515
import org.slf4j.Logger;
1616
import org.slf4j.LoggerFactory;
1717
import org.xml.sax.SAXException;

curation-module-core/src/main/java/eu/clarin/cmdi/curation/main/Configuration.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ public class Configuration {
4242
public static String DATABASE_URI;
4343
public static String USERAGENT;
4444
public static String BASE_URL;
45+
public static String CMD_STORAGE_URL;
4546

4647
//this is a boolean that is set by core-module(false) and web-module(true)
4748
public static boolean enableProfileLoadTimer = false;
@@ -123,7 +124,6 @@ private static void readProperties(Properties config) throws IOException {
123124

124125
USERAGENT = config.getProperty("USERAGENT");
125126
BASE_URL = config.getProperty("BASE_URL");
126-
127-
127+
CMD_STORAGE_URL = config.getProperty("CMD_STORAGE_URL");
128128
}
129129
}

curation-module-core/src/main/java/eu/clarin/cmdi/curation/main/CurationModule.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import eu.clarin.cmdi.curation.report.CMDInstanceReport;
88
import eu.clarin.cmdi.curation.report.Report;
99
import eu.clarin.cmdi.curation.utils.FileNameEncoder;
10-
import eu.clarin.curation.linkchecker.httpLinkChecker.HTTPLinkChecker;
10+
import eu.clarin.cmdi.linkchecker.httpLinkChecker.HTTPLinkChecker;
1111

1212
import java.io.IOException;
1313
import java.net.MalformedURLException;

curation-module-core/src/main/java/eu/clarin/cmdi/curation/report/CMDInstanceReport.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ public static class URLElement {
115115
@XmlAttribute(name = "redirectCount")
116116
public int redirectCount;
117117

118-
public URLElement convertFromLinkCheckerURLElement(eu.clarin.curation.linkchecker.urlElements.URLElement urlElement) {
118+
public URLElement convertFromLinkCheckerURLElement(eu.clarin.cmdi.linkchecker.urlElements.URLElement urlElement) {
119119
url = urlElement.getUrl();
120120
method = urlElement.getMethod();
121121
message = urlElement.getMessage();
@@ -200,7 +200,7 @@ private static final synchronized void mergeWithParent(CollectionReport parentRe
200200
parentReport.xmlValidationReport.totNumOfValidRecords += instanceReport.xmlValidityReport.valid ? 1 : 0;
201201
if (!instanceReport.xmlValidityReport.valid) {
202202
Record record = new Record();
203-
record.name = instanceReport.getName();
203+
record.name = instanceReport.fileReport.location;
204204
record.issues = instanceReport.xmlValidityReport.issues;
205205
parentReport.xmlValidationReport.record.add(record);
206206
}

curation-module-core/src/main/java/eu/clarin/cmdi/curation/report/ProfilesReport.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,10 @@ public static class Profile {
9797
@XmlElementWrapper(name = "facets")
9898
@XmlElement(name = "facet")
9999
private List<Facet> facets = new ArrayList<Facet>();
100+
@XmlElement
101+
private double collectionUsage;
102+
@XmlElement
103+
private double instanceUsage;
100104

101105
public Profile() {
102106

@@ -108,11 +112,12 @@ public Profile(CMDProfileReport report) {
108112
this.reportName = report.getName();
109113
this.score = report.score;
110114
this.facetCoverage = report.facet.profileCoverage;
111-
this.percOfElementsWithConcept = report.elements.percWithConcept;
112-
115+
this.percOfElementsWithConcept = report.elements.percWithConcept;
116+
this.collectionUsage = report.collectionUsage.size();
117+
118+
report.collectionUsage.forEach(usage -> this.instanceUsage+=usage.count);
113119
report.facet.coverage.forEach(f -> facets.add(new Facet(f.name, f.coveredByProfile)));
114120
}
115-
116121
}
117122

118123
@XmlRootElement

curation-module-core/src/main/java/eu/clarin/cmdi/curation/subprocessor/FileSizeValidator.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,15 @@ public void process(CMDInstance entity, CMDInstanceReport report) throws IOExcep
120120
if (entity.getUrl() != null) {
121121
report.fileReport.location = FileNameEncoder.encode(entity.getUrl());
122122
} else {
123-
report.fileReport.location = entity.getPath().toString();
123+
int index = -1;
124+
String path = entity.getPath().toString();
125+
126+
if((index = path.indexOf("/clarin")) != -1)
127+
report.fileReport.location = Configuration.CMD_STORAGE_URL + path.substring(index +1);
128+
else if((index = path.indexOf("/europeana")) != -1)
129+
report.fileReport.location = Configuration.CMD_STORAGE_URL + path.substring(index +1);
130+
else
131+
report.fileReport.location = path;
124132
}
125133

126134
if (report.fileReport.size > Configuration.MAX_FILE_SIZE) {

curation-module-core/src/main/java/eu/clarin/cmdi/curation/subprocessor/URLValidator.java

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,19 @@
55
import com.mongodb.client.*;
66
import com.mongodb.client.model.Filters;
77
import com.mongodb.client.model.IndexOptions;
8-
import com.mongodb.client.model.Indexes;
98
import eu.clarin.cmdi.curation.entities.CMDInstance;
109
import eu.clarin.cmdi.curation.main.Configuration;
1110
import eu.clarin.cmdi.curation.report.CMDInstanceReport;
1211
import eu.clarin.cmdi.curation.report.CMDInstanceReport.URLReport;
1312
import eu.clarin.cmdi.curation.report.Score;
1413
import eu.clarin.cmdi.curation.report.Severity;
1514
import eu.clarin.cmdi.curation.utils.TimeUtils;
15+
import eu.clarin.cmdi.linkchecker.httpLinkChecker.HTTPLinkChecker;
16+
import eu.clarin.cmdi.linkchecker.urlElements.URLElement;
17+
import eu.clarin.cmdi.linkchecker.urlElements.URLElementToBeChecked;
1618
import eu.clarin.cmdi.vlo.importer.CMDIData;
1719
import eu.clarin.cmdi.vlo.importer.Resource;
1820
import eu.clarin.cmdi.vlo.importer.processor.ValueSet;
19-
import eu.clarin.curation.linkchecker.httpLinkChecker.HTTPLinkChecker;
20-
import eu.clarin.curation.linkchecker.urlElements.URLElement;
21-
import eu.clarin.curation.linkchecker.urlElements.URLElementToBeChecked;
2221
import org.bson.Document;
2322
import org.bson.conversions.Bson;
2423
import org.slf4j.Logger;
@@ -31,7 +30,7 @@
3130
import static com.mongodb.client.model.Filters.eq;
3231

3332
/**
34-
33+
*
3534
*/
3635

3736
public class URLValidator extends CMDSubprocessor {
@@ -118,7 +117,7 @@ public void process(CMDInstance entity, CMDInstanceReport report, String parentN
118117

119118
_logger.info("Checking database for url: " + url);
120119

121-
Bson filter = Filters.and(eq("collection", parentName), eq("record", report.getName()), eq("url", url));
120+
Bson filter = Filters.and(eq("collection", parentName), eq("url", url));
122121
MongoCursor<Document> cursor = linksChecked.find(filter).iterator();
123122

124123
//Because URLs are unique in the database, if the cursor has a next element, it is the only match. If not, the URL wasn't found.
@@ -232,19 +231,21 @@ private void removeOldURLs(Collection<String> links, String recordName, String c
232231
//some old runs may have produced links that are not in the records anymore.
233232
//so to clean up the database, we move all such links to history.
234233

235-
Bson filter = Filters.and(Filters.eq("collection", collectionName), Filters.eq("record", recordName));
234+
Bson filter = Filters.and(Filters.eq("collection", collectionName), Filters.eq("record", recordName), Filters.not(Filters.in("url", links)));
236235
MongoCursor<Document> cursor = linksChecked.find(filter).iterator();
237236

238237
while (cursor.hasNext()) {
239238

240239
URLElement urlElement = new URLElement(cursor.next());
241-
String url = urlElement.getUrl();
242240

243-
if (!links.contains(url)) {
244-
moveToHistory(urlElement);
245-
}
241+
moveToHistory(urlElement);
242+
246243
}
247244

245+
//also remove them from linksToBeChecked so that they are not checked unnecessarily
246+
linksToBeChecked.deleteMany(filter);
247+
248+
248249
}
249250

250251
private void moveToHistory(URLElement urlElement) {

curation-module-core/src/main/resources/config.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ SAVE_REPORT=true
88
OUTPUT_DIRECTORY=
99
CACHE_DIRECTORY=
1010
COLLECTION_HTML_DIRECTORY=
11+
CMD_STORAGE_URL=https://vlo.clarin.eu/data/
1112
FACETS=languageCode, collection, resourceClass, modality, format, keywords, genre, subject, country, organisation, name, description, license, availability
1213

1314

0 commit comments

Comments
 (0)