Skip to content

Commit

Permalink
Merge pull request #49 from clarin-eric/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
Wolfgang Walter SAUER authored Aug 8, 2019
2 parents f731e0d + fe55312 commit 943191c
Show file tree
Hide file tree
Showing 17 changed files with 316 additions and 520 deletions.
12 changes: 9 additions & 3 deletions curation-module-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>curation-module</artifactId>
<groupId>eu.clarin.cmdi</groupId>
<version>3.1</version>
<version>3.1.1</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>curation-module-core</artifactId>
Expand Down Expand Up @@ -93,7 +93,13 @@
<dependency>
<groupId>eu.clarin.cmdi</groupId>
<artifactId>linkchecker</artifactId>
<version>3.0.1</version>
<version>4.0</version>
</dependency>

<dependency>
<groupId>eu.clarin.cmdi</groupId>
<artifactId>resource-availability-status-api</artifactId>
<version>1.6-SNAPSHOT</version>
</dependency>
<!--<dependency> -->
<!--<groupId>com.github.mfornos</groupId> -->
Expand Down Expand Up @@ -143,5 +149,5 @@
</plugin>
</plugins>
</build>
<version>3.1</version>
<version>3.1.1</version>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
import java.util.Properties;
import java.util.stream.Collectors;

import eu.clarin.cmdi.rasa.helpers.RasaFactory;
import eu.clarin.cmdi.rasa.helpers.impl.ACDHRasaFactory;
import eu.clarin.cmdi.rasa.linkResources.CheckedLinkResource;
import eu.clarin.cmdi.rasa.linkResources.LinkToBeCheckedResource;
import eu.clarin.cmdi.rasa.linkResources.StatisticsResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -30,7 +35,7 @@ public class Configuration {
public static Path OUTPUT_DIRECTORY = null;
public static Path CACHE_DIRECTORY = null;
public static Path COLLECTION_HTML_DIRECTORY = null;
public static int THREAD_POOL_SIZE=100;
public static int THREAD_POOL_SIZE = 100;
public static Collection<String> FACETS = null;
public static int REDIRECT_FOLLOW_LIMIT;
public static int TIMEOUT;
Expand All @@ -44,6 +49,10 @@ public class Configuration {
public static String BASE_URL;
public static String CMD_STORAGE_URL;

public static CheckedLinkResource checkedLinkResource;
public static LinkToBeCheckedResource linkToBeCheckedResource;
public static StatisticsResource statisticsResource;

//boolean flag set by the modules: core-module sets it to false, web-module sets it to true
public static boolean enableProfileLoadTimer = false;

Expand All @@ -63,6 +72,7 @@ public static void initDefault() throws IOException {
config.load(Configuration.class.getResourceAsStream("/config.properties"));
readProperties(config);
//readProperties(new PropertiesConfiguration("config.properties"));

}

private static void readProperties(Properties config) throws IOException {
Expand All @@ -80,7 +90,7 @@ private static void readProperties(Properties config) throws IOException {
} else {
TIMEOUT = Integer.parseInt(timeout);
}
THREAD_POOL_SIZE = Integer.valueOf(config.getProperty("THREAD_POOL_SIZE","100"));
THREAD_POOL_SIZE = Integer.valueOf(config.getProperty("THREAD_POOL_SIZE", "100"));

String[] facets = config.getProperty("FACETS").split(",");
FACETS = Arrays.asList(facets).stream().map(f -> f.trim()).collect(Collectors.toList());
Expand All @@ -105,10 +115,18 @@ private static void readProperties(Properties config) throws IOException {
REDIRECT_FOLLOW_LIMIT = Integer.parseInt(redirectFollowLimit);
}



DATABASE = Boolean.parseBoolean(config.getProperty("DATABASE"));

if (DATABASE) {
DATABASE_NAME = config.getProperty("DATABASE_NAME");
DATABASE_URI = config.getProperty("DATABASE_URI");

RasaFactory factory = new ACDHRasaFactory(DATABASE_NAME, DATABASE_URI);
checkedLinkResource = factory.getCheckedLinkResource();
linkToBeCheckedResource = factory.getLinkToBeCheckedResource();
statisticsResource = factory.getStatisticsResource();
}

String vloConfigLocation = config.getProperty("VLO_CONFIG_LOCATION");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import eu.clarin.cmdi.curation.report.CollectionReport.Record;
import eu.clarin.cmdi.curation.utils.TimeUtils;
import eu.clarin.cmdi.curation.xml.XMLMarshaller;
import eu.clarin.cmdi.rasa.links.CheckedLink;

/**
*
Expand Down Expand Up @@ -115,11 +116,11 @@ public static class URLElement {
@XmlAttribute(name = "redirectCount")
public int redirectCount;

public URLElement convertFromLinkCheckerURLElement(eu.clarin.cmdi.linkchecker.urlElements.URLElement urlElement) {
url = urlElement.getUrl();
method = urlElement.getMethod();
message = urlElement.getMessage();
status = urlElement.getStatus();
public URLElement convertFromLinkCheckerURLElement(CheckedLink checkedLink) {
url = checkedLink.getUrl();
method = checkedLink.getMethod();
message = checkedLink.getMessage();
status = checkedLink.getStatus();
//TODO: move this status-to-category mapping into its own class (categoryDeterminer)
if (status == 200) {
category = "Ok";
Expand All @@ -128,12 +129,12 @@ public URLElement convertFromLinkCheckerURLElement(eu.clarin.cmdi.linkchecker.ur
} else {
category = "Broken";
}
contentType = urlElement.getContentType();
expectedContentType = urlElement.getExpectedMimeType();
byteSize = urlElement.getByteSize();
duration = TimeUtils.humanizeToTime(urlElement.getDuration());
timestamp = TimeUtils.humanizeToDate(urlElement.getTimestamp());
redirectCount = urlElement.getRedirectCount();
contentType = checkedLink.getContentType();
expectedContentType = checkedLink.getExpectedMimeType();
byteSize = checkedLink.getByteSize();
duration = TimeUtils.humanizeToTime(checkedLink.getDuration());
timestamp = TimeUtils.humanizeToDate(checkedLink.getTimestamp());
redirectCount = checkedLink.getRedirectCount();

return this;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,26 +1,16 @@
package eu.clarin.cmdi.curation.report;

import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import javax.xml.bind.annotation.*;

import com.mongodb.client.*;
import com.mongodb.client.model.Accumulators;
import com.mongodb.client.model.Aggregates;
import com.mongodb.client.model.Filters;
import eu.clarin.cmdi.curation.main.Configuration;
import eu.clarin.cmdi.curation.utils.TimeUtils;
import eu.clarin.cmdi.curation.xml.XMLMarshaller;
import org.bson.Document;
import org.bson.conversions.Bson;
import eu.clarin.cmdi.rasa.filters.impl.ACDHStatisticsFilter;

import static com.mongodb.client.model.Filters.eq;
import static com.mongodb.client.model.Sorts.ascending;
import static com.mongodb.client.model.Sorts.orderBy;
import javax.xml.bind.annotation.*;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Optional;

/**
*
Expand Down Expand Up @@ -279,53 +269,27 @@ public void calculateAverageValues() {
//statistics

if (Configuration.DATABASE) {
MongoClient mongoClient;
if (Configuration.DATABASE_URI == null || Configuration.DATABASE_URI.isEmpty()) {//if it is empty, try localhost
mongoClient = MongoClients.create();
} else {
mongoClient = MongoClients.create(Configuration.DATABASE_URI);
}

MongoDatabase database = mongoClient.getDatabase(Configuration.DATABASE_NAME);

MongoCollection<Document> linksChecked = database.getCollection("linksChecked");
MongoCollection<Document> linksToBeChecked = database.getCollection("linksToBeChecked");

AggregateIterable<Document> iterable = linksChecked.aggregate(Arrays.asList(
Aggregates.match(eq("collection", getName())),
Aggregates.group("$status",
Accumulators.sum("count", 1),
Accumulators.avg("avg_resp", "$duration"),
Accumulators.max("max_resp", "$duration")
),
Aggregates.sort(orderBy(ascending("_id")))
));

for (Document doc : iterable) {
Statistics statistics = new Statistics();
statistics.avgRespTime = doc.getDouble("avg_resp");
statistics.maxRespTime = doc.getLong("max_resp");
statistics.statusCode = doc.getInteger("_id");
if (statistics.statusCode == 200) {
statistics.category = "Ok";
} else if (statistics.statusCode == 401 || statistics.statusCode == 405 || statistics.statusCode == 429) {
statistics.category = "Undetermined";
} else {
statistics.category = "Broken";
}
statistics.count = doc.getInteger("count");
urlReport.status.add(statistics);
List<eu.clarin.cmdi.rasa.links.Statistics> stats = Configuration.statisticsResource.getStatusStatistics(getName());
for (eu.clarin.cmdi.rasa.links.Statistics statistics : stats) {
Statistics xmlStatistics = new Statistics();
xmlStatistics.avgRespTime = statistics.getAvgRespTime();
xmlStatistics.maxRespTime = statistics.getMaxRespTime();
xmlStatistics.statusCode = statistics.getStatus();
xmlStatistics.category = statistics.getCategory();
xmlStatistics.count = statistics.getCount();
urlReport.status.add(xmlStatistics);
}


long numOfCheckedLinks = linksChecked.countDocuments(eq("collection", getName()));
ACDHStatisticsFilter filter = new ACDHStatisticsFilter(getName(), null, false, false);
long numOfCheckedLinks = Configuration.statisticsResource.countLinksChecked(Optional.of(filter));

filter = new ACDHStatisticsFilter(getName(), null, true, false);
long numOfBrokenLinks = Configuration.statisticsResource.countLinksChecked(Optional.of(filter));

Bson brokenLinksFilter = Filters.and(Filters.eq("collection", getName()), Filters.not(Filters.in("status", 200, 302, 401, 405, 429)));
long numOfBrokenLinks = linksChecked.countDocuments(brokenLinksFilter);

Bson undeterminedLinksFilter = Filters.and(Filters.eq("collection", getName()), Filters.in("status", 401, 405, 429));
urlReport.totNumOfUndeterminedLinks = (int) linksChecked.countDocuments(undeterminedLinksFilter);
filter = new ACDHStatisticsFilter(getName(), null, false, true);
urlReport.totNumOfUndeterminedLinks = (int) Configuration.statisticsResource.countLinksChecked(Optional.of(filter));

urlReport.totNumOfBrokenLinks = (int) numOfBrokenLinks;
urlReport.totNumOfCheckedLinks = (int) (numOfCheckedLinks);
Expand All @@ -336,38 +300,20 @@ public void calculateAverageValues() {
//because url validator works on a record basis and not collection basis, so the program
//can only know about unique link numbers in a single record and not the whole collection.
//that's why some database magic on the whole collection is needed.
iterable = linksToBeChecked.aggregate(Arrays.asList(
Aggregates.match(eq("collection", getName())),
Aggregates.lookup("linksChecked", "url", "url", "checked")
));
int duplicates = 0;
for (Document doc : iterable) {
if (!((List<?>) doc.get("checked")).isEmpty()) {
duplicates++;
}
}
int duplicates = Configuration.statisticsResource.getDuplicateCount(getName());

int numOfLinksToBeChecked = (int) linksToBeChecked.countDocuments(eq("collection", getName()));
filter = new ACDHStatisticsFilter(getName(), null, false, false);
int numOfLinksToBeChecked = (int) Configuration.statisticsResource.countLinksToBeChecked(Optional.of(filter));
urlReport.totNumOfUniqueLinks = ((int) numOfCheckedLinks) + numOfLinksToBeChecked - duplicates;


urlReport.avgNumOfLinks = (double) urlReport.totNumOfLinks / fileReport.numOfFiles;
urlReport.avgNumOfUniqueLinks = (double) urlReport.totNumOfUniqueLinks / fileReport.numOfFiles;
urlReport.avgNumOfBrokenLinks = 1.0 * (double) urlReport.totNumOfBrokenLinks / fileReport.numOfFiles;

AggregateIterable<Document> aggregate = linksChecked.aggregate(
Arrays.asList(
Aggregates.match(eq("collection", getName())),
Aggregates.group(null,
Accumulators.avg("avg_resp", "$duration"),
Accumulators.max("max_resp", "$duration")
)));
Document result = aggregate.first();

if(result!=null){
urlReport.avgRespTime = result.getDouble("avg_resp");
urlReport.maxRespTime = result.getLong("max_resp");
}
eu.clarin.cmdi.rasa.links.Statistics statistics = Configuration.statisticsResource.getOverallStatistics(getName());
urlReport.avgRespTime = statistics.getAvgRespTime();
urlReport.maxRespTime = statistics.getMaxRespTime();

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ public Collection(CollectionReport report) {
this.numOfResProxies = report.resProxyReport.totNumOfResProxies;
this.ratioOfValidRecords = report.xmlValidationReport.ratioOfValidRecords;
this.avgNumOfEmptyXMLElements = report.xmlPopulatedReport.avgXMLEmptyElement;
this.avgFacetCoverage = report.facetReport.coverage;

report.facetReport.facet.forEach(f -> this.facets.add(new Facet(f.name, f.coverage)));
}
Expand Down
Loading

0 comments on commit 943191c

Please sign in to comment.