Skip to content

Commit

Permalink
Merge pull request #43 from clarin-eric/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
Wolfgang Walter SAUER authored Jun 7, 2019
2 parents 76029fd + 296628d commit 6c662b3
Show file tree
Hide file tree
Showing 136 changed files with 5,156 additions and 19,618 deletions.
6 changes: 3 additions & 3 deletions curation-module-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>curation-module</artifactId>
<groupId>eu.clarin.cmdi</groupId>
<version>2.3</version>
<version>3.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>curation-module-core</artifactId>
Expand Down Expand Up @@ -93,7 +93,7 @@
<dependency>
<groupId>eu.clarin.cmdi</groupId>
<artifactId>linkChecker</artifactId>
<version>2.3</version>
<version>3.0</version>
</dependency>
<!--<dependency> -->
<!--<groupId>com.github.mfornos</groupId> -->
Expand Down Expand Up @@ -143,5 +143,5 @@
</plugin>
</plugins>
</build>
<version>2.3</version>
<version>3.0</version>
</project>
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
package eu.clarin.cmdi.curation.cr;

import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.CharBuffer;
import java.util.Collection;
import java.util.concurrent.ExecutionException;
import java.util.regex.Matcher;
Expand Down Expand Up @@ -60,11 +65,35 @@ public ProfileHeader createProfileHeader(String schemaLocation, String cmdiVersi

if(header == null){
header = new ProfileHeader();
header.setId(getIdFromSchemaLocation(schemaLocation));
header.setSchemaLocation(schemaLocation);
header.setCmdiVersion(cmdiVersion);
header.setPublic(false);

header.setId(getIdFromSchemaLocation(schemaLocation));
header.setCmdiVersion(cmdiVersion);
header.setPublic(false);

// Fallback: when the id can't be extracted from the schema location we have to
// get it from the first characters of the schema document itself.
if (header.getId() == null) {
    CharBuffer buffer = CharBuffer.allocate(1000);

    // try-with-resources closes the reader (and the URL stream it wraps) on every
    // path; previously the stream was never closed and leaked the connection.
    // The charset is given explicitly so decoding does not depend on the platform default.
    try (InputStreamReader reader = new InputStreamReader(
            new URL(schemaLocation).openStream(), java.nio.charset.StandardCharsets.UTF_8)) {

        // A single read() may deliver fewer characters than requested (typical for
        // network streams), so keep reading until the buffer is full or EOF is hit.
        while (buffer.hasRemaining() && reader.read(buffer) != -1) {
            // read(CharBuffer) advances the buffer position itself
        }

        // flip() limits the view to the characters actually read, so content
        // carries no NUL padding from the unused part of the buffer.
        buffer.flip();
        String content = buffer.toString();

        Matcher matcher = PROFILE_ID_PATTERN.matcher(content);
        if (matcher.find()) {
            header.setId(matcher.group());
        }

        // NOTE(review): presumably newer CMDI namespaces start with this prefix and
        // its absence identifies a CMDI 1.1 profile - confirm against the spec.
        if (!content.contains("http://www.clarin.eu/cmd/1")) {
            header.setCmdiVersion("1.1");
        }
    }
    catch (MalformedURLException ex) {
        _logger.error("schema location " + schemaLocation + " is no valid URL", ex);
    }
    catch (IOException ex) {
        _logger.error("couldn't read from schema location " + schemaLocation, ex);
    }
}
}
header.setLocalFile(isLocalFile);
return header;
Expand Down Expand Up @@ -117,5 +146,4 @@ public String getIdFromSchemaLocation(String schemaLocation) {

return matcher.find()? matcher.group():null;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@
import javax.xml.validation.SchemaFactory;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.concurrent.TimeUnit;

class ProfileCacheFactory {
Expand Down Expand Up @@ -74,7 +78,7 @@ public ProfileCacheLoader(boolean isPublicProfilesCache) {


@Override
public ProfileCacheEntry load(ProfileHeader header) throws IOException, VTDException, SAXException {
public ProfileCacheEntry load(ProfileHeader header) throws IOException, VTDException, SAXException, URISyntaxException {


_logger.info("Profile {} is not in the cache, it will be loaded", header.getId());
Expand Down Expand Up @@ -103,6 +107,7 @@ public ProfileCacheEntry load(ProfileHeader header) throws IOException, VTDExcep
}

} else {//non-public profiles are not cached on disk

_logger.debug("schema {} is not public. Schema will be downloaded in temp folder", header.getId());


Expand All @@ -125,8 +130,13 @@ public ProfileCacheEntry load(ProfileHeader header) throws IOException, VTDExcep


_logger.info("XSD for the {} is not in the local cache, it will be downloaded", header.getId());
new HTTPLinkChecker(15000, 5, Configuration.USERAGENT).download(header.getSchemaLocation(), xsd.toFile());


// Schema locations given as file: URIs are moved to the xsd target path;
// every other location is fetched through HTTPLinkChecker.
// NOTE(review): Files.move removes the source file - confirm the local schema is a
// disposable copy (e.g. an upload); otherwise Files.copy may be what is intended.
if(header.getSchemaLocation().startsWith("file:")) {
Files.move(Paths.get(new URI(header.getSchemaLocation())), xsd, StandardCopyOption.REPLACE_EXISTING);
}
else {
// NOTE(review): 15000 and 5 are presumably the timeout in ms and the redirect limit - confirm against HTTPLinkChecker.
new HTTPLinkChecker(15000, 5, Configuration.USERAGENT).download(header.getSchemaLocation(), xsd.toFile());
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public class ProfileHeader {

private String id;
private String schemaLocation;
private String url;
// private String url;
private String name;
private String description;
private String cmdiVersion;
Expand Down Expand Up @@ -122,11 +122,9 @@ public String toString() {
.toString();
}

public String getUrl() {
return url;
}

public void setUrl(String url) {
this.url = url;
}
/*
* public String getUrl() { return url; }
*
* public void setUrl(String url) { this.url = url; }
*/
}
Original file line number Diff line number Diff line change
@@ -1,23 +1,24 @@
package eu.clarin.cmdi.curation.entities;

import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.ArrayDeque;
import java.util.Deque;


import eu.clarin.cmdi.curation.processor.AbstractProcessor;
import eu.clarin.cmdi.curation.processor.CollectionProcessor;

public class CMDCollection extends CurationEntity {

List<CurationEntity> children;
Deque<CurationEntity> children;

long numOfFiles;
long maxFileSize = 0;
long minFileSize = Long.MAX_VALUE;

public CMDCollection(Path path) {
super(path);
children = new ArrayList<CurationEntity>();
children = new ArrayDeque<CurationEntity>();
}

@Override
Expand Down Expand Up @@ -53,7 +54,7 @@ private void aggregateWithDir(CMDCollection child) {
minFileSize = child.minFileSize;
}

public List<CurationEntity> getChildren() {
public Deque<CurationEntity> getChildren() {
return children;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package eu.clarin.cmdi.curation.entities;

import java.nio.file.Path;

import eu.clarin.cmdi.curation.processor.AbstractProcessor;
import eu.clarin.cmdi.curation.processor.CMDProfileProcessor;

Expand All @@ -13,6 +15,10 @@ public class CMDProfile extends CurationEntity {

private String schemaLocation;
private String cmdiVersion;

/**
 * Creates a profile entity for the given path by passing it to the
 * {@code CurationEntity} super constructor. This constructor does not set
 * {@code schemaLocation} or {@code cmdiVersion}.
 *
 * @param path the path handed to the super constructor
 */
public CMDProfile(Path path) {
super(path);
}

public CMDProfile(String schemaLocation, String cmdiVersion) {
super(null);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public class InstanceParser {

static{
TransformerFactory factory = TransformerFactory.newInstance();
Source xslt = new StreamSource(InstanceParser.class.getResourceAsStream("/instanceTransformer.xsl"));
Source xslt = new StreamSource(InstanceParser.class.getResourceAsStream("/xslt/instanceTransformer.xsl"));
try {
tranformer = factory.newTransformer(xslt);
} catch (TransformerConfigurationException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ public class Configuration {
public static Path OUTPUT_DIRECTORY = null;
public static Path CACHE_DIRECTORY = null;
public static Path COLLECTION_HTML_DIRECTORY = null;
public static int THREAD_POOL_SIZE=100;
public static Collection<String> FACETS = null;
public static int REDIRECT_FOLLOW_LIMIT;
public static int TIMEOUT;
private static final int TIMEOUTDEFAULT = 5000;//in ms(if config file doesnt have it)
// Default timeout in ms, used if the config file doesn't have it.
private static int TIMEOUTDEFAULT = 5000;

public static VloConfig VLO_CONFIG;
public static boolean DATABASE;
Expand Down Expand Up @@ -78,7 +79,7 @@ private static void readProperties(Properties config) throws IOException {
} else {
TIMEOUT = Integer.parseInt(timeout);
}

THREAD_POOL_SIZE = Integer.valueOf(config.getProperty("THREAD_POOL_SIZE","100"));

String[] facets = config.getProperty("FACETS").split(",");
FACETS = Arrays.asList(facets).stream().map(f -> f.trim()).collect(Collectors.toList());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import eu.clarin.curation.linkchecker.httpLinkChecker.HTTPLinkChecker;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
Expand All @@ -30,9 +31,14 @@ public Report<?> processCMDProfile(URL schemaLocation) {

return new CMDProfile(schemaLocation.toString(), "1.x").generateReport(null);
}

@Override
public Report<?> processCMDProfile(Path path) throws MalformedURLException {

@Override
return processCMDProfile(path.toUri().toURL());
}

@Override
public Report<?> processCMDInstance(Path path) throws IOException {
if (Files.notExists(path))
throw new IOException(path.toString() + " doesn't exist!");
Expand All @@ -45,10 +51,10 @@ public Report<?> processCMDInstance(Path path) throws IOException {
@Override
public Report<?> processCMDInstance(URL url) throws IOException {
String path = FileNameEncoder.encode(url.toString()) + ".xml";
Path cmdiFile = Paths.get(System.getProperty("java.io.tmpdir"), path);
new HTTPLinkChecker(15000, 5, Configuration.USERAGENT).download(url.toString(), cmdiFile.toFile());
long size = Files.size(cmdiFile);
CMDInstance cmdInstance = new CMDInstance(cmdiFile, size);
Path cmdiFilePath = Paths.get(System.getProperty("java.io.tmpdir"), path);
new HTTPLinkChecker(15000, 5, Configuration.USERAGENT).download(url.toString(), cmdiFilePath.toFile());
long size = Files.size(cmdiFilePath);
CMDInstance cmdInstance = new CMDInstance(cmdiFilePath, size);
cmdInstance.setUrl(url.toString());

Report<?> report = cmdInstance.generateReport(null);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package eu.clarin.cmdi.curation.main;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Path;
import java.util.Collection;
Expand All @@ -12,9 +13,11 @@
public interface CurationModuleInterface {

public Report processCMDProfile(String profileId);

public Report processCMDProfile(Path path) throws MalformedURLException, IOException;


public Report processCMDProfile(URL schemaLocation);
public Report processCMDProfile(URL schemaLocation) throws IOException;

/*
* throws Exception if file doesn't exist or is invalid
Expand Down
Loading

0 comments on commit 6c662b3

Please sign in to comment.