From 8b064ba5125e6f2e8eb6d63b917f9764d58b252b Mon Sep 17 00:00:00 2001
From: Leonid Andreev
Date: Thu, 6 Feb 2025 16:25:00 -0500
Subject: [PATCH] A simple implementation of a feature where an OAI client can be instructed to return harvested metadata records unparsed #284

---
Review notes (ignored by git-am, not part of the commit message):
 * RecordParser no longer prints every record's metadata to stdout.
 * EchoElement.asUnparsedString() returns the stored string directly.
 * Javadoc added or clarified on the new public methods.
 * The commit id above and the "index" blob ids below predate these revisions.

 .../model/oaipmh/results/record/Metadata.java | 11 ++++++++
 .../java/io/gdcc/xoai/xml/EchoElement.java    | 10 +++++++
 .../xoai/serviceprovider/model/Context.java   | 24 ++++++++++++++++
 .../serviceprovider/parsers/RecordParser.java | 28 ++++++++++++-------
 4 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/xoai-common/src/main/java/io/gdcc/xoai/model/oaipmh/results/record/Metadata.java b/xoai-common/src/main/java/io/gdcc/xoai/model/oaipmh/results/record/Metadata.java
index 5d4b5470..be523b34 100644
--- a/xoai-common/src/main/java/io/gdcc/xoai/model/oaipmh/results/record/Metadata.java
+++ b/xoai-common/src/main/java/io/gdcc/xoai/model/oaipmh/results/record/Metadata.java
@@ -77,6 +77,17 @@ public XOAIMetadata getXoaiMetadata() {
         if (element instanceof XOAIMetadata) return (XOAIMetadata) element;
         else return null;
     }
+
+    /**
+     * Returns the metadata as an unparsed string, when one is available.
+     *
+     * @return the string held by the underlying {@link EchoElement}, or {@code null} if the
+     *     metadata element is not an {@link EchoElement} or that element holds no string
+     */
+    public String getMetadataAsString() {
+        if (element instanceof EchoElement) return ((EchoElement) element).asUnparsedString();
+        return null;
+    }
 
     /**
      * This is here for Dataverse 4/5 backward compatibility.
diff --git a/xoai-common/src/main/java/io/gdcc/xoai/xml/EchoElement.java b/xoai-common/src/main/java/io/gdcc/xoai/xml/EchoElement.java
index 6aefe98a..664afd95 100644
--- a/xoai-common/src/main/java/io/gdcc/xoai/xml/EchoElement.java
+++ b/xoai-common/src/main/java/io/gdcc/xoai/xml/EchoElement.java
@@ -52,6 +52,16 @@ public void write(final XmlWriter writer) throws XmlWriteException {
                     "Cannot write XML when none given (both stream and string null)");
         }
     }
+
+    /**
+     * Returns the XML held by this element as a string, without parsing it.
+     *
+     * @return the XML string, or {@code null} if this element holds no string (for instance
+     *     when it was given a stream instead)
+     */
+    public String asUnparsedString() {
+        return xmlString;
+    }
 
     private void write(final XmlWriter writer, final InputStream inStream)
             throws XmlWriteException {
diff --git a/xoai-service-provider/src/main/java/io/gdcc/xoai/serviceprovider/model/Context.java b/xoai-service-provider/src/main/java/io/gdcc/xoai/serviceprovider/model/Context.java
index de66c9ae..f652adf8 100644
--- a/xoai-service-provider/src/main/java/io/gdcc/xoai/serviceprovider/model/Context.java
+++ b/xoai-service-provider/src/main/java/io/gdcc/xoai/serviceprovider/model/Context.java
@@ -24,6 +24,7 @@ public class Context {
     private final Map metadataTransformers = new HashMap<>();
     private String baseUrl;
     private Granularity granularity;
+    private boolean saveUnparsedMetadata = false;
     private OAIClient client;
 
     public Context() {
@@ -90,6 +91,29 @@ public Context withOAIClient(OAIClient client) {
     public OAIClient getClient() {
         return client;
     }
+
+    /**
+     * Tells whether this harvester skips parsing the "metadata" sections of OAI
+     * records in the bodies of GetRecord and ListRecords responses, and instead
+     * caches them and makes them available as unparsed Strings.
+     *
+     * @return {@code true} if metadata is kept unparsed; {@code false} (the default) otherwise
+     */
+    public boolean isSaveUnparsedMetadata() {
+        return this.saveUnparsedMetadata;
+    }
+
+    /**
+     * Instructs this harvester not to attempt to parse the "metadata" sections of
+     * OAI records in the bodies of GetRecord and ListRecords responses, but to
+     * cache them and make them available as Strings instead.
+     *
+     * @return this Context, to allow call chaining
+     */
+    public Context withSaveUnparsedMetadata() {
+        this.saveUnparsedMetadata = true;
+        return this;
+    }
 
     public enum KnownTransformer {
         OAI_DC("to_xoai/oai_dc.xsl");
diff --git a/xoai-service-provider/src/main/java/io/gdcc/xoai/serviceprovider/parsers/RecordParser.java b/xoai-service-provider/src/main/java/io/gdcc/xoai/serviceprovider/parsers/RecordParser.java
index a493e643..dac9cfb7 100644
--- a/xoai-service-provider/src/main/java/io/gdcc/xoai/serviceprovider/parsers/RecordParser.java
+++ b/xoai-service-provider/src/main/java/io/gdcc/xoai/serviceprovider/parsers/RecordParser.java
@@ -47,17 +47,25 @@ public Record parse(XmlReader reader) throws XmlReaderException {
         if (!record.getHeader().isDeleted()) {
             reader.next(elementName(localPart(equalTo("metadata")))).next(aStartElement());
             String content = reader.retrieveCurrentAsString();
-            ByteArrayInputStream inputStream =
-                    new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8));
-            XSLPipeline pipeline =
-                    new XSLPipeline(inputStream, true)
-                            .apply(context.getMetadataTransformer(metadataPrefix));
+            if (this.context.isSaveUnparsedMetadata()) {
+                // Keep the raw XML exactly as retrieved; no transformer or parser is applied.
+                record.withMetadata(new Metadata(content));
+            } else {
+                ByteArrayInputStream inputStream
+                        = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8));
 
-            if (context.hasTransformer()) pipeline.apply(context.getTransformer());
+                XSLPipeline pipeline
+                        = new XSLPipeline(inputStream, true)
+                                .apply(context.getMetadataTransformer(metadataPrefix));
 
-            try {
-                record.withMetadata(new Metadata(new MetadataParser().parse(pipeline.process())));
-            } catch (TransformerException e) {
-                throw new InternalHarvestException("Unable to process transformer", e);
+                if (context.hasTransformer()) {
+                    pipeline.apply(context.getTransformer());
+                }
+
+                try {
+                    record.withMetadata(new Metadata(new MetadataParser().parse(pipeline.process())));
+                } catch (TransformerException e) {
+                    throw new InternalHarvestException("Unable to process transformer", e);
+                }
             }
         }