diff --git a/fuji_server/controllers/fair_check.py b/fuji_server/controllers/fair_check.py index 82da7046..9631fd2e 100644 --- a/fuji_server/controllers/fair_check.py +++ b/fuji_server/controllers/fair_check.py @@ -30,6 +30,9 @@ import urllib.request as urllib from typing import List, Any from urllib.parse import urlparse, urljoin + +import extruct +import idutils import pandas as pd import lxml import rdflib @@ -77,6 +80,7 @@ from fuji_server.helper.repository_helper import RepositoryHelper from fuji_server.helper.identifier_helper import IdentifierHelper from fuji_server.helper.linked_vocab_helper import linked_vocab_helper +from fuji_server.helper.request_helper import RequestHelper, AcceptTypes class FAIRCheck: @@ -167,6 +171,7 @@ def __init__(self, # in case log messages shall be sent to a remote server self.remoteLogPath = None self.remoteLogHost = None + self.weblogger = None if self.isDebug: self.logStreamHandler = logging.StreamHandler(self.logger_message_stream) formatter = logging.Formatter('%(message)s|%(levelname)s') @@ -174,15 +179,14 @@ def __init__(self, self.logger.propagate = False self.logger.setLevel(logging.INFO) # set to debug in testing environment self.logger.addHandler(self.logStreamHandler) - self.weblogger = None + if Preprocessor.remote_log_host: self.weblogger = logging.handlers.HTTPHandler(Preprocessor.remote_log_host, Preprocessor.remote_log_path + '?testid=' + str(self.test_id), method='POST') self.webformatter = logging.Formatter('%(levelname)s - %(message)s \r\n') self.count = 0 - self.embedded_retrieved = False FAIRCheck.load_predata() - self.extruct = None + #self.extruct = None self.extruct_result = {} self.tika_content_types_list = [] self.lov_helper = linked_vocab_helper(self.LINKED_VOCAB_INDEX) @@ -253,20 +257,26 @@ def validate_service_url(self): else: return False - def merge_metadata(self, metadict, sourceurl, method, format, schema='', namespaces = []): + def merge_metadata(self, metadict, sourceurl, method_source, format, schema='', namespaces = []): if not isinstance(namespaces, list): namespaces = [namespaces] if isinstance(metadict,dict): + #self.metadata_sources.append((method_source, 'negotiated')) + for r in metadict.keys(): if r in self.reference_elements: self.metadata_merged[r] = metadict[r] self.reference_elements.remove(r) + if metadict.get('related_resources'): self.related_resources.extend(metadict.get('related_resources')) + if metadict.get('object_content_identifier'): + self.logger.info('FsF-F3-01M : Found data links in '+str(format)+' metadata -: ' + + str(metadict.get('object_content_identifier'))) ## add: mechanism ('content negotiation', 'typed links', 'embedded') ## add: format namespace self.metadata_unmerged.append( - {'method' : method, + {'method' : method_source, 'url' : sourceurl, 'format' : format, 'schema' : schema, @@ -274,26 +284,7 @@ def merge_metadata(self, metadict, sourceurl, method, format, schema='', namespa 'namespaces' : namespaces} ) - def retrieve_metadata(self, extruct_metadata): - embedded_exists = {} - if isinstance(extruct_metadata, dict): - embedded_exists = {k: v for k, v in extruct_metadata.items() if v} - self.extruct = embedded_exists.copy() - ''' - if embedded_exists: # retrieve metadata from landing page - self.logger.info( - 'FsF-F2-01M : Formats of structured metadata embedded in HTML markup detected by extruct - {}'.format( - list(embedded_exists.keys()))) - #self.retrieve_metadata_embedded(embedded_exists) - else: - self.logger.warning('FsF-F2-01M : NO structured metadata embedded in HTML') 
-        '''
-        #if self.reference_elements:  # this will be always true as we need datacite client id
-        #    if include_embedded ==True:
-        #        self.retrieve_metadata_embedded(embedded_exists)
-        #    self.retrieve_metadata_external()
-
-        # ========= clean merged metadata, delete all entries which are None or ''
+    def clean_metadata(self):
         data_objects = self.metadata_merged.get('object_content_identifier')
         if data_objects == {'url': None} or data_objects == [None]:
             data_objects = self.metadata_merged['object_content_identifier'] = None
@@ -315,14 +306,27 @@ def retrieve_metadata(self, extruct_metadata):
                     self.metadata_merged['object_content_identifier'][oi]['type'] = c['type'][0]
                     mime_parts = str(c.get('type')).split('/')
                     if len(mime_parts) > 2:
-                        if mime_parts[-2] in ['application','audio','font','example','image','message','model','multipart','text','video']:
-                            self.metadata_merged['object_content_identifier'][oi]['type'] = str(mime_parts[-2])+'/'+str(mime_parts[-1])
-                oi+=1
-
+                        if mime_parts[-2] in ['application', 'audio', 'font', 'example', 'image', 'message',
+                                              'model', 'multipart', 'text', 'video']:
+                            self.metadata_merged['object_content_identifier'][oi]['type'] = str(
+                                mime_parts[-2]) + '/' + str(mime_parts[-1])
+                oi += 1
+        # clean empty entries
         for mk, mv in list(self.metadata_merged.items()):
             if mv == '' or mv is None:
                 del self.metadata_merged[mk]
 
+    def harvest_all_metadata(self):
+        #if isinstance(extruct_metadata, dict):
+        #    embedded_exists = {k: v for k, v in extruct_metadata.items() if v}
+        #    self.extruct = embedded_exists.copy()
+
+        # harvest embedded and external metadata, then clean the merged result
+        # (clean_metadata deletes all entries which are None or '')
+        self.retrieve_metadata_embedded()
+        self.retrieve_metadata_external()
+        self.clean_metadata()
+
         self.logger.info('FsF-F2-01M : Type of object described by the metadata -: {}'.format(
             self.metadata_merged.get('object_type')))
 
@@ -333,7 +337,6 @@ def retrieve_metadata(self, extruct_metadata):
         if self.namespace_uri:
             self.namespace_uri = list(set(self.namespace_uri))
-        #print(json.dumps(self.metadata_unmerged))
 
     def retrieve_apis_standards(self):
         if self.landing_url is not None:
@@ -444,18 +447,101 @@ def retrieve_apis_standards(self):
                 '{} : Skipped external resources (e.g. OAI, re3data) checks since landing page could not be resolved'.
format('FsF-R1.3-01M'))
 
-    def retrieve_metadata_embedded(self, extruct_metadata={}):
-        isPid = False
-        if self.pid_scheme:
-            isPid = True
-        self.embedded_retrieved = True
+    def retrieve_metadata_embedded_extruct(self):
+        # extract contents from the landing page using extruct, which returns a dict with
+        # keys 'json-ld', 'microdata', 'microformat','opengraph','rdfa'
+        syntaxes = ['microdata', 'opengraph', 'json-ld']
+        extracted = {}
+        if self.landing_html:
+            try:
+                extruct_target = self.landing_html.encode('utf-8')
+            except Exception:
+                extruct_target = self.landing_html
+            try:
+                self.logger.info('%s : Trying to identify EMBEDDED Microdata, OpenGraph or JSON-LD -: %s' %
+                                 ('FsF-F2-01M', self.landing_url))
+                extracted = extruct.extract(extruct_target, syntaxes=syntaxes)
+            except Exception as e:
+                extracted = {}
+                self.logger.warning('%s : Failed to parse HTML embedded microdata or JSON -: %s' %
+                                    ('FsF-F2-01M', self.landing_url + ' ' + str(e)))
+            if isinstance(extracted, dict):
+                extracted = dict([(k, v) for k, v in extracted.items() if len(v) > 0])
+                if len(extracted) == 0:
+                    extracted = {}
+        else:
+            self.logger.warning('FsF-F2-01M : Skipped EMBEDDED metadata extraction, no landing page HTML available')
+        return extracted
+
+    def retrieve_metadata_embedded(self):
+        # ======= RETRIEVE METADATA FROM LANDING PAGE =======
+        response_status = None
+        try:
+            self.logger.info('FsF-F2-01M : Trying to resolve input URL -: ' + str(self.id))
+            # check if the input is a PID; in this case expand it to a URL
+            input_url = None
+            extruct_metadata = {}
+            input_urlscheme = urlparse(self.id).scheme
+            if input_urlscheme not in self.STANDARD_PROTOCOLS:
+                idhelper = IdentifierHelper(self.id)
+                self.pid_url = input_url = idhelper.get_identifier_url()
+            else:
+                input_url = self.id
+            if input_url:
+                requestHelper = RequestHelper(input_url, self.logger)
+                requestHelper.setAcceptType(AcceptTypes.default)  # request
+                neg_source, landingpage_html = requestHelper.content_negotiate('FsF-F1-02D', ignore_html=False)
+                if 'html' not in str(requestHelper.content_type):
+                    self.logger.info('FsF-F2-01M : Content type is ' + str(requestHelper.content_type) +
+                                     ', therefore skipping Embedded metadata (microdata, RDFa) tests')
+                response_status = requestHelper.response_status
+                if requestHelper.response_content:
+                    self.landing_url = requestHelper.redirect_url
+            else:
+                self.logger.warning('FsF-F2-01M : Skipping Embedded tests, no scheme/protocol detected to be able to resolve ' + str(self.id))
+        except Exception as e:
+            self.logger.error('FsF-F2-01M : Resource inaccessible -: ' + str(e))
+        if self.landing_url:
+            if self.landing_url not in ['https://datacite.org/invalid.html']:
+                if response_status == 200:
+                    self.raise_warning_if_javascript_page(requestHelper.response_content)
+                    up = urlparse(self.landing_url)
+                    self.landing_origin = '{uri.scheme}://{uri.netloc}'.format(uri=up)
+                    self.landing_html = requestHelper.getResponseContent()
+                    self.landing_content_type = requestHelper.content_type
+                elif response_status in [401, 402, 403]:
+                    self.isMetadataAccessible = False
+                    self.logger.warning(
+                        'FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}'.format(
+                            code=response_status))
+                else:
+                    self.isMetadataAccessible = False
+                    self.logger.warning(
+                        'FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}'.format(
+                            code=response_status))
+            else:
+                self.logger.warning(
+                    'FsF-F1-02D : Invalid DOI, identifier resolved to -: {code}'.format(code=self.landing_url))
+                self.landing_url = None
+        # we have to test landing_url again, because above it may have been set to None again (invalid DOI)
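+        # Example (hypothetical URLs) of a signposting Link header the next steps consume:
+        #   Link: <https://doi.org/10.1234/example>; rel="cite-as",
+        #         <https://example.org/meta.jsonld>; rel="describedby"; type="application/ld+json"
+        # set_signposting_header_links() parses this header; set_html_typed_links() collects the
+        # equivalent <link> elements from the landing page HTML.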
         if self.landing_url:
             self.set_html_typed_links()
-            self.logger.info('FsF-F2-01M : Starting to identify EMBEDDED metadata at -: ' + str(self.landing_url))
+            self.set_signposting_header_links(requestHelper.response_content, requestHelper.getResponseHeader())
+
+            self.set_signposting_typeset_links()
+
+            self.logger.info('FsF-F2-01M : Starting to analyse EMBEDDED metadata at -: ' + str(self.landing_url))
             #test if content is html otherwise skip embedded tests
             #print(self.landing_content_type)
             if 'html' in str(self.landing_content_type):
                 # ========= retrieve schema.org (embedded, or via content-negotiation if pid provided) =========
+                extruct_metadata = self.retrieve_metadata_embedded_extruct()
+                #if extruct_metadata:
                 ext_meta = extruct_metadata.get('json-ld')
                 self.logger.info('FsF-F2-01M : Trying to retrieve schema.org JSON-LD metadata from html page')
@@ -471,11 +557,12 @@ def retrieve_metadata_embedded(self, extruct_metadata={}):
                     self.metadata_sources.append((source_schemaorg, 'embedded'))
                     if schemaorg_dict.get('related_resources'):
                         self.related_resources.extend(schemaorg_dict.get('related_resources'))
-                    if schemaorg_dict.get('object_content_identifier'):
-                        self.logger.info('FsF-F3-01M : Found data links in Schema.org metadata -: ' +
-                                         str(schemaorg_dict.get('object_content_identifier')))
+                    #if schemaorg_dict.get('object_content_identifier'):
+                    #    self.logger.info('FsF-F3-01M : Found data links in Schema.org metadata -: ' +
+                    #                     str(schemaorg_dict.get('object_content_identifier')))
                     # add object type for future reference
                     self.merge_metadata(schemaorg_dict, self.landing_url, source_schemaorg, 'application/ld+json','http://schema.org', schemaorg_collector.namespaces)
+                    '''
                     for i in schemaorg_dict.keys():
                         if i in self.reference_elements:
@@ -571,7 +658,7 @@ def retrieve_metadata_embedded(self, extruct_metadata={}):
                 #========= retrieve signposting data links
                 self.logger.info('FsF-F2-01M : Trying to identify Typed Links to data items in html page')
 
-                data_sign_links = self.get_signposting_links('item')
+                data_sign_links = self.get_signposting_header_links('item')
                 if data_sign_links:
                     self.logger.info('FsF-F3-01M : Found data links in response header (signposting) -: ' +
                                      str(data_sign_links))
@@ -606,8 +693,23 @@ def retrieve_metadata_embedded(self, extruct_metadata={}):
                     if self.metadata_merged.get('object_content_identifier') is None:
                         self.metadata_merged['object_content_identifier'] = data_meta_links
                     # self.metadata_sources.append((MetaDataCollector.Sources.TYPED_LINK.value,'linked'))
-                #Now if an identifier has been detected in the metadata, potentially check for persistent identifier has to be repeated..
-                self.check_pidtest_repeat()
+                # signposting pid links
+                signposting_header_pids = self.get_signposting_header_links('cite-as')
+                signposting_html_pids = self.get_html_typed_links('cite-as')
+                signposting_pids = []
+                if isinstance(signposting_header_pids, list):
+                    signposting_pids = signposting_header_pids
+                if isinstance(signposting_html_pids, list):
+                    signposting_pids.extend(signposting_html_pids)
+                if signposting_pids:
+                    for signpid in signposting_pids:
+                        if self.metadata_merged.get('object_identifier'):
+                            if isinstance(self.metadata_merged.get('object_identifier'), list):
+                                if signpid.get('url') not in self.metadata_merged.get('object_identifier'):
+                                    self.metadata_merged['object_identifier'].append(signpid.get('url'))
+                        else:
+                            self.metadata_merged['object_identifier'] = [signpid.get('url')]
+
             else:
                 self.logger.warning('FsF-F2-01M : Skipped EMBEDDED metadata identification of landing page at -: ' +
                                     str(self.landing_url) + ' expected html content but received: ' +
@@ -615,9 +717,11 @@ def retrieve_metadata_embedded(self, extruct_metadata={}):
         else:
             self.logger.warning(
                 'FsF-F2-01M : Skipped EMBEDDED metadata identification, no landing page URL could be determined')
+        self.check_pidtest_repeat()
 
     def check_pidtest_repeat(self):
-        self.repeat_pid_check = False
+        if not self.repeat_pid_check:
+            self.repeat_pid_check = False
         if self.related_resources:
             for relation in self.related_resources:
                 if relation.get('relation_type') == 'isPartOf':
@@ -636,7 +740,7 @@ def check_pidtest_repeat(self):
                 idhelper = IdentifierHelper(pidcandidate)
                 found_id_scheme = idhelper.preferred_schema
                 if idhelper.is_persistent:
-                    found_pids[found_id_scheme] = pidcandidate
+                    found_pids[found_id_scheme] = idhelper.get_identifier_url()
         if len(found_pids) >= 1 and self.repeat_pid_check == False:
             self.logger.info(
                 'FsF-F2-01M : Found object identifier in metadata, repeating PID check for FsF-F1-02D')
@@ -645,10 +749,10 @@ def check_pidtest_repeat(self):
                 'FsF-F1-02D : Found object identifier in metadata during FsF-F2-01M, PID check was repeated')
             self.repeat_pid_check = True
             if 'doi' in found_pids:
-                self.id = found_pids['doi']
+                self.pid_url = found_pids['doi']
                 self.pid_scheme = 'doi'
             else:
-                self.pid_scheme, self.id = next(iter(found_pids.items()))
+                self.pid_scheme, self.pid_url = next(iter(found_pids.items()))
 
     # Comment: not sure if we really need a separate class as proposed below. Instead we can use a dictionary
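+    # A sketch (hypothetical values) of one such dictionary entry, matching the keys this
+    # class already uses for self.typed_links entries:
+    #   {'url': 'https://example.org/meta.xml', 'type': 'application/xml',
+    #    'rel': 'describedby', 'profile': None, 'source': 'signposting'}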
    # TODO (important) separate class to represent https://www.iana.org/assignments/link-relations/link-relations.xhtml
@@ -678,72 +782,149 @@ def set_html_typed_links(self):
                         type += '+xml'
                     #signposting links
                     #https://www.w3.org/2001/sw/RDFCore/20031212-rdfinhtml/ recommends: link rel="meta" as well as "alternate meta"
-                    if rel in ['meta','alternate meta','metadata','author','collection','describes','item','type','search','alternate','describedby','cite-as']:
+                    if rel in ['meta','alternate meta','metadata','author','collection','describes','item','type','search','alternate','describedby','cite-as','linkset','license']:
+                        source = 'typed'
+                        if rel in ['describedby', 'item','license','type','collection', 'linkset','cite-as']:
+                            source = 'signposting'
                         self.typed_links.append({
                             'url': href,
                             'type': type,
                             'rel': rel,
                             'profile': profile,
-                            'source' : 'typed'
+                            'source' : source
                         })
             except:
                 self.logger.info('FsF-F2-01M : Typed links identification failed -:')
         else:
             self.logger.info('FsF-F2-01M : Expected HTML to check for typed links but received empty string')
 
-    def set_signposting_links(self, content, header):
+    def set_signposting_typeset_links(self):
+        linksetlinks = []
+        linksetlink = {}
+        if self.get_html_typed_links('linkset'):
+            linksetlinks = self.get_html_typed_links('linkset')
+        elif self.get_signposting_header_links('linkset'):
+            linksetlinks = self.get_signposting_header_links('linkset')
+        if linksetlinks:
+            linksetlink = linksetlinks[0]
+        try:
+            if linksetlink.get('url'):
+                requestHelper = RequestHelper(linksetlink.get('url'), self.logger)
+                requestHelper.setAcceptType(AcceptTypes.linkset)
+                neg_source, linkset_data = requestHelper.content_negotiate('FsF-F1-02D')
+                if isinstance(linkset_data, dict):
+                    if isinstance(linkset_data.get('linkset'), list):
+                        validlinkset = None
+                        for candidatelinkset in linkset_data.get('linkset'):
+                            if isinstance(candidatelinkset, dict):
+                                if candidatelinkset.get('anchor') in [self.pid_url, self.landing_url]:
+                                    validlinkset = candidatelinkset
+                                    break
+                        if validlinkset:
+                            for linktype, links in validlinkset.items():
+                                if linktype != 'anchor':
+                                    if not isinstance(links, list):
+                                        links = [links]
+                                    for link in links:
+                                        self.typed_links.append({
+                                            'url': link.get('href'),
+                                            'type': link.get('type'),
+                                            'rel': linktype,
+                                            'profile': link.get('profile'),
+                                            'source' : 'signposting'
+                                        })
+                            self.logger.info('FsF-F2-01M : Found valid Signposting Linkset in provided JSON file')
+                        else:
+                            self.logger.warning('FsF-F2-01M : Found Signposting Linkset but none of the given anchors matches landing page or PID')
+                else:
+                    validlinkset = False
+                    parsed_links = self.parse_signposting_http_link_format(linkset_data.decode())
+                    try:
+                        if parsed_links[0].get('anchor'):
+                            self.logger.info('FsF-F2-01M : Found valid Signposting Linkset in provided text file')
+                        for parsed_link in parsed_links:
+                            if parsed_link.get('anchor') in [self.pid_url, self.landing_url]:
+                                self.typed_links.append(parsed_link)
+                                validlinkset = True
+                        if not validlinkset:
+                            self.logger.warning(
+                                'FsF-F2-01M : Found Signposting Linkset but none of the given anchors matches landing page or PID')
+                    except Exception as e:
+                        self.logger.warning('FsF-F2-01M : Found Signposting Linkset but could not correctly parse the file -: ' + str(e))
+        except Exception as e:
+            self.logger.warning('FsF-F2-01M : Failed to parse Signposting Linkset -: ' + str(e))
+
+    def get_signposting_object_identifier(self):
+        # check if there is a cite-as signposting link
+        signposting_pid = None
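+        # rel="cite-as" signposting links announce the persistent identifier of a resource
+        # (cf. https://signposting.org/); if one resolves to a known PID scheme, it is adopted
+        # below as pid_scheme / pid_url.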
+        signposting_pid_link_list = self.get_signposting_header_links('cite-as')
+        if signposting_pid_link_list:
+            for signposting_pid_link in signposting_pid_link_list:
+                signposting_pid = signposting_pid_link.get('url')
+                if signposting_pid:
+                    self.logger.info(
+                        'FsF-F1-02D : Found object identifier (cite-as) in signposting header links -: ' + str(
+                            signposting_pid))
+                    if not signposting_pid_link.get('type'):
+                        self.logger.warning(
+                            'FsF-F1-02D : Found cite-as signposting link without type attribute -: ' + str(
+                                signposting_pid))
+                    signidhelper = IdentifierHelper(signposting_pid)
+                    found_id = signidhelper.preferred_schema
+                    if signidhelper.is_persistent and self.pid_scheme is None:
+                        self.pid_scheme = found_id
+                        self.pid_url = signposting_pid
+
+    def parse_signposting_http_link_format(self, signposting_link_format_text):
+        found_signposting_links = []
+        for preparsed_link in signposting_link_format_text.split(','):
+            found_link = None
+            found_type, type_match, anchor_match = None, None, None
+            found_rel, rel_match = None, None
+            found_formats, formats_match = None, None
+            parsed_link = preparsed_link.strip().split(';')
+            found_link = parsed_link[0].strip()
+            for link_prop in parsed_link[1:]:
+                link_prop = str(link_prop).strip()
+                if link_prop.startswith('anchor'):
+                    anchor_match = re.search(r'anchor\s*=\s*\"?([^,;"]+)\"?', link_prop)
+                if link_prop.startswith('rel'):
+                    rel_match = re.search(r'rel\s*=\s*\"?([^,;"]+)\"?', link_prop)
+                elif link_prop.startswith('type'):
+                    type_match = re.search(r'type\s*=\s*\"?([^,;"]+)\"?', link_prop)
+                elif link_prop.startswith('formats'):
+                    formats_match = re.search(r'formats\s*=\s*\"?([^,;"]+)\"?', link_prop)
+            if type_match:
+                found_type = type_match[1]
+            if rel_match:
+                found_rel = rel_match[1]
+            if formats_match:
+                found_formats = formats_match[1]
+            signposting_link_dict = {
+                'url': found_link[1:-1],
+                'type': found_type,
+                'rel': found_rel,
+                'profile': found_formats,
+                'source': 'signposting'
+            }
+            if anchor_match:
+                signposting_link_dict['anchor'] = anchor_match[1]
+            if signposting_link_dict.get('url'):
+                found_signposting_links.append(signposting_link_dict)
+        return found_signposting_links
+
+    def set_signposting_header_links(self, content, header):
         header_link_string = header.get('Link')
         if header_link_string is not None:
-            for preparsed_link in header_link_string.split(','):
-                found_link = None
-                found_type, type_match = None, None
-                found_rel, rel_match = None, None
-                found_formats, formats_match = None, None
-                parsed_link = preparsed_link.strip().split(';')
-                found_link = parsed_link[0].strip()
-                for link_prop in parsed_link[1:]:
-                    link_prop = str(link_prop).strip()
-                    if link_prop.startswith('rel'):
-                        rel_match = re.search('rel\s*=\s*\"?([^,;"]+)\"?', link_prop)
-                    elif link_prop.startswith('type'):
-                        type_match = re.search('type\s*=\s*\"?([^,;"]+)\"?', link_prop)
-                    elif link_prop.startswith('formats'):
-                        formats_match = re.search('formats\s*=\s*\"?([^,;"]+)\"?', link_prop)
-                if type_match:
-                    found_type = type_match[1]
-                if rel_match:
-                    found_rel = rel_match[1]
-                if formats_match:
-                    found_formats = formats_match[1]
-                signposting_link_dict = {
-                    'url': found_link[1:-1],
-                    'type': found_type,
-                    'rel': found_rel,
-                    'profile': found_formats
-                }
-                if signposting_link_dict.get('url'):
-                    self.signposting_header_links.append(signposting_link_dict)
+            self.signposting_header_links = self.parse_signposting_http_link_format(header_link_string)
         self.logger.info('FsF-F1-02D : Found signposting links in response header of landing page -: ' + str(
             self.signposting_header_links))
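+        # Example (hypothetical URLs) of a Link header value parse_signposting_http_link_format()
+        # accepts:
+        #   '<https://doi.org/10.1234/example>; rel="cite-as",
+        #    <https://example.org/data.csv>; rel="item"; type="text/csv"'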
 
-        # check if there is a cite-as signposting link
-        signposting_pid = None
-        signposting_pid_link_list = self.get_signposting_links('cite-as')
-        if signposting_pid_link_list:
-            for signposting_pid_link in signposting_pid_link_list:
-                signposting_pid = signposting_pid_link.get('url')
-                if signposting_pid:
-                    self.logger.info(
-                        'FsF-F1-02D : Found object identifier (cite-as) in signposting header links -:' + str(
-                            signposting_pid))
-                    if not signposting_pid_link.get('type'):
-                        self.logger.warning(
-                            'FsF-F1-02D : Found cite-as signposting links has no type attribute-:' + str(
-                                signposting_pid))
-                    signidhelper = IdentifierHelper(signposting_pid)
-                    found_id = signidhelper.preferred_schema
-                    if signidhelper.is_persistent and self.pid_scheme is None:
-                        self.pid_scheme = found_id
+
     def get_html_typed_links(self, rel='item', allkeys=True):
         # Use Typed Links in HTTP Link headers to help machines find the resources that make up a publication.
@@ -754,18 +935,18 @@ def get_html_typed_links(self, rel='item', allkeys=True):
         for typed_link in self.typed_links:
             if typed_link.get('rel') in rel:
                 if not allkeys:
-                    typed_link = {tlkey: typed_link[tlkey] for tlkey in ['url','type']}
+                    typed_link = {tlkey: typed_link[tlkey] for tlkey in ['url','type','source']}
                 datalinks.append((typed_link))
         return datalinks
 
-    def get_signposting_links(self, rel='item', allkeys=True):
+    def get_signposting_header_links(self, rel='item', allkeys=True):
         signlinks = []
         if not isinstance(rel, list):
             rel = [rel]
         for signposting_links in self.signposting_header_links:
             if signposting_links.get('rel') in rel:
                 if not allkeys:
-                    signposting_links = {slkey: signposting_links[slkey] for slkey in ['url','type']}
+                    signposting_links = {slkey: signposting_links[slkey] for slkey in ['url','type','source']}
                 signlinks.append(signposting_links)
         if signlinks == []:
             signlinks = None
@@ -829,89 +1010,18 @@ def get_preferred_links(self, linklist):
                 other_links.append(link)
         return preferred_links + other_links
 
-    def retrieve_metadata_external(self):
-        test_content_negotiation = False
-        test_typed_links = False
-        test_signposting = False
-        test_embedded = False
-        self.logger.info(
-            'FsF-F2-01M : Starting to identify EXTERNAL metadata through content negotiation or typed links')
-
-        # ========= retrieve xml metadata namespaces by content negotiation ========
-        if self.landing_url:
-            if self.use_datacite is True:
-                target_url_list = [self.pid_url, self.landing_url]
-            else:
-                target_url_list = [self.landing_url]
-            if not target_url_list:
-                target_url_list = [self.origin_url]
-
-            target_url_list = set(tu for tu in target_url_list if tu is not None)
-            for target_url in target_url_list:
-                self.logger.info('FsF-F2-01M : Trying to retrieve XML metadata through content negotiation from URL -: '+str(target_url))
-                negotiated_xml_collector = MetaDataCollectorXML(loggerinst=self.logger,
-                                                                target_url=self.landing_url,
-                                                                link_type='negotiated')
-                source_neg_xml, metadata_neg_dict = negotiated_xml_collector.parse_metadata()
-                #print('### ',metadata_neg_dict)
-                neg_namespace = 'unknown xml'
-                metadata_neg_dict = self.exclude_null(metadata_neg_dict)
-                if len(negotiated_xml_collector.getNamespaces()) > 0:
-                    self.namespace_uri.extend(negotiated_xml_collector.getNamespaces())
-                    neg_namespace = negotiated_xml_collector.getNamespaces()[0]
-                self.linked_namespace_uri.update(negotiated_xml_collector.getLinkedNamespaces())
-                if metadata_neg_dict:
-                    self.metadata_sources.append((source_neg_xml, 'negotiated'))
-                    #if metadata_neg_dict.get('related_resources'):
-                    #    self.related_resources.extend(metadata_neg_dict.get('related_resources'))
-                    if metadata_neg_dict.get('object_content_identifier'):
-                        self.logger.info('FsF-F3-01M : Found data links in XML metadata -: ' +
-                                         str(metadata_neg_dict.get('object_content_identifier')))
-
-                    self.merge_metadata(metadata_neg_dict, self.landing_url, source_neg_xml,negotiated_xml_collector.getContentType(),neg_namespace)
-                    ####
-                    self.logger.log(
-                        self.LOG_SUCCESS, 'FsF-F2-01M : Found XML metadata through content negotiation-: ' +
-                        str(metadata_neg_dict.keys()))
-                    self.namespace_uri.extend(negotiated_xml_collector.getNamespaces())
-                # also add found xml namespaces without recognized data
-                elif len(negotiated_xml_collector.getNamespaces())>0:
-                    self.merge_metadata({}, self.landing_url, source_neg_xml,negotiated_xml_collector.getContentType(),neg_namespace)
-                # ========= retrieve json-ld/schema.org metadata namespaces by content negotiation ========
-                self.logger.info(
-                    'FsF-F2-01M : Trying to retrieve schema.org JSON-LD metadata through content negotiation from URL -: '+str(target_url))
-                schemaorg_collector = MetaDataCollectorSchemaOrg(loggerinst=self.logger,
-                                                                 sourcemetadata=None,
-                                                                 mapping=Mapper.SCHEMAORG_MAPPING,
-                                                                 pidurl=target_url)
-                source_schemaorg, schemaorg_dict = schemaorg_collector.parse_metadata()
-                schemaorg_dict = self.exclude_null(schemaorg_dict)
-                if schemaorg_dict:
-                    self.namespace_uri.extend(schemaorg_collector.namespaces)
-                    self.metadata_sources.append((source_schemaorg, 'negotiated'))
-                    #if schemaorg_dict.get('related_resources'):
-                    #    self.related_resources.extend(schemaorg_dict.get('related_resources'))
-                    if schemaorg_dict.get('object_content_identifier'):
-                        self.logger.info('FsF-F3-01M : Found data links in Schema.org metadata -: ' +
-                                         str(schemaorg_dict.get('object_content_identifier')))
-                    # add object type for future reference
-                    self.merge_metadata(metadata_neg_dict, target_url, source_schemaorg, 'application/ld+json', 'http://www.schema.org',schemaorg_collector.namespaces)
-
-                    self.logger.log(
-                        self.LOG_SUCCESS, 'FsF-F2-01M : Found Schema.org metadata through content negotiation-: ' +
-                        str(schemaorg_dict.keys()))
-                else:
-                    self.logger.info('FsF-F2-01M : Schema.org metadata through content negotiation UNAVAILABLE')
-
-        # ========= retrieve rdf metadata namespaces by content negotiation ========
-        source = MetaDataCollector.Sources.LINKED_DATA.value
-        #TODO: handle this the same way as with datacite based content negotiation->use the use_datacite switch
-        if self.pid_scheme == 'purl':
-            targeturl = self.pid_url
-        else:
-            targeturl = self.landing_url
-        self.logger.info('FsF-F2-01M : Trying to retrieve RDF metadata through content negotiation from URL -: '+str(targeturl))
+    def retrieve_metadata_external_rdf_negotiated(self, target_url_list=[]):
+        # ========= retrieve rdf metadata namespaces by content negotiation ========
+        source = MetaDataCollector.Sources.LINKED_DATA.value
+        #if self.pid_scheme == 'purl':
+        #    targeturl = self.pid_url
+        #else:
+        #    targeturl = self.landing_url
+        for targeturl in target_url_list:
+            self.logger.info(
+                'FsF-F2-01M : Trying to retrieve RDF metadata through content negotiation from URL -: ' + str(
+                    targeturl))
             neg_rdf_collector = MetaDataCollectorRdf(loggerinst=self.logger, target_url=targeturl, source=source)
             if neg_rdf_collector is not None:
                 source_rdf, rdf_dict = neg_rdf_collector.parse_metadata()
@@ -923,20 +1033,93 @@ def retrieve_metadata_external(self):
                self.namespace_uri.extend(neg_rdf_collector.getNamespaces())
                rdf_dict = self.exclude_null(rdf_dict)
                if rdf_dict:
-                    if rdf_dict.get('object_content_identifier'):
-                        self.logger.info('FsF-F3-01M : Found data links in RDF metadata -: ' +
-                                         str(len(rdf_dict.get('object_content_identifier'))))
                    test_content_negotiation = True
                    self.logger.log(self.LOG_SUCCESS,
                                    'FsF-F2-01M : Found Linked Data metadata -: {}'.format(str(rdf_dict.keys())))
                    self.metadata_sources.append((source_rdf, 'negotiated'))
-                    self.merge_metadata(rdf_dict, targeturl, source_rdf, neg_rdf_collector.getContentType(), 'http://www.w3.org/1999/02/22-rdf-syntax-ns', neg_rdf_collector.getNamespaces())
+                    self.merge_metadata(rdf_dict, targeturl, source_rdf, neg_rdf_collector.getContentType(),
+                                        'http://www.w3.org/1999/02/22-rdf-syntax-ns', neg_rdf_collector.getNamespaces())
                else:
                    self.logger.info('FsF-F2-01M : Linked Data metadata UNAVAILABLE')
 
+    def retrieve_metadata_external_schemaorg_negotiated(self, target_url_list=[]):
+        for target_url in target_url_list:
+            # ========= retrieve json-ld/schema.org metadata namespaces by content negotiation ========
+            self.logger.info(
+                'FsF-F2-01M : Trying to retrieve schema.org JSON-LD metadata through content negotiation from URL -: ' + str(
+                    target_url))
+            schemaorg_collector = MetaDataCollectorSchemaOrg(loggerinst=self.logger,
+                                                             sourcemetadata=None,
+                                                             mapping=Mapper.SCHEMAORG_MAPPING,
+                                                             pidurl=target_url)
+            source_schemaorg, schemaorg_dict = schemaorg_collector.parse_metadata()
+            schemaorg_dict = self.exclude_null(schemaorg_dict)
+            if schemaorg_dict:
+                self.namespace_uri.extend(schemaorg_collector.namespaces)
+                self.metadata_sources.append((source_schemaorg, 'negotiated'))
+
+                # add object type for future reference
+                self.merge_metadata(schemaorg_dict, target_url, source_schemaorg, 'application/ld+json',
+                                    'http://www.schema.org', schemaorg_collector.namespaces)
+
+                self.logger.log(
+                    self.LOG_SUCCESS, 'FsF-F2-01M : Found Schema.org metadata through content negotiation-: ' +
+                    str(schemaorg_dict.keys()))
+            else:
+                self.logger.info('FsF-F2-01M : Schema.org metadata through content negotiation UNAVAILABLE')
+
+    def retrieve_metadata_external_xml_negotiated(self, target_url_list=[]):
+        for target_url in target_url_list:
+            self.logger.info(
+                'FsF-F2-01M : Trying to retrieve XML metadata through content negotiation from URL -: ' + str(target_url))
+            negotiated_xml_collector = MetaDataCollectorXML(loggerinst=self.logger,
+                                                            target_url=target_url,
+                                                            link_type='negotiated')
+            source_neg_xml, metadata_neg_dict = negotiated_xml_collector.parse_metadata()
+            # print('### ',metadata_neg_dict)
+            neg_namespace = 'unknown xml'
+            metadata_neg_dict = self.exclude_null(metadata_neg_dict)
+            if len(negotiated_xml_collector.getNamespaces()) > 0:
+                self.namespace_uri.extend(negotiated_xml_collector.getNamespaces())
+                neg_namespace = negotiated_xml_collector.getNamespaces()[0]
+            self.linked_namespace_uri.update(negotiated_xml_collector.getLinkedNamespaces())
+            if metadata_neg_dict:
+                self.metadata_sources.append((source_neg_xml, 'negotiated'))
+
+                self.merge_metadata(metadata_neg_dict, target_url, source_neg_xml,
+                                    negotiated_xml_collector.getContentType(), neg_namespace)
+                ####
+                self.logger.log(
+                    self.LOG_SUCCESS, 'FsF-F2-01M : Found XML metadata through content negotiation-: ' +
+                    str(metadata_neg_dict.keys()))
+                self.namespace_uri.extend(negotiated_xml_collector.getNamespaces())
+            # also add found xml namespaces without recognized data
+            elif len(negotiated_xml_collector.getNamespaces()) > 0:
+                self.merge_metadata({}, target_url, source_neg_xml, negotiated_xml_collector.getContentType(),
+                                    neg_namespace)
 
+    def retrieve_metadata_external_oai_ore(self):
+        oai_link = self.get_html_typed_links('resourcemap')
+        if oai_link:
+            # get_html_typed_links() returns a list of link dicts; use the first resourcemap link
+            oai_link = oai_link[0]
+            if oai_link.get('type') in ['application/atom+xml']:
+                #elif metadata_link['type'] in ['application/atom+xml'] and metadata_link['rel'] == 'resourcemap':
+                self.logger.info(
+                    'FsF-F2-01M : Found e.g. Typed Links in HTML Header linking to OAI ORE (atom) Metadata -: (' +
+                    str(oai_link['type'] + ')'))
+                ore_atom_collector = MetaDataCollectorOreAtom(loggerinst=self.logger,
+                                                              target_url=oai_link['url'])
+                source_ore, ore_dict = ore_atom_collector.parse_metadata()
+                ore_dict = self.exclude_null(ore_dict)
+                if ore_dict:
+                    self.logger.log(self.LOG_SUCCESS,
+                                    'FsF-F2-01M : Found OAI ORE metadata -: {}'.format(str(ore_dict.keys())))
+                    self.metadata_sources.append((source_ore, 'linked'))
+                    self.merge_metadata(ore_dict, oai_link['url'], source_ore, ore_atom_collector.getContentType(),
                                        'http://www.openarchives.org/ore/terms',
                                        'http://www.openarchives.org/ore/terms')
 
+    def retrieve_metadata_external_datacite(self):
        if self.pid_scheme:
            # ================= datacite by content negotiation ===========
            # in case use_datacite is false use the landing page URL for content negotiation, otherwise the pid url
@@ -955,52 +1138,56 @@ def retrieve_metadata_external(self):
                self.metadata_sources.append((source_dcitejsn, 'negotiated'))
                self.logger.log(self.LOG_SUCCESS,
                                'FsF-F2-01M : Found Datacite metadata -: {}'.format(str(dcitejsn_dict.keys())))
-                if dcitejsn_dict.get('object_content_identifier'):
-                    self.logger.info('FsF-F3-01M : Found data links in Datacite metadata -: ' +
-                                     str(dcitejsn_dict.get('object_content_identifier')))
-                #if dcitejsn_dict.get('related_resources'):
-                #    self.related_resources.extend(dcitejsn_dict.get('related_resources'))
+
                self.namespace_uri.extend(dcite_collector.getNamespaces())
                self.merge_metadata(dcitejsn_dict,datacite_target_url,source_dcitejsn,dcite_collector.getContentType(),
                                    'http://datacite.org/schema',dcite_collector.getNamespaces())
-
            else:
                self.logger.info('FsF-F2-01M : Datacite metadata UNAVAILABLE')
        else:
            self.logger.info('FsF-F2-01M : Not a PID, therefore Datacite metadata (json) not requested.')
 
-        sign_header_links = []
-        rel_meta_links = []
-        sign_meta_links = []
-
-        #typed_metadata_links = self.typed_links
-        typed_metadata_links = self.get_html_typed_links(['describedby', 'meta','alternate meta','metadata'], False)
-        #signposting header links
-        if self.get_signposting_links('describedby'):
-            sign_header_links = self.get_signposting_links('describedby', False)
+    def get_connected_metadata_links(self, allowedmethods=['signposting', 'typed', 'guessed']):
+        # collect all links that may lead to metadata, as given by signposting headers,
+        # html typed links, guessing or html href
+        connected_metadata_links = []
+        signposting_header_links = []
+        # signposting html links
+        signposting_html_links = self.get_html_typed_links(['describedby'])
+        # signposting header links
+        if self.get_signposting_header_links('describedby'):
+            signposting_header_links = self.get_signposting_header_links('describedby', False)
            self.logger.info(
-                'FsF-F1-02D : Found metadata link as (describedby) signposting header links -:' + str(
-                    sign_header_links))
+                'FsF-F2-01M : Found metadata link as (describedby) signposting header links -: ' + str(
+                    signposting_header_links))
            self.metadata_sources.append((MetaDataCollector.Sources.SIGN_POSTING.value, 'signposting'))
 
-        guessed_metadata_link = self.get_guessed_xml_link()
-        href_metadata_links = self.get_html_xml_links()
-
-        if sign_header_links:
-            typed_metadata_links.extend(sign_header_links)
-
-        if href_metadata_links:
-            typed_metadata_links.extend(href_metadata_links)
-
-        if guessed_metadata_link is not None:
-            typed_metadata_links.append(guessed_metadata_link)
-
+        if signposting_header_links:
+            connected_metadata_links.extend(signposting_header_links)
+        if signposting_html_links:
+            connected_metadata_links.extend(signposting_html_links)
+
+        #if signposting_typeset_links:
+        #    connected_metadata_links.extend(signposting_typeset_links)
+
+        html_typed_links = self.get_html_typed_links(['meta', 'alternate meta', 'metadata', 'alternate'], False)
+        if html_typed_links:
+            connected_metadata_links.extend(html_typed_links)
+        if 'guessed' in allowedmethods:
+            guessed_metadata_link = self.get_guessed_xml_link()
+            href_metadata_links = self.get_html_xml_links()
+            if href_metadata_links:
+                connected_metadata_links.extend(href_metadata_links)
+            if guessed_metadata_link is not None:
+                connected_metadata_links.append(guessed_metadata_link)
+        return connected_metadata_links
+
+    def retrieve_metadata_external_linked_metadata(self, allowedmethods=['signposting', 'typed', 'guessed']):
+        # follow all links identified as typed links or signposting links and get xml or rdf metadata from there
+        typed_metadata_links = self.get_connected_metadata_links(allowedmethods)
        if typed_metadata_links:
-            typed_rdf_collector = None
-            #unique entries for typed links
+            # unique entries for typed links
            typed_metadata_links = [dict(t) for t in {tuple(d.items()) for d in typed_metadata_links}]
            typed_metadata_links = self.get_preferred_links(typed_metadata_links)
-
            for metadata_link in typed_metadata_links:
                if not metadata_link['type']:
                    # guess type based on e.g. file suffix
@@ -1008,14 +1195,13 @@ def retrieve_metadata_external(self):
                        metadata_link['type'] = mimetypes.guess_type(metadata_link['url'])[0]
                    except Exception:
                        pass
-                if re.search(r'[\/+](rdf(\+xml)?|(?:x-)?turtle|ttl|n3|n-triples|ld\+json)+$', str(metadata_link['type'])) :
+                if re.search(r'[\/+](rdf(\+xml)?|(?:x-)?turtle|ttl|n3|n-triples|ld\+json)+$', str(metadata_link['type'])):
                    self.logger.info('FsF-F2-01M : Found e.g. Typed Links in HTML Header linking to RDF Metadata -: (' +
                                     str(metadata_link['type']) + ' ' + str(metadata_link['url']) + ')')
                    source = MetaDataCollector.Sources.RDF_TYPED_LINKS.value
-                    #if not typed_rdf_collector:
                    typed_rdf_collector = MetaDataCollectorRdf(loggerinst=self.logger,
-                                                               target_url=metadata_link['url'],
-                                                               source=source)
+                                                               target_url=metadata_link['url'],
+                                                               source=source)
                    if typed_rdf_collector is not None:
                        source_rdf, rdf_dict = typed_rdf_collector.parse_metadata()
                        self.namespace_uri.extend(typed_rdf_collector.getNamespaces())
@@ -1023,40 +1209,24 @@ def retrieve_metadata_external(self):
                        if rdf_dict:
                            test_typed_links = True
                            self.logger.log(self.LOG_SUCCESS,
-                                            'FsF-F2-01M : Found Linked Data metadata -: {}'.format(
+                                            'FsF-F2-01M : Found Linked Data (RDF) metadata -: {}'.format(
                                                str(rdf_dict.keys())))
-                            self.metadata_sources.append((source_rdf, 'linked'))
-                            self.merge_metadata(rdf_dict, metadata_link['url'], source_rdf,typed_rdf_collector.getContentType(),'http://www.w3.org/1999/02/22-rdf-syntax-ns',typed_rdf_collector.getNamespaces())
+                            self.metadata_sources.append((source_rdf, metadata_link['source']))
+                            self.merge_metadata(rdf_dict, metadata_link['url'], source_rdf,
+                                                typed_rdf_collector.getContentType(),
+                                                'http://www.w3.org/1999/02/22-rdf-syntax-ns',
+                                                typed_rdf_collector.getNamespaces())
                        else:
                            self.logger.info('FsF-F2-01M : Linked Data metadata UNAVAILABLE')
-                    #else:
-                    #    self.logger.info(
-                    #        'FsF-F2-01M :A RDF metadata representation already has been collected, therefore ignoring Typed Link in HTML Header linking to RDF Metadata -: (' +
-                    #        str(metadata_link['type']) + ' ' + str(metadata_link['url']) + ')')
-                elif metadata_link['type'] in ['application/atom+xml'] and metadata_link['rel'] == 'resourcemap':
-                    self.logger.info(
-                        'FsF-F2-01M : Found e.g. Typed Links in HTML Header linking to OAI ORE (atom) Metadata -: (' +
-                        str(metadata_link['type'] + ')'))
-                    ore_atom_collector = MetaDataCollectorOreAtom(loggerinst=self.logger,
-                                                                  target_url=metadata_link['url'])
-                    source_ore, ore_dict = ore_atom_collector.parse_metadata()
-                    ore_dict = self.exclude_null(ore_dict)
-                    if ore_dict:
-                        self.logger.log(self.LOG_SUCCESS,
-                                        'FsF-F2-01M : Found OAI ORE metadata -: {}'.format(str(ore_dict.keys())))
-                        self.metadata_sources.append((source_ore, 'linked'))
-                        self.merge_metadata(ore_dict,metadata_link['url'],source_ore,ore_atom_collector.getContentType(), 'http://www.openarchives.org/ore/terms','http://www.openarchives.org/ore/terms')
                elif re.search(r'[+\/]xml$', str(metadata_link['type'])):
-                    #elif metadata_link['type'] in ['text/xml', 'application/xml', 'application/x-ddi-l+xml',
-                    #                               'application/x-ddametadata+xml']:
                    self.logger.info('FsF-F2-01M : Found e.g. Typed Links in HTML Header linking to XML Metadata -: (' +
-                                     str(metadata_link['type'] + ' '+metadata_link['url']+')'))
+                                     str(metadata_link['type'] + ' ' + metadata_link['url'] + ')'))
                    linked_xml_collector = MetaDataCollectorXML(loggerinst=self.logger,
                                                                target_url=metadata_link['url'],
-                                                                link_type=metadata_link.get('source'), pref_mime_type=metadata_link['type'])
+                                                                link_type=metadata_link.get('source'),
+                                                                pref_mime_type=metadata_link['type'])
 
                    if linked_xml_collector is not None:
                        source_linked_xml, linked_xml_dict = linked_xml_collector.parse_metadata()
@@ -1064,29 +1234,48 @@ def retrieve_metadata_external(self):
                        if len(linked_xml_collector.getNamespaces()) > 0:
                            lkd_namespace = linked_xml_collector.getNamespaces()[0]
                            self.namespace_uri.extend(linked_xml_collector.getNamespaces())
-                        #print(lkd_namespace)
-                        print('XML LINKED NS',linked_xml_collector.getLinkedNamespaces())
                        self.linked_namespace_uri.update(linked_xml_collector.getLinkedNamespaces())
                        if linked_xml_dict:
-                            self.metadata_sources.append((source_linked_xml, 'linked'))
-                            #if linked_xml_dict.get('related_resources'):
-                            #    self.related_resources.extend(linked_xml_dict.get('related_resources'))
-                            if linked_xml_dict.get('object_content_identifier'):
-                                self.logger.info('FsF-F3-01M : Found data links in XML metadata -: ' +
-                                                 str(linked_xml_dict.get('object_content_identifier')))
-                            # add object type for future reference
-
-                            self.merge_metadata(linked_xml_dict,metadata_link['url'],source_linked_xml,linked_xml_collector.getContentType(), lkd_namespace)
-
-                            self.logger.log(
-                                self.LOG_SUCCESS,
+                            self.metadata_sources.append((MetaDataCollector.Sources.XML_TYPED_LINKS.value, metadata_link['source']))
+                            self.merge_metadata(linked_xml_dict, metadata_link['url'], source_linked_xml,
+                                                linked_xml_collector.getContentType(), lkd_namespace)
+
+                            self.logger.log(self.LOG_SUCCESS,
                                            'FsF-F2-01M : Found XML metadata through typed links-: ' +
                                            str(linked_xml_dict.keys()))
                        # also add found xml namespaces without recognized data
-                        elif len(linked_xml_collector.getNamespaces())>0:
-                            print('Unidentified XML: ', str(lkd_namespace))
-                            self.merge_metadata(dict(), metadata_link['url'], source_linked_xml,linked_xml_collector.getContentType(),lkd_namespace, linked_xml_collector.getNamespaces())
+                        elif len(linked_xml_collector.getNamespaces()) > 0:
+                            self.merge_metadata(dict(), metadata_link['url'], source_linked_xml,
+                                                linked_xml_collector.getContentType(), lkd_namespace,
+                                                linked_xml_collector.getNamespaces())
                else:
-                    self.logger.info('FsF-F2-01M : Found typed link or signposting link but cannot handle given mime type -:'+str(metadata_link['type']))
+                    self.logger.info(
+                        'FsF-F2-01M : Found typed link or signposting link but cannot handle given mime type -: ' + str(
+                            metadata_link['type']))
+
+    def retrieve_metadata_external(self, target_url=None):
+        self.logger.info(
+            'FsF-F2-01M : Starting to identify EXTERNAL metadata through content negotiation or typed (signposting) links')
+        if self.landing_url:
+            if self.use_datacite is True:
+                target_url_list = [self.pid_url, self.landing_url]
+            else:
+                target_url_list = [self.landing_url]
+            if not target_url_list:
+                target_url_list = [self.origin_url]
+
+            if isinstance(target_url, str):
+                target_url_list = [target_url]
+
+            target_url_list = set(tu for tu in target_url_list if tu is not None)
+
+            self.retrieve_metadata_external_xml_negotiated(target_url_list)
+            self.retrieve_metadata_external_schemaorg_negotiated(target_url_list)
+            self.retrieve_metadata_external_rdf_negotiated(target_url_list)
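+            # the harvesters below do not use the negotiation target list: Datacite JSON is
+            # requested via the PID, linked metadata and OAI-ORE follow typed/signposting links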
self.retrieve_metadata_external_datacite() + self.retrieve_metadata_external_linked_metadata(['signposting', 'linked']) + self.retrieve_metadata_external_oai_ore() if self.reference_elements: self.logger.debug('FsF-F2-01M : Reference metadata elements NOT FOUND -: {}'.format( @@ -1139,6 +1328,7 @@ def check_persistent_identifier(self): return persistent_identifier_check.getResult() def check_unique_persistent(self): + self.get_signposting_object_identifier() return self.check_unique_identifier(), self.check_persistent_identifier() def check_minimal_metatadata(self, include_embedded=True):