#!/usr/bin/python
# -*- coding: UTF-8 -*-
#
# ----------------------------------------------------------------------
# diff --git a/miscutil/lib/collclean_cron.py b/miscutil/lib/collclean_cron.py
# new file mode 100644
# ----------------------------------------------------------------------
"""Cron driver that cleans collaboration names stored in MARC field 710__g.

For every selected record the 710 datafields are rendered as MARCXML with
their ``g`` subfields normalised by the helpers of ``collclean_lib``.
Records that actually change are collected into a ``.correct`` file and a
log of all decisions is mailed out.
"""

import codecs
import datetime
import os
import re
import subprocess
from xml.sax.saxutils import escape

try:
    from collclean_lib import coll_cleanforthe
    from collclean_lib import coll_clean710
    from collclean_lib import coll_split
except ImportError:
    # This listing carries both modules of the change set; the three
    # helpers are defined in the collclean_lib section further down, so a
    # missing separate module is not an error here.
    pass

# this should come from a KB
expcoll = []

# Defaults used by main(); override them through its keyword arguments.
XML_PATH = "/afs/desy.de/user/l/library/dok/inspire/correct/"
MAIL_RECIPIENT = 'kirsten.sachs@desy.de'
LOOKBACK_DAYS = 10

# MARCXML fragments emitted by writexml() and main().
_DATAFIELD_OPEN = '  <datafield tag="710" ind1="%s" ind2="%s">\n'
_DATAFIELD_CLOSE = '  </datafield>\n'
_SUBFIELD = '    <subfield code="%s">%s</subfield>\n'
_RECORD_OPEN = '<record>\n  <controlfield tag="001">%i</controlfield>\n'
_RECORD_CLOSE = '</record>\n'


def coll_check(colls, logtext):
    """Sanity-check cleaned collaboration names.

    :param colls: list of cleaned collaboration names from one subfield.
    :param logtext: log accumulated so far.
    :return: ``(colls, logtext)``.  ``colls`` is replaced by ``None`` when
        the value was split into several names and at least one of them is
        not listed in the module-level ``expcoll``; the log is extended
        with the corresponding remarks.
    """
    for coll in colls:
        lowered = coll.lower()
        # A leftover 'coll'/'borati' hints at a misspelled "collaboration",
        # unless it belongs to "collider" or "college".
        looks_misspelled = 'coll' in lowered or 'borati' in lowered
        harmless = 'collider' in lowered or 'college' in lowered
        if looks_misspelled and not harmless:
            logtext += 'COLL?? %s ' % coll

    # A split is only trusted when every part is a known experiment name.
    if len(colls) > 1 and not all(coll in expcoll for coll in colls):
        colls = None
        logtext += ' Dont dare to change\n'
    return colls, logtext


def coll_splitandclean(recid, value, logtext):
    """Split one 710__g value into collaborations and clean each of them.

    :param recid: record id, used only for log lines.
    :param value: raw subfield value.
    :param logtext: log accumulated so far.
    :return: ``(colls, logtext)`` as returned by :func:`coll_check`, i.e.
        ``colls`` is a list of cleaned names or ``None``.
    """
    newcolls = []
    for original in coll_split(value):
        coll, author = coll_cleanforthe(original)
        newcolls.append(coll_clean710(coll))
        if author:
            logtext += '%09i found author: %s\n' % (recid, author)
            logtext += ' in: %s\n' % original
    return coll_check(newcolls, logtext)


def writexml(recid, mark, changed):
    """Render one 710 field as MARCXML, cleaning its ``g`` subfields.

    :param recid: record id, used only for log lines.
    :param mark: field tuple; ``mark[0]`` is the sequence of
        ``(code, value)`` subfields, ``mark[1]`` and ``mark[2]`` are the two
        indicators.
    :param changed: change flag carried over from earlier fields of the
        same record.
    :return: ``(xmltext, logtext, changed)``; ``changed`` is True when it
        was already True or when a ``g`` subfield was modified.

    Cleaning is applied only when both indicators are blank and the field
    carries no ``9`` subfield equal to ``CURATOR`` (case-insensitive).
    When a value splits into several collaborations the first one stays in
    place and each further one gets its own datafield with a single ``g``
    subfield.  Subfield values are XML-escaped on output.
    """
    subfields, ind1, ind2 = mark[0], mark[1], mark[2]
    logtext = ''

    curator = any(field[0] == '9' and field[1].upper().strip() == 'CURATOR'
                  for field in subfields)
    if curator:
        logtext += '%09i Skipping %s\n' % (recid, mark)
    cleanable = ind1 == ' ' and ind2 == ' ' and not curator

    extracolls = []
    xmlparts = [_DATAFIELD_OPEN % (ind1, ind2)]
    for field in subfields:
        subf, value = field[0], field[1]
        if cleanable and subf == 'g':
            logtext += '%09i %s\n' % (recid, value)
            colls, logtext = coll_splitandclean(recid, value, logtext)
            if colls and colls[0] != value.strip():
                changed = True
                for coll in colls:
                    logtext += ' %s\n' % coll
                value = colls[0]
                # Accumulate, so that a second g subfield in the same
                # datafield cannot discard the extras of the first one.
                extracolls.extend(colls[1:])
        xmlparts.append(_SUBFIELD % (subf, escape(value)))
    xmlparts.append(_DATAFIELD_CLOSE)

    for coll in extracolls:
        xmlparts.append(_DATAFIELD_OPEN % (ind1, ind2))
        # Always code g: the loop variable above may end on another code.
        xmlparts.append(_SUBFIELD % ('g', escape(coll)))
        xmlparts.append(_DATAFIELD_CLOSE)
    return (''.join(xmlparts), logtext, changed)


def _to_utf8(text):
    """Return *text* as bytes, encoding it as UTF-8 if it is not bytes yet."""
    if isinstance(text, bytes):
        return text
    return text.encode('utf8')


def _send_log_mail(subject, body, recipient):
    """Pipe *body* into the local ``mail`` command addressed to *recipient*.

    The command is started from an argument list, without a shell, so
    quotes, ``$`` or backticks in the subject or body are passed literally.

    :return: True when ``mail`` could be started and exited with status 0.
    """
    try:
        proc = subprocess.Popen(['mail', '-s', subject, recipient],
                                stdin=subprocess.PIPE)
    except OSError:
        return False
    proc.communicate(_to_utf8(body))
    return proc.returncode == 0


def main(xmlpath=XML_PATH, recipient=MAIL_RECIPIENT, days=LOOKBACK_DAYS):
    """Clean 710__g of recently added HEP records and mail the log.

    :param xmlpath: directory prefix (with trailing slash) of the output
        file ``coll_<YYYYMMDD>.correct``.
    :param recipient: mail address that receives the log.
    :param days: size of the ``dadd`` search window, counted back from now.

    Selected are records with a non-empty 710__g added within the window,
    plus all records whose 710__g matches 'collaboration', restricted to
    the HEP collection.  ``expcoll`` is filled with the 710__g values of
    the Experiments collection before processing.  If the output file
    cannot be opened, that fact is mailed and nothing else is done.
    """
    from invenio.search_engine import get_collection_reclist
    from invenio.search_engine import search_pattern
    from invenio.search_engine import get_record
    from invenio.search_engine import get_fieldvalues

    stopdate = datetime.datetime.now()
    startdate = stopdate - datetime.timedelta(days=days)
    filedate = stopdate.strftime('%Y%m%d')
    stampofstopdate = stopdate.strftime('%Y-%m-%d')
    stampofstartdate = startdate.strftime('%Y-%m-%d')

    exp = get_collection_reclist("Experiments")
    hep = get_collection_reclist("HEP")

    recall = search_pattern(p="710__g:/^./")
    for rec in recall.intersection(exp):
        expcoll.extend(get_fieldvalues(rec, '710__g'))

    reccoll = search_pattern(p="710__g:'collaboration'")
    recids = search_pattern(p="dadd:%s->%s" % (stampofstartdate,
                                               stampofstopdate))
    recids = recids.intersection(recall)
    recids = recids.union(reccoll)
    recids = recids.intersection(hep)

    mail_subject = 'CollClean %s' % filedate
    filename = '%scoll_%s.correct' % (xmlpath, filedate)
    # Open first so that an unwritable target is detected before any work.
    try:
        filexml = open(filename, 'wb')
    except IOError:
        _send_log_mail(mail_subject, 'Cant open file %s\n' % filename,
                       recipient)
        return

    logparts = ['Processing coll_%s\n' % filedate]
    try:
        xmlparts = ['<collection>\n']
        for rec in recids:
            changed = False
            recparts = [_RECORD_OPEN % rec]
            # A record without any 710 field yields nothing to iterate.
            for mark in get_record(rec).get('710') or []:
                thisxml, thislog, changed = writexml(rec, mark, changed)
                logparts.append(thislog)
                recparts.append(thisxml)
            if changed:
                recparts.append(_RECORD_CLOSE)
                xmlparts.extend(recparts)
            else:
                logparts.append(' unchanged\n')
        xmlparts.append('</collection>\n')
        filexml.write(_to_utf8(''.join(xmlparts)))
    finally:
        filexml.close()

    _send_log_mail(mail_subject, ''.join(logparts), recipient)


if __name__ == "__main__":
    main()


# ----------------------------------------------------------------------
# diff --git a/miscutil/lib/collclean_lib.py b/miscutil/lib/collclean_lib.py
# new file mode 100644
# ----------------------------------------------------------------------
# Helpers that split and normalise collaboration names.

import os
import re
import codecs

# --- patterns used by coll_cleanforthe --------------------------------
_RE_FOR_THE = re.compile(
    r'(?:^| )+(?:for the|on behalf of the|on behalf of|representing the'
    r'|representing)(?: |$)+', re.IGNORECASE)
# Names in which "for the" is part of the name itself.
_RE_FOR_THE_IN_NAME = re.compile(
    r'ASSOCIATION FOR THE|CENTER FOR THE|INSTITUTE FOR THE'
    r'|FOR THE DEVELOPMENT', re.IGNORECASE)
_RE_BRACKETED = re.compile(r'^ *\(.*\) *$')
_RE_ENDS_COLL_OR_TEAM = re.compile(r'(?:collaboration| team)$', re.IGNORECASE)
_RE_AUTHOR_SWAP = re.compile(r'^ *([\w.-]+) (.+)$')
_RE_START = re.compile(r'^ *(group|team|consortium) +(.*) *$', re.IGNORECASE)
_RE_THE = re.compile(r'^ *the +', re.IGNORECASE)
_RE_FOR = re.compile(r'^ *for +', re.IGNORECASE)
_RE_COLL = re.compile(r'(?:^| |\/|-)+collaborations?\.?(?=\W|$)',
                      re.IGNORECASE)
_RE_GROUP = re.compile(r'(?:^| |\/|-)+group(?=\W|$)', re.IGNORECASE)
_RE_WGROUP = re.compile(r'(?:^| |\/|-)+working group(?=\W|$)', re.IGNORECASE)
_RE_TEAM = re.compile(r'(?:^| |\/|-)+team(?=\W|$)', re.IGNORECASE)
_RE_CONSORTIUM = re.compile(r'(?:^| |\/|-)+consortium(?=\W|$)', re.IGNORECASE)

# --- tables used by coll_cleansimple ----------------------------------
_KNOWN_COLLS = (
    'ATLAS', 'CALICE', 'ALICE', 'CMS', 'CDF', 'LHCb', 'LHCf', 'H1',
    'ZEUS', 'CLEO', 'HERMES', 'HERA-B', 'ALEPH', 'DELPHI', 'OPAL', 'L3',
    'CosmoALEPH', 'SLD', 'AMS', 'BTeV', 'BaBar', 'RHIC', 'NuSTAR', 'PHENIX',
    'STAR', 'BooNE', 'MiniBooNE', 'MicroBooNE', 'SciBooNE', 'CAST',
    'CELSIUS', 'CERES', 'CMD', 'CTA', 'GERDA', 'K2K', 'T2K', 'MAGIC',
    'NuTeV', 'Planck', 'PANDA', 'Hyper-Kamiokande', 'Super-Kamiokande',
    'KLOE', 'KM3NeT', 'NEMO', 'Swift', 'IceCube', 'ARGUS', 'CUORE',
    'CUORICINO', 'DarkSide', 'Daya Bay', 'Fermi-LAT', 'GLAST', 'KASCADE',
    'VERITAS', 'VIRGO', 'Pierre Auger', 'Majorana', 'MINERvA', 'MINOS',
    'Muon g-2', 'XENON', 'Muon Collider', 'Linear Collider', 'European Muon',
)
# (name, separator placed in front of a roman numeral); a sequence rather
# than a dict so that the rules are applied in a fixed order.
_KNOWN_SUBCOLLS = (
    ('Belle', '-'), ('BES', ''), ('CDF', '-'), ('Kamiokande', '-'),
    ('CLEO', '-'),
)


def _build_coll_rules():
    """Compile, once, the (pattern, replacement) triples per known name."""
    rules = []
    for name in _KNOWN_COLLS:
        esc = re.escape(name)
        rules.append((
            # name at the end of a longer string -> bring it to the front
            (re.compile(r'^ +(.+)[ -]+%s +' % esc, re.IGNORECASE),
             r' %s \1 ' % name),
            # canonical case, drop trailing separators
            (re.compile(r' +%s[ \/-]+' % esc, re.IGNORECASE),
             r' %s ' % name),
            # one or two trailing characters are attached with '-'
            (re.compile(r' +%s[ \/-]*([0-9]?[A-Z0-9]) ' % esc, re.IGNORECASE),
             r' %s-\1 ' % name),
        ))
    return rules


def _build_subcoll_rules():
    """Compile, once, the (pattern, replacement) triples per sub-collab."""
    rules = []
    for name, letter in _KNOWN_SUBCOLLS:
        esc = re.escape(name)
        rules.append((
            # correct spelling (case), get rid of '-'
            (re.compile(r' +%s[ \/-]+' % esc, re.IGNORECASE),
             r' %s ' % name),
            # deal with roman numbering
            (re.compile(r' +%s[ \/-]*(I+) ' % esc, re.IGNORECASE),
             r' %s%s\1' % (name, letter)),
            # if only one trailing character, use '-'
            (re.compile(r' +%s[ \/-]+([A-Z0-9]) ' % esc, re.IGNORECASE),
             r' %s-\1 ' % name),
        ))
    return rules


_COLL_RULES = _build_coll_rules()
_SUBCOLL_RULES = _build_subcoll_rules()

# --- ordered substitutions used by coll_clean710 ----------------------
_CLEAN710_RULES = (
    (re.compile(r' (?:DO|DØ) '), ' D0 '),
    (re.compile(r' DZero ', re.IGNORECASE), ' D0 '),
    (re.compile(r' +Fermi[ \/-](?:LAT|Large[ -]Area[ -]Telescope) ',
                re.IGNORECASE), r' Fermi-LAT '),
    (re.compile(r' +GLAST[ \/-](?:LAT|Large[ -]Area[ -]Telescope) ',
                re.IGNORECASE), r' GLAST LAT '),
    (re.compile(r' \W*(?:bar|overline)\W*P\W*ANDA\W* ', re.IGNORECASE),
     r' PANDA '),
    (re.compile(r' Double[ \/-]Chooz ', re.IGNORECASE), r' Double Chooz '),
    (re.compile(r' Daya[ \/-]Bay ', re.IGNORECASE), r' Daya Bay '),
    (re.compile(r'\$B\W*small A}B\W*small AR}\$'), 'BaBar'),
    (re.compile(r' +LHC[ \/-]*([a-z])[ \/-]+'), r' LHC\1 '),
    (re.compile(r' R.and.D '), ' R&D '),
    (re.compile(r' H\. ?E\. ?S\. ?S\.? +'), ' HESS '),
    (re.compile(r' PROMICE[ \/-]WASA '), ' PROMICE/WASA '),
    (re.compile(r' WASA[ \/-]PROMICE '), ' PROMICE/WASA '),
    (re.compile(r' CELSIUS[ \/-]WASA '), ' CELSIUS/WASA '),
    (re.compile(r' WASA[ \/-]*[aA][tT][\/-]*COSY '), ' WASA-at-COSY '),
    (re.compile(r' CERES[ \/-]NA'), ' CERES/NA'),
    (re.compile(r' EHS[ \/-]NA'), ' EHS/NA'),
)
_RE_SPACES = re.compile(r' +')


def coll_split(value):
    """Split *value* at ' and ' and at ', ' and return the list of parts."""
    colls = []
    for part in value.split(' and '):
        colls.extend(part.split(', '))
    return colls


def coll_cleanforthe(coll):
    """Clean up a collaboration string and try to find an author in it.

    Handles strings such as "John Doe for the ATLAS Collaboration": the
    text is split at the first "for the" / "on behalf of (the)" /
    "representing (the)", one side becomes the collaboration and the other
    the author, reordered to "Doe, John".  Afterwards a leading "the" or
    "for" and the word "collaboration(s)" are removed, a leading
    group/team/consortium is moved behind the name, and the words Group,
    Working Group, Team and Consortium are capitalised.

    Names containing "association for the", "center for the", "institute
    for the" or "for the development" are returned after stripping only.

    :return: ``(collaboration, author)``; ``author`` is ``None`` when no
        author was found.
    """
    author = None

    # replace trailing brackets only if there are leading brackets
    if _RE_BRACKETED.search(coll):
        coll = coll.strip('.; ()')
    else:
        coll = coll.strip('.; ')

    if _RE_FOR_THE.search(coll):
        if _RE_FOR_THE_IN_NAME.search(coll):
            return coll, author

        # strings leading and trailing 'for the'
        leadtext, tailtext = _RE_FOR_THE.split(coll, maxsplit=1)
        lead = leadtext.split(' ') if re.search(r'\w', leadtext) else []
        tail = tailtext.split(' ') if re.search(r'\w', tailtext) else []

        if not tail:
            if not lead:
                # nothing left over
                coll = ''
            else:
                # ATLAS John Doe for the
                coll = lead[0]
                if len(lead) > 1:
                    author = ' '.join(lead[1:])
        else:
            coll = ' '.join(tail)
            if _RE_ENDS_COLL_OR_TEAM.search(coll):
                # John Doe for the ATLAS Collaboration
                if lead:
                    author = ' '.join(lead)
            elif len(lead) > 1:
                # Collaboration John Doe for the ATLAS
                # John Doe for ATLAS
                author = _RE_COLL.sub('', ' '.join(lead))
            elif lead:
                # Collaboration for the ATLAS
                coll = coll + ' ' + lead[0]

        if author:
            # John Doe -> Doe, John
            author = _RE_AUTHOR_SWAP.sub(r'\2, \1', author)

    coll = _RE_THE.sub('', coll)
    coll = _RE_START.sub(r'\2 \1', coll)
    coll = _RE_COLL.sub('', coll)
    coll = _RE_FOR.sub('', coll)
    coll = _RE_GROUP.sub(' Group', coll)
    coll = _RE_WGROUP.sub(' Working Group', coll)
    coll = _RE_TEAM.sub(' Team', coll)
    coll = _RE_CONSORTIUM.sub(' Consortium', coll)
    return coll, author


def coll_cleansimple(value):
    """Unify case, get rid of hyphens, bring the collaboration to the front.

    *value* is expected to carry a leading and a trailing space, as
    prepared by :func:`coll_clean710`; every pattern anchors on them.
    For each known collaboration the three rules are applied in the order
    front, start, extent; for each known sub-collaboration in the order
    start, roman numeral, extent.
    """
    for rules in _COLL_RULES:
        for pattern, replacement in rules:
            value = pattern.sub(replacement, value)
    for rules in _SUBCOLL_RULES:
        for pattern, replacement in rules:
            value = pattern.sub(replacement, value)
    return value


def coll_clean710(value):
    """Normalise the spelling of one collaboration name.

    Applies the fixed list of special-case rewrites (D0, Fermi-LAT, PANDA,
    BaBar, LHCb, R&D, HESS, WASA variants, ...), then
    :func:`coll_cleansimple`, and finally collapses runs of spaces and
    strips the result.  The result is plain text: an ``&`` stays ``&`` so
    that it can be compared with stored values; XML escaping is done where
    the XML is written.
    """
    # to make things easier, add leading and trailing space
    value = ' %s ' % value
    for pattern, replacement in _CLEAN710_RULES:
        value = pattern.sub(replacement, value)
    value = coll_cleansimple(value)
    value = _RE_SPACES.sub(' ', value)
    return value.strip()