inspirehep · ksachs · May 4, 2015
diff --git a/miscutil/lib/collclean_cron.py b/miscutil/lib/collclean_cron.py
@@ -0,0 +1,150 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+import os
+import re
+import codecs
+import datetime
+
+from collclean_lib import coll_cleanforthe
+from collclean_lib import coll_clean710
+from collclean_lib import coll_split
+
+# Collaboration names (710__g values) of records in the Experiments
+# collection.  Filled by main() and read by coll_check() to validate the
+# parts of a split collaboration string.
+# TODO: this should come from a KB (presumably a knowledge base) instead of
+# a module-level global.
+expcoll = []
+
+def coll_check(colls, logtext):
+    """Sanity-check a list of cleaned collaboration names.
+
+    Names that still contain 'coll' or 'borati' (but neither 'collider'
+    nor 'college') are reported in the log as a possibly misspelled
+    'collaboration'.  When there is more than one name, i.e. the original
+    value was split, every part has to be listed in the module-level
+    ``expcoll``; otherwise the split is rejected.
+
+    Returns the tuple ``(colls, logtext)``: ``colls`` is the input list, or
+    None when the split was rejected; ``logtext`` is the input log with
+    any messages appended.
+    """
+    for name in colls:
+        lowered = name.lower()
+        looks_like_coll = 'coll' in lowered or 'borati' in lowered
+        is_harmless = 'collider' in lowered or 'college' in lowered
+        if looks_like_coll and not is_harmless:
+            logtext += 'COLL?? %s ' % name
+
+    # only split values are verified against the names listed in EXP 710__g
+    if len(colls) > 1 and any(name not in expcoll for name in colls):
+        colls = None
+        logtext += '          Dont dare to change\n'
+    return colls, logtext
+
+def coll_splitandclean(recid, value, logtext):
+    """Split one 710__g value into collaborations and clean each part.
+
+    Every part returned by coll_split() is passed through
+    coll_cleanforthe() and coll_clean710().  If coll_cleanforthe() reports
+    an author, the author and the original part are noted in the log.  The
+    cleaned names are finally validated with coll_check().
+
+    Returns the tuple ``(colls, logtext)`` as delivered by coll_check().
+    """
+    cleaned = []
+    for part in coll_split(value):
+        name, author = coll_cleanforthe(part)
+        cleaned.append(coll_clean710(name))
+        if author:
+            logtext += '%09i found author: %s\n' % (recid, author)
+            logtext += '           in: %s\n' % part
+    checked, logtext = coll_check(cleaned, logtext)
+    return checked, logtext
+
+def writexml(recid, mark, changed):
+    """Build the corrected MARCXML for one 710 field of a record.
+
+    ``mark`` is one field instance: ``mark[0]`` is the list of subfields
+    (each indexable as ``[code, value]``), ``mark[1]`` and ``mark[2]`` are
+    the two indicators.  Only subfield g of a field with blank indicators
+    is cleaned; a field carrying a subfield 9 with value CURATOR is
+    reported as skipped and written out unmodified.
+
+    When a g value is split into several collaborations, the first one
+    stays in this field and each further one gets its own 710 field with
+    a single subfield g.
+
+    ``changed`` is the flag accumulated so far for the record; it is
+    switched to True when the first cleaned name differs from the stripped
+    original value, and never switched back.
+
+    Returns the tuple ``(xmltext, logtext, changed)``.
+    """
+    # local import: the module header does not provide an XML escaper
+    from xml.sax.saxutils import escape
+
+    subfields = mark[0]
+    ind1 = mark[1]
+    ind2 = mark[2]
+    open_tag = '  <datafield tag="710" ind1="%s" ind2="%s">\n' % (ind1, ind2)
+    close_tag = '  </datafield>\n'
+    subfield_line = '    <subfield code="%s">%s</subfield>\n'
+
+    logtext = ''
+    extracolls = []
+    curator = any(field[0] == '9' and field[1].upper().strip() == 'CURATOR'
+                  for field in subfields)
+    if curator:
+        logtext += '%09i Skipping %s\n' % (recid, mark)
+    clean_g = ind1 == ' ' and ind2 == ' ' and not curator
+
+    xmltext = open_tag
+    for field in subfields:
+        subf = field[0]
+        value = field[1]
+        if clean_g and subf == 'g':
+            logtext += '%09i %s\n' % (recid, value)
+            (colls, logtext) = coll_splitandclean(recid, value, logtext)
+            if colls:
+                if colls[0] != value.strip():
+                    changed = True
+                    for coll in colls:
+                        logtext += '          %s\n' % coll
+                value = colls[0]
+                # accumulate, so a second g subfield does not discard the
+                # extra collaborations found in an earlier one
+                extracolls.extend(colls[1:])
+        # values are raw text; '&', '<' and '>' must be escaped for XML
+        xmltext += subfield_line % (subf, escape(value))
+    xmltext += close_tag
+
+    # extra collaborations always go to subfield g, independent of which
+    # subfield happened to be the last one of the original field
+    for value in extracolls:
+        xmltext += open_tag + subfield_line % ('g', escape(value)) + close_tag
+    return (xmltext, logtext, changed)
+
+
+def main():
+    """Clean the collaboration names (710__g) of recent HEP records.
+
+    Steps:
+      * fill the module-level ``expcoll`` with the 710__g values of all
+        records in the Experiments collection,
+      * select HEP records that were added within the last 10 days and
+        have a 710__g, plus HEP records matching 710__g:'collaboration',
+      * write a <collection> of correction records to
+        ``coll_<date>.correct`` below ``xmlpath``,
+      * mail the processing log.
+
+    A record is written only if at least one of its 710 fields changed;
+    it then contains ALL 710 fields of the record.
+    """
+    from invenio.search_engine import get_collection_reclist
+    from invenio.search_engine import search_pattern
+    from invenio.search_engine import get_record
+    from invenio.search_engine import get_fieldvalues
+
+    stopdate = datetime.datetime.now()
+    startdate = stopdate - datetime.timedelta(days=10)
+    filedate = stopdate.strftime('%Y%m%d')
+    stampofstopdate = stopdate.strftime('%Y-%m-%d')
+    stampofstartdate = startdate.strftime('%Y-%m-%d')
+
+    exp = get_collection_reclist("Experiments")
+    hep = get_collection_reclist("HEP")
+
+    # all records with a non-empty 710__g; those in Experiments define
+    # the list of known collaboration names
+    recall = search_pattern(p="710__g:/^./")
+    for rec in recall.intersection(exp):
+        expcoll.extend(get_fieldvalues(rec, '710__g'))
+
+    reccoll = search_pattern(p="710__g:'collaboration'")
+    recids = search_pattern(p="dadd:%s->%s" % (stampofstartdate, stampofstopdate))
+    recids = recids.intersection(recall)
+    recids = recids.union(reccoll)
+    recids = recids.intersection(hep)
+
+    mail_subject = 'CollClean %s' % filedate
+    logtext = ''
+    xmlpath = "/afs/desy.de/user/l/library/dok/inspire/correct/"
+    filename = '%scoll_%s.correct' % (xmlpath, filedate)
+    try:
+        filexml = codecs.EncodedFile(codecs.open(filename, 'w'), 'utf8')
+    except IOError:
+        logtext += 'Cant open file %s\n' % filename
+        # report the failure instead of dropping the log silently
+        _mail_log(mail_subject, logtext)
+        return
+
+    logtext += 'Processing coll_%s\n' % filedate
+    xmlparts = ['<collection>\n']
+    try:
+        for rec in recids:
+            changed = False
+            fieldxml = []
+            # a record without any 710 field yields no entry at all
+            for mark in get_record(rec).get('710') or []:
+                (thisxml, thislog, changed) = writexml(rec, mark, changed)
+                logtext += thislog
+                # keep every field: presumably the upload in correct mode
+                # replaces all existing 710 fields by the ones given here,
+                # so unchanged fields must not be left out - TODO confirm
+                fieldxml.append(thisxml)
+            if changed:
+                xmlparts.append(
+                    '<record>\n  <controlfield tag="001">%i</controlfield>\n' % rec)
+                xmlparts.extend(fieldxml)
+                xmlparts.append('</record>\n')
+            else:
+                logtext += '          unchanged\n'
+        xmlparts.append('</collection>\n')
+        filexml.write(''.join(xmlparts))
+    finally:
+        filexml.close()
+
+    _mail_log(mail_subject, logtext)
+
+
+def _mail_log(subject, logtext):
+    """Pipe the log to the ``mail`` command without going through a shell."""
+    import subprocess
+    # NOTE(review): recipient copied verbatim from the original code; it
+    # looks like an obfuscated placeholder - confirm the real address.
+    recipient = '[email protected]'
+    if not isinstance(logtext, bytes):
+        logtext = logtext.encode('utf-8')
+    # argument list + stdin: quotes, '$' or backticks in record metadata
+    # can neither break the command line nor be interpreted by a shell
+    mailer = subprocess.Popen(['mail', '-s', subject, recipient],
+                              stdin=subprocess.PIPE)
+    mailer.communicate(logtext)
+
+
+# run the clean-up only when executed as a script, not on import
+if __name__ == "__main__":
+        main()
diff --git a/miscutil/lib/collclean_lib.py b/miscutil/lib/collclean_lib.py
@@ -0,0 +1,159 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+import os
+import re
+import codecs
+
+def coll_split(value):
+    """Split a collaboration string at ' and ' and ', '.
+
+    Both separators are matched literally (case-sensitive, with the
+    surrounding blanks).  Returns the list of parts in their original
+    order; a value without any separator yields a one-element list.
+    """
+    return [part
+            for chunk in value.split(' and ')
+            for part in chunk.split(', ')]
+
+def coll_cleanforthe(coll):
+    """Clean up a collaboration string and try to find an author in it.
+
+    The string is stripped of surrounding '.', ';' and blanks (and of
+    brackets, if it is completely enclosed in them).  If it contains
+    'for the', 'on behalf of (the)' or 'representing (the)', the text
+    around that phrase is split into a collaboration and, where possible,
+    an author, which is returned as 'rest, first-word'.  Strings naming an
+    'association/center/institute for the ...' or '... for the
+    development' are returned after the stripping only.
+
+    Afterwards a leading 'the' / 'for' and the word 'collaboration(s)' are
+    removed, a leading 'group', 'team' or 'consortium' is moved to the
+    end, and the spelling of 'Group', 'Working Group', 'Team' and
+    'Consortium' is unified.
+
+    Returns the tuple ``(coll, author)``; ``author`` is None if no author
+    was found.
+    """
+    author = None
+    re_for_the = re.compile(r'(?:^| )+(?:for the|on behalf of the|on behalf of|representing the|representing)(?: |$)+', re.IGNORECASE)
+    # names in which 'for the' is part of the name, not a separator
+    re_not_author = re.compile(r'(?:ASSOCIATION|CENTER|INSTITUTE) FOR THE|FOR THE DEVELOPMENT', re.IGNORECASE)
+    re_start = re.compile(r'^ *(group|team|consortium) +(.*) *$', re.IGNORECASE)
+    re_the = re.compile(r'^ *the +', re.IGNORECASE)
+    re_for = re.compile(r'^ *for +', re.IGNORECASE)
+    re_coll = re.compile(r'(?:^| |\/|-)+collaborations?\.?(?=\W|$)', re.IGNORECASE)
+    re_group = re.compile(r'(?:^| |\/|-)+group(?=\W|$)', re.IGNORECASE)
+    re_wgroup = re.compile(r'(?:^| |\/|-)+working group(?=\W|$)', re.IGNORECASE)
+    re_team = re.compile(r'(?:^| |\/|-)+team(?=\W|$)', re.IGNORECASE)
+    re_consortium = re.compile(r'(?:^| |\/|-)+consortium(?=\W|$)', re.IGNORECASE)
+
+    # remove trailing brackets only if there are leading brackets
+    if re.search(r'^ *\(.*\) *$', coll):
+        coll = coll.strip('.; ()')
+    else:
+        coll = coll.strip('.; ')
+
+    if re_for_the.search(coll):
+        if re_not_author.search(coll):
+            return coll, author
+
+        # get strings leading and trailing 'for the'
+        (leading, trailing) = re_for_the.split(coll, maxsplit=1)
+        lead = leading.split(' ') if re.search(r'\w', leading) else []
+        tail = trailing.split(' ') if re.search(r'\w', trailing) else []
+
+        if not tail:
+            if not lead:
+                # nothing left over
+                coll = ''
+            else:
+                # ATLAS John Doe for the
+                coll = lead[0]
+                if len(lead) > 1:
+                    author = ' '.join(lead[1:])
+        else:
+            coll = ' '.join(tail)
+            if re.search(r'collaboration$', coll, flags=re.IGNORECASE) or \
+                    re.search(r' team$', coll, flags=re.IGNORECASE):
+                #   John Doe for the ATLAS Collaboration
+                if lead:
+                    author = ' '.join(lead)
+            elif len(lead) > 1:
+                # Collaboration John Doe for the ATLAS
+                author = re_coll.sub('', ' '.join(lead))
+            elif lead:
+                # Collaboration for the ATLAS
+                coll = coll + ' ' + lead[0]
+
+        if author:
+            # John Doe -> Doe, John
+            author = re.sub(r'^ *([\w.-]+) (.+)$', r'\2, \1', author)
+
+    coll = re_the.sub('', coll)
+    coll = re_start.sub(r'\2 \1', coll)
+    coll = re_coll.sub('', coll)
+    coll = re_for.sub('', coll)
+    # 'group' is handled first; 'working Group' is then picked up by the
+    # case-insensitive re_wgroup
+    coll = re_group.sub(' Group', coll)
+    coll = re_wgroup.sub(' Working Group', coll)
+    coll = re_team.sub(' Team', coll)
+    coll = re_consortium.sub(' Consortium', coll)
+    return coll, author
+
+def coll_cleansimple(value):
+    """Unify case, get rid of hyphens, bring the collaboration to front.
+
+    ``value`` has to carry a leading and a trailing blank: all patterns
+    are anchored on blanks around the collaboration name.
+
+    For every known collaboration the spelling (case) of the name is
+    normalised, a name found behind other words is moved to the front,
+    and a directly following one- or two-character suffix is attached
+    with '-'.  For collaborations with numbered sub-experiments a roman
+    number made of I's is attached with the separator configured in
+    ``knownsubcoll`` (e.g. 'Belle-II', 'BESIII').
+
+    Returns the modified string, still padded with blanks.
+    """
+    knowncoll = ['ATLAS', 'CALICE', 'ALICE', 'CMS', 'CDF', 'LHCb', 'LHCf', 'H1',
+        'ZEUS', 'CLEO', 'HERMES', 'HERA-B', 'ALEPH', 'DELPHI', 'OPAL', 'L3',
+        'CosmoALEPH', 'SLD', 'AMS', 'BTeV', 'BaBar', 'RHIC', 'NuSTAR', 'PHENIX',
+        'STAR', 'BooNE', 'MiniBooNE', 'MicroBooNE', 'SciBooNE', 'CAST',
+        'CELSIUS', 'CERES', 'CMD', 'CTA', 'GERDA', 'K2K', 'T2K', 'MAGIC',
+        'NuTeV', 'Planck', 'PANDA', 'Hyper-Kamiokande', 'Super-Kamiokande',
+        'KLOE', 'KM3NeT', 'NEMO', 'Swift', 'IceCube', 'ARGUS', 'CUORE',
+        'CUORICINO', 'DarkSide', 'Daya Bay', 'Fermi-LAT', 'GLAST', 'KASCADE',
+        'VERITAS', 'VIRGO', 'Pierre Auger', 'Majorana', 'MINERvA', 'MINOS',
+        'Muon g-2', 'XENON', 'Muon Collider', 'Linear Collider', 'European Muon']
+    knownsubcoll = {'Belle':'-', 'BES':'', 'CDF':'-', 'Kamiokande':'-', 'CLEO':'-'}
+    for kc in knowncoll:
+        # escape the name so it is always matched literally
+        name = re.escape(kc)
+        start = re.compile(r' +%s[ \/-]+' % name, re.IGNORECASE)
+        extent = re.compile(r' +%s[ \/-]*([0-9]?[A-Z0-9]) ' % name, re.IGNORECASE)
+        front = re.compile(r'^ +(.+)[ -]+%s +' % name, re.IGNORECASE)
+        value = front.sub(r' %s \1 ' % kc, value)
+        value = start.sub(r' %s ' % kc, value)
+        value = extent.sub(r' %s-\1 ' % kc, value)
+    for kc, letter in knownsubcoll.items():
+        name = re.escape(kc)
+        # correct spelling(case), get rid of '-'
+        start = re.compile(r' +%s[ \/-]+' % name, re.IGNORECASE)
+        # if only one trailing character, use '-'
+        extent = re.compile(r' +%s[ \/-]+([A-Z0-9]) ' % name, re.IGNORECASE)
+        # deal with roman numbering
+        subco = re.compile(r' +%s[ \/-]*(I+) ' % name, re.IGNORECASE)
+        value = start.sub(r' %s ' % kc, value)
+        # the pattern consumes the blank behind the number, so put it back;
+        # otherwise the next word gets glued on ('Belle-IISVD')
+        value = subco.sub(r' %s%s\1 ' % (kc, letter), value)
+        value = extent.sub(r' %s-\1 ' % kc, value)
+    return value
+
+def coll_clean710(value):
+    """Normalise the spelling of a collaboration name for 710__g.
+
+    Applies a fixed list of substitutions for known spelling variants
+    (D0, Fermi-LAT, PANDA, Double Chooz, BaBar, LHCx, R&D, HESS, WASA,
+    ...), then the generic rules of coll_cleansimple(), and finally
+    collapses runs of blanks and strips the result.
+
+    The returned string is plain text: it is NOT escaped for XML, so a
+    caller writing it into XML has to escape '&', '<' and '>' itself.
+    """
+    # to make things easier, add leading and trailing space
+    value = ' %s ' % value
+    re_dzero = re.compile(r' DZero ', re.IGNORECASE)
+    re_do = re.compile(r' (?:DO|DØ) ')
+    re_panda = re.compile(r' \W*(?:bar|overline)\W*P\W*ANDA\W* ', re.IGNORECASE)
+    re_fermilat = re.compile(r' +Fermi[ \/-](?:LAT|Large[ -]Area[ -]Telescope) ', re.IGNORECASE)
+    re_glastlat = re.compile(r' +GLAST[ \/-](?:LAT|Large[ -]Area[ -]Telescope) ', re.IGNORECASE)
+    re_dchooz = re.compile(r' Double[ \/-]Chooz ', re.IGNORECASE)
+    re_dbay = re.compile(r' Daya[ \/-]Bay ', re.IGNORECASE)
+    value = re_do.sub(' D0 ', value)
+    value = re_dzero.sub(' D0 ', value)
+    value = re_fermilat.sub(r' Fermi-LAT ', value)
+    value = re_glastlat.sub(r' GLAST LAT ', value)
+    value = re_panda.sub(r' PANDA ', value)
+    value = re_dchooz.sub(r' Double Chooz ', value)
+    value = re_dbay.sub(r' Daya Bay ', value)
+    value = re.sub(r'\$B\W*small A}B\W*small AR}\$', 'BaBar', value)
+    value = re.sub(r' +LHC[ \/-]*([a-z])[ \/-]+', r' LHC\1 ', value)
+    # the dots are deliberate wildcards: 'R and D', 'R-and-D', ...
+    value = re.sub(r' R.and.D ', ' R&D ', value)
+    value = re.sub(r' H\. ?E\. ?S\. ?S\.? +', ' HESS ', value)
+    value = re.sub(r' PROMICE[ \/-]WASA ', ' PROMICE/WASA ', value)
+    value = re.sub(r' WASA[ \/-]PROMICE ', ' PROMICE/WASA ', value)
+    value = re.sub(r' CELSIUS[ \/-]WASA ', ' CELSIUS/WASA ', value)
+    # allow a blank on both sides of 'at', so 'WASA at COSY' is caught too
+    value = re.sub(r' WASA[ \/-]*[aA][tT][ \/-]*COSY ', ' WASA-at-COSY ', value)
+    value = re.sub(r' CERES[ \/-]NA', ' CERES/NA', value)
+    value = re.sub(r' EHS[ \/-]NA', ' EHS/NA', value)
+    value = coll_cleansimple(value)
+    value = re.sub(r'  +', ' ', value)
+    value = value.strip()
+    # The former "replace & for xml output" step substituted '&' by
+    # u'\u0026', which is '&' again, so it never escaped anything; it has
+    # been dropped.  Escaping belongs to the code that serialises the XML.
+    return value
+