Skip to content
This repository has been archived by the owner on Sep 20, 2021. It is now read-only.

draft version of collaboration cleanup (710) #86

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions miscutil/lib/collclean_cron.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import os
import re
import codecs
import datetime

from collclean_lib import coll_cleanforthe
from collclean_lib import coll_clean710
from collclean_lib import coll_split

# Collaboration names regarded as known: main() appends the 710__g values
# of the Experiments records to this list, and coll_check() looks names up
# in it. This should come from a KB.
expcoll = []

def coll_check(colls, logtext):
    """Sanity-check cleaned collaboration names.

    Each name that still contains 'coll' or 'borati' (a possibly misspelled
    "collaboration") is reported in the log, unless it contains 'collider'
    or 'college'.  When more than one name is given, every name has to be
    present in the module-level ``expcoll`` list; otherwise the split is
    rejected.

    Returns a tuple ``(colls, logtext)`` where ``colls`` is the unchanged
    input list, or ``None`` when the split was rejected.
    """
    for name in colls:
        lowered = name.lower()
        # misspelled collaboration?
        looks_like_leftover = 'coll' in lowered or 'borati' in lowered
        is_harmless = 'collider' in lowered or 'college' in lowered
        if looks_like_leftover and not is_harmless:
            logtext += 'COLL?? %s ' % name

    # for split colls: is every string listed in EXP 710__g?
    if len(colls) > 1 and any(name not in expcoll for name in colls):
        colls = None
        logtext += ' Dont dare to change\n'
    return colls, logtext

def coll_splitandclean(recid, value, logtext):
    """Split a 710__g value into collaborations and clean each one.

    Every piece returned by coll_split() is passed through
    coll_cleanforthe() and coll_clean710().  When coll_cleanforthe()
    reports an author, that is noted in the log together with the piece
    it was found in.  The cleaned names are finally vetted by
    coll_check().

    Returns the ``(colls, logtext)`` tuple produced by coll_check().
    """
    cleaned = []
    for piece in coll_split(value):
        name, person = coll_cleanforthe(piece)
        cleaned.append(coll_clean710(name))
        if not person:
            continue
        logtext += '%09i found author: %s\n' % (recid,person)
        logtext += ' in: %s\n' % piece
    return coll_check(cleaned, logtext)

def writexml(recid, mark, changed):
    """Render one 710 field as MARCXML, cleaning its collaboration names.

    Parameters:
        recid   -- record id, used only for log lines.
        mark    -- the field as a sequence whose element 0 is the list of
                   (code, value) subfields and whose elements 1 and 2 are
                   the two indicators.
        changed -- incoming "record was modified" flag; it is returned
                   unchanged unless this field is modified.

    'g' subfields of fields with blank indicators are run through
    coll_splitandclean().  The first resulting name replaces the subfield
    value; every further name is emitted as an additional 710 field with
    a single 'g' subfield.  Fields carrying a ``9`` subfield equal to
    CURATOR (case-insensitive, stripped) are logged and copied verbatim.

    Returns a tuple ``(xmltext, logtext, changed)``.
    """
    from xml.sax.saxutils import escape

    subfields = mark[0]
    ind1 = mark[1]
    ind2 = mark[2]
    extracolls = []
    logtext = ''

    curator = any(field[0] == '9' and field[1].upper().strip() == 'CURATOR'
                  for field in subfields)
    if curator:
        logtext += '%09i Skipping %s\n' % (recid, mark)

    open_tag = ' <datafield tag="710" ind1="%s" ind2="%s">\n' % (ind1, ind2)
    cleanable = ind1 == ' ' and ind2 == ' ' and not curator

    xmlparts = [open_tag]
    for field in subfields:
        subf = field[0]
        value = field[1]
        if cleanable and subf == 'g':
            logtext += '%09i %s\n' % (recid, value)
            (colls, logtext) = coll_splitandclean(recid, value, logtext)
            if colls:
                if colls[0] != value.strip():
                    changed = True
                    for coll in colls:
                        logtext += ' %s\n' % coll
                value = colls[0]
                # Accumulate: a later 'g' subfield must not discard the
                # extra collaborations found in an earlier one.
                extracolls.extend(colls[1:])

        # Values are raw text; escape &, < and > so the output stays
        # well-formed XML.
        xmlparts.append(' <subfield code="%s">%s</subfield>\n'
                        % (subf, escape(value)))
    xmlparts.append(' </datafield>\n')

    for value in extracolls:
        # Extra collaborations always belong in subfield 'g', regardless of
        # which subfield happened to be the last one of the original field.
        xmlparts.append(open_tag)
        xmlparts.append(' <subfield code="g">%s</subfield>\n' % escape(value))
        xmlparts.append(' </datafield>\n')
    return (''.join(xmlparts), logtext, changed)


def _send_log_mail(subject, logtext):
    """Pipe *logtext* to the local ``mail`` command under *subject*."""
    import subprocess
    import sys

    if not isinstance(logtext, bytes):
        logtext = logtext.encode('utf8')
    try:
        # Argument list instead of a shell string: the log contains record
        # data, and quotes, backticks or '$' in it must not reach a shell.
        proc = subprocess.Popen(['mail', '-s', subject, '[email protected]'],
                                stdin=subprocess.PIPE)
    except OSError as err:
        # Mailing is best effort; report the problem instead of crashing.
        sys.stderr.write('Cant run mail: %s\n' % err)
        return
    proc.communicate(logtext)


def main():
    """Clean up 710__g collaboration names and write a correction file.

    Steps:
      * fill the module-level ``expcoll`` list with the 710__g values of
        all records in the "Experiments" collection;
      * select HEP records that either match the ``dadd`` range covering
        the last 10 days and have a 710__g, or whose 710__g matches
        'collaboration';
      * run every 710 field of those records through writexml() and
        collect the records in which something changed;
      * write the result to ``coll_<YYYYMMDD>.correct`` and mail the log.

    If the output file cannot be opened, the log (including that error)
    is mailed and nothing else is done.
    """
    from invenio.search_engine import get_collection_reclist
    from invenio.search_engine import search_pattern
    from invenio.search_engine import get_record
    from invenio.search_engine import get_fieldvalues
    global expcoll

    stopdate = datetime.datetime.now()
    startdate = stopdate + datetime.timedelta(days=-10)
    filedate = '%4d%02d%02d' % (stopdate.year, stopdate.month, stopdate.day)
    stampofstopdate = '%4d-%02d-%02d' % (stopdate.year, stopdate.month, stopdate.day)
    stampofstartdate = '%4d-%02d-%02d' % (startdate.year, startdate.month, startdate.day)
    mail_subject = 'CollClean %s' % filedate

    exp = get_collection_reclist("Experiments")
    hep = get_collection_reclist("HEP")

    # All records with a non-empty 710__g; those in Experiments define the
    # list of known collaboration names.
    recall = search_pattern(p="710__g:/^./")
    for rec in recall.intersection(exp):
        expcoll += get_fieldvalues(rec, '710__g')

    reccoll = search_pattern(p="710__g:'collaboration'")
    recids = search_pattern(p="dadd:%s->%s" % (stampofstartdate,stampofstopdate))
    recids = recids.intersection(recall)
    recids = recids.union(reccoll)
    recids = recids.intersection(hep)

    logtext = ''
    xmlpath = "/afs/desy.de/user/l/library/dok/inspire/correct/"
    filename = '%scoll_%s.correct' % (xmlpath,filedate)
    try:
        filexml = codecs.EncodedFile(codecs.open(filename, 'w'), 'utf8')
    except IOError:
        logtext += 'Cant open file %s\n' % filename
        # Report the failure; previously this message was silently dropped.
        _send_log_mail(mail_subject, logtext)
        return

    try:
        xmlparts = ['<collection>\n']
        logtext += 'Processing coll_%s\n' % filedate
        for rec in recids:
            changed = False
            xmltext = '<record>\n <controlfield tag="001">%i</controlfield>\n' % rec
            record = get_record(rec) or {}
            # A record may lack a 710 field altogether; treat it as empty.
            for mark in record.get('710') or []:
                (thisxml, thislog, thischange) = writexml(rec, mark, changed)
                logtext += thislog
                if thischange:
                    changed = True
                xmltext += thisxml
            if changed:
                xmltext += '</record>\n'
                xmlparts.append(xmltext)
            else:
                logtext += ' unchanged\n'
        xmlparts.append('</collection>\n')
        filexml.write(''.join(xmlparts))
    finally:
        # Close the file even when processing a record raises.
        filexml.close()

    _send_log_mail(mail_subject, logtext)


# Run the cleanup only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
159 changes: 159 additions & 0 deletions miscutil/lib/collclean_lib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import os
import re
import codecs

def coll_split(value):
    """Split a collaboration string at ' and ' and at ', '.

    Returns the list of pieces in their original order; a string containing
    neither separator comes back as a one-element list.
    """
    return [piece
            for chunk in value.split(' and ')
            for piece in chunk.split(', ')]

def coll_cleanforthe(coll):
    """Clean up a collaboration string and try to find an author in it.

    The string is stripped of surrounding '.', ';' and blanks (and of
    brackets, if it is bracketed as a whole).  If it contains a connector
    such as "for the" or "on behalf of", the text around the connector is
    used to tell the collaboration from an author name.  Names containing
    "ASSOCIATION FOR THE", "CENTER FOR THE", "INSTITUTE FOR THE" or
    "FOR THE DEVELOPMENT" are returned right after stripping.  Finally a
    leading "the"/"for" and the word "collaboration(s)" are removed, a
    leading group/team/consortium is moved to the end, and the words
    Group, Working Group, Team and Consortium are capitalised.

    Returns a tuple ``(coll, author)``; ``author`` is ``None`` when no
    author was found, otherwise it is given as "Last, First".
    """
    ignore_case = re.IGNORECASE
    connector = re.compile(r'(?:^| )+(?:for the|on behalf of the|on behalf of|representing the|representing)(?: |$)+', ignore_case)
    coll_word = re.compile(r'(?:^| |\/|-)+collaborations?\.?(?=\W|$)', ignore_case)
    # Applied in exactly this order once the author has been separated.
    final_cleanup = (
        (re.compile(r'^ *the +', ignore_case), ''),
        (re.compile(r'^ *(group|team|consortium) +(.*) *$', ignore_case), r'\2 \1'),
        (coll_word, ''),
        (re.compile(r'^ *for +', ignore_case), ''),
        (re.compile(r'(?:^| |\/|-)+group(?=\W|$)', ignore_case), ' Group'),
        (re.compile(r'(?:^| |\/|-)+working group(?=\W|$)', ignore_case), ' Working Group'),
        (re.compile(r'(?:^| |\/|-)+team(?=\W|$)', ignore_case), ' Team'),
        (re.compile(r'(?:^| |\/|-)+consortium(?=\W|$)', ignore_case), ' Consortium'),
    )
    author = None

    # replace trailing brackets only if there are leading brackets
    bracketed = re.search(r'^ *\(.*\) *$', coll)
    coll = coll.strip('.; ()' if bracketed else '.; ')

    if connector.search(coll):
        # Institution-like names contain the connector legitimately.
        for phrase in ('ASSOCIATION FOR THE', 'CENTER FOR THE',
                       'INSTITUTE FOR THE', 'FOR THE DEVELOPMENT'):
            if re.search(phrase, coll, flags=ignore_case):
                return coll, author

        # get strings leading and trailing 'for the'
        before, after = connector.split(coll, maxsplit=1)
        lead = re.split(' ', before) if re.search(r'\w', before) else []
        tail = re.split(' ', after) if re.search(r'\w', after) else []

        if tail:
            coll = ' '.join(tail)
            ends_like_coll = (
                re.search(r'collaboration$', coll, flags=ignore_case) or
                re.search(r' team$', coll, flags=ignore_case))
            if ends_like_coll:
                # John Doe for the ATLAS Collaboration
                if lead:
                    author = ' '.join(lead)
            elif len(lead) > 1:
                # Collaboration John Doe for the ATLAS
                # John Doe for ATLAS
                author = coll_word.sub('', ' '.join(lead))
            elif lead:
                # Collaboration for the ATLAS
                coll = coll + ' ' + lead[0]
        elif lead:
            # ATLAS John Doe for the
            coll = lead[0]
            if len(lead) > 1:
                author = ' '.join(lead[1:])
        else:
            # nothing left over
            coll = ''

        if author:
            # John Doe -> Doe, John
            author = re.sub(r'^ *([\w.-]+) (.+)$', r'\2, \1', author)

    for pattern, replacement in final_cleanup:
        coll = pattern.sub(replacement, coll)
    return coll, author

def coll_cleansimple(value):
    """Unify case, get rid of hyphens, bring the collaboration to the front.

    ``value`` is expected to carry a leading and a trailing blank, because
    all patterns anchor on the surrounding spaces.  For every known
    collaboration name the spelling (case) is normalised, a name found
    later in the string is moved to the front, and a short suffix of one
    or two characters is attached with '-'.  For the collaborations listed
    with sub-experiments, runs of 'I' are attached with the configured
    joiner (e.g. 'Belle-II', 'BESIII').

    Returns the rewritten string.
    """
    canonical_names = [
        'ATLAS', 'CALICE', 'ALICE', 'CMS', 'CDF', 'LHCb', 'LHCf', 'H1',
        'ZEUS', 'CLEO', 'HERMES', 'HERA-B', 'ALEPH', 'DELPHI', 'OPAL', 'L3',
        'CosmoALEPH', 'SLD', 'AMS', 'BTeV', 'BaBar', 'RHIC', 'NuSTAR', 'PHENIX',
        'STAR', 'BooNE', 'MiniBooNE', 'MicroBooNE', 'SciBooNE', 'CAST',
        'CELSIUS', 'CERES', 'CMD', 'CTA', 'GERDA', 'K2K', 'T2K', 'MAGIC',
        'NuTeV', 'Planck', 'PANDA', 'Hyper-Kamiokande', 'Super-Kamiokande',
        'KLOE', 'KM3NeT', 'NEMO', 'Swift', 'IceCube', 'ARGUS', 'CUORE',
        'CUORICINO', 'DarkSide', 'Daya Bay', 'Fermi-LAT', 'GLAST', 'KASCADE',
        'VERITAS', 'VIRGO', 'Pierre Auger', 'Majorana', 'MINERvA', 'MINOS',
        'Muon g-2', 'XENON', 'Muon Collider', 'Linear Collider', 'European Muon']
    # name -> string put between the name and a roman-numeral suffix
    joiners = {'Belle':'-', 'BES':'', 'CDF':'-', 'Kamiokande':'-', 'CLEO':'-'}
    nocase = re.IGNORECASE

    for name in canonical_names:
        # bring the name to the front
        value = re.sub(r'^ +(.+)[ -]+%s +' % name, r' %s \1 ' % name,
                       value, flags=nocase)
        # correct spelling (case), get rid of trailing separators
        value = re.sub(r' +%s[ \/-]+' % name, r' %s ' % name,
                       value, flags=nocase)
        # short suffix: attach with '-'
        value = re.sub(r' +%s[ \/-]*([0-9]?[A-Z0-9]) ' % name,
                       r' %s-\1 ' % name, value, flags=nocase)

    for name, joiner in joiners.items():
        # correct spelling(case), get rid of '-'
        value = re.sub(r' +%s[ \/-]+' % name, r' %s ' % name,
                       value, flags=nocase)
        # deal with roman numbering
        value = re.sub(r' +%s[ \/-]*(I+) ' % name,
                       r' %s%s\1' % (name, joiner), value, flags=nocase)
        # if only one trailing character, use '-'
        value = re.sub(r' +%s[ \/-]+([A-Z0-9]) ' % name, r' %s-\1 ' % name,
                       value, flags=nocase)
    return value

def coll_clean710(value):
    """Normalise the spelling of a collaboration name for 710__g.

    The value is padded with one blank on each side so that the patterns
    can anchor on spaces, then a fixed sequence of spelling fixes is
    applied (D0, Fermi-LAT, GLAST LAT, PANDA, Double Chooz, Daya Bay,
    BaBar, LHCx, R&D, HESS, the WASA variants, CERES/NA, EHS/NA), followed
    by coll_cleansimple().  Runs of blanks are collapsed and the result is
    stripped.

    Returns the cleaned string.
    """
    # to make things easier, add leading and trailing space
    value = ' %s ' % value

    # (pattern, flags, replacement) -- applied strictly in this order
    rewrites = (
        (r' (?:DO|DØ) ', 0, ' D0 '),
        (r' DZero ', re.IGNORECASE, ' D0 '),
        (r' +Fermi[ \/-](?:LAT|Large[ -]Area[ -]Telescope) ', re.IGNORECASE, r' Fermi-LAT '),
        (r' +GLAST[ \/-](?:LAT|Large[ -]Area[ -]Telescope) ', re.IGNORECASE, r' GLAST LAT '),
        (r' \W*(?:bar|overline)\W*P\W*ANDA\W* ', re.IGNORECASE, r' PANDA '),
        (r' Double[ \/-]Chooz ', re.IGNORECASE, r' Double Chooz '),
        (r' Daya[ \/-]Bay ', re.IGNORECASE, r' Daya Bay '),
        (r'\$B\W*small A}B\W*small AR}\$', 0, 'BaBar'),
        (r' +LHC[ \/-]*([a-z])[ \/-]+', 0, r' LHC\1 '),
        (r' R.and.D ', 0, ' R&D '),
        (r' H\. ?E\. ?S\. ?S\.? +', 0, ' HESS '),
        (r' PROMICE[ \/-]WASA ', 0, ' PROMICE/WASA '),
        (r' WASA[ \/-]PROMICE ', 0, ' PROMICE/WASA '),
        (r' CELSIUS[ \/-]WASA ', 0, ' CELSIUS/WASA '),
        (r' WASA[ \/-]*[aA][tT][\/-]*COSY ', 0, ' WASA-at-COSY '),
        (r' CERES[ \/-]NA', 0, ' CERES/NA'),
        (r' EHS[ \/-]NA', 0, ' EHS/NA'),
    )
    for pattern, flags, replacement in rewrites:
        value = re.sub(pattern, replacement, value, flags=flags)

    value = coll_cleansimple(value)
    value = re.sub(r' +',' ',value).strip()
    # replace & for xml output
    # NOTE(review): u'\u0026' is the ampersand itself, so this substitution
    # leaves the text unchanged; it does not produce an XML entity.
    return re.sub('&',u'\u0026',value)