Skip to content

Commit a169b8a

Browse files
committed
Added tools (python, bash) for processing XML files (cf-convention#470)
Regarding cf-convention#469: Just to test the workflow the current XSD link in XML files points to my repo.
1 parent 444f308 commit a169b8a

File tree

6 files changed

+542
-0
lines changed

6 files changed

+542
-0
lines changed

ISSUE-457-TOOLS/COMPACT_ERRORS.py

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import re
2+
import argparse
3+
import numpy as np
4+
5+
6+
def do_the_work(file_name):
    """Print every distinct error once, prefixed by the versions it occurs in.

    The input file is read line by line. A line starting with "/home" is
    taken as a header naming the file being checked; the version number is
    extracted from the ".../names/<version>/src..." part of that path. Every
    other non-empty line is treated as an error message belonging to the
    most recent header; a leading "Line <n> : " is removed so that the same
    error reported at different line numbers is merged.

    Output lines have the form "<v1>, <v2>, ... | <message>" and are printed
    in ascending order of the first version in which the message was seen.

    Args:
        file_name: Path of the text file holding the error listing.

    Raises:
        ValueError: If a header line carries no recognisable version number,
            or if an error line appears before any header line.
    """
    # error message -> versions (as strings, in order of appearance)
    err_dict = {}
    version = None
    with open(file_name, "r") as fh:
        for raw_line in fh:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith("/home"):
                found = re.search(r"(?<=names/)\d{1,2}(?=/src)", line)
                if found is None:
                    raise ValueError(
                        f"Cannot find a version number in header line: {line}"
                    )
                version = found.group()
                continue
            if version is None:
                raise ValueError(
                    f"Error line found before any file header line: {line}"
                )
            line = re.sub(r"Line \d+? : ", "", line)
            err_dict.setdefault(line, []).append(version)

    # first version of each message -> formatted output lines
    out_dict = {}
    for line, version_list in err_dict.items():
        text = f"{', '.join(version_list)} | {line}"
        out_dict.setdefault(int(version_list[0]), []).append(text)

    # Iterate over the versions actually present instead of a fixed range,
    # so that messages first seen in later versions are not dropped.
    for v0 in sorted(out_dict):
        for text in out_dict[v0]:
            print(text)
36+
37+
if __name__ == "__main__":
    # Command-line entry point: read the name of the error listing and
    # print its compacted form.
    parser = argparse.ArgumentParser(
        prog="compact_errors",
        description=("\nCompact error lists to show in which version each error occur.")
    )
    # The file name is mandatory; without it there is nothing to open.
    parser.add_argument("-f", "--file_name", type=str, required=True,
                        help="Name of input error file")
    args = parser.parse_args()

    do_the_work(args.file_name)

ISSUE-457-TOOLS/LIST_ERRORS.py

+154
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import os
2+
import re
3+
import argparse
4+
import urllib.request
5+
from io import BytesIO
6+
7+
from cfunits import Units as cfUnits
8+
from cf_units import Unit as uuUnits
9+
from lxml import etree
10+
11+
12+
13+
# Root directory of the local checkout (machine specific).
PATH0 = "/home/a001257/CODE/"
# Directory holding one sub-directory per standard-name table version.
BASE_PATH = f"{PATH0}cf-conventions/cf-convention.github.io/Data/cf-standard-names/"
15+
16+
def parse_xml(xml_raw):
    """Parse raw XML bytes into an lxml element tree.

    Args:
        xml_raw: The XML document as bytes.

    Returns:
        The tree returned by ``etree.parse``.

    Raises:
        etree.XMLSyntaxError: Re-raised after printing a separator line and
            the first 1000 bytes of the input to help locate the problem.
    """
    stream = BytesIO(xml_raw)
    try:
        return etree.parse(stream)
    except etree.XMLSyntaxError:
        print(f"{':'*100}\n{xml_raw[:1000]}")
        raise
23+
24+
25+
def get_schema(xml_tree):
    """Download and compile the XSD schema referenced by an XML document.

    Args:
        xml_tree: Parsed XML tree whose root element references the schema.

    Returns:
        An ``etree.XMLSchema`` built from the downloaded XSD.
    """
    root = xml_tree.getroot()
    # NOTE(review): assumes the first attribute value of the root element is
    # the schema URI -- confirm against the XML files being checked.
    xsd_uri = root.values()[0]
    # Use the response as a context manager so the connection is closed
    # even if reading fails.
    with urllib.request.urlopen(xsd_uri) as link:
        xsd_raw = link.read()
    xsd_tree = parse_xml(xsd_raw)
    return etree.XMLSchema(xsd_tree)
33+
34+
35+
def find_xml_errors(xml_tree, schema, xml_raw):
    """Validate an XML tree against a schema and print what is wrong.

    When the document validates, a single confirmation line is printed.
    Otherwise one line is printed per entry in ``schema.error_log``. Errors
    whose message mentions an expected ``description`` or
    ``canonical_units`` element are rephrased to name the standard name
    entry concerned; all other errors are printed verbatim.

    Args:
        xml_tree: Parsed XML document.
        schema: Schema object providing ``assertValid`` and ``error_log``.
        xml_raw: The same document as raw bytes, used to look up the text
            of the line an error refers to.
    """
    try:
        schema.assertValid(xml_tree)
        print(" ---- Valid and Well-formed")
    except etree.DocumentInvalid:
        xml_list = xml_raw.split(b"\n")
        for error in schema.error_log:
            missing = next(
                (
                    element
                    for element in ("description", "canonical_units")
                    if f"( {element}" in error.message
                ),
                None,
            )
            std_name = None
            if missing is not None:
                try:
                    # The id is taken as the first double-quoted string on
                    # the line the error was reported for.
                    std_name = xml_list[error.line - 1].split(b'"')[1].decode("utf-8")
                except (IndexError, UnicodeDecodeError):
                    # No usable id on that line: fall back to the raw message
                    # instead of aborting the whole report.
                    std_name = None
            if std_name is None:
                print(f"Line {error.line} : {error.message}")
            else:
                print(f"Line {error.line} : Standard name entry for '{std_name}' has no <{missing}>")
49+
50+
51+
def check_units(can_units, std_name):
    """Print a diagnostic if a canonical units string looks problematic.

    The string is first checked with ``cfUnits`` (``cfunits.Units``) and, if
    that reports it valid, constructed with ``uuUnits`` (``cf_units.Unit``).
    Nothing is printed when both accept the string and it contains neither
    " -" nor "/".

    Args:
        can_units: Canonical units string of a standard name entry.
        std_name: The standard name the units belong to (used in messages).
    """
    if not cfUnits(can_units).isvalid:
        print(f"Canonical units '{can_units}' is not accepted by CF-UNITS for '{std_name}'")
        return

    try:
        uuUnits(can_units)
    except ValueError:
        print(f"Canonical unit '{can_units}' is a special CF unit for '{std_name}'")
        return

    if " -" in can_units:
        # A space before a minus sign: see whether the string is still
        # accepted once that space is removed.
        try:
            uuUnits(can_units.replace(" -", "-"))
        except ValueError:
            print(f"Canonical unit '{can_units}' is really weird for '{std_name}'")
        else:
            print(f"Canonical units '{can_units}' has a spurious space for '{std_name}'")
    elif "/" in can_units:
        print(f"Canonical units '{can_units}' used '/' for '{std_name}'")
68+
69+
70+
def find_missing_and_duplicates(xml_raw, old_entry_list, old_alias_list):
    """Report inconsistencies between entries and aliases across versions.

    The entries and aliases of the current document are extracted with
    regular expressions and compared with those of the previous version.
    A message is printed for each alias that

    * is also defined as an entry in the current document,
    * was neither an entry nor an alias in the previous version, or
    * points to a name that is not an entry in the current document,

    and for each previous entry that is now neither an entry nor an alias.
    Canonical units of every entry that has them are passed to
    ``check_units``.

    Args:
        xml_raw: The current standard name table as raw bytes.
        old_entry_list: Standard names defined in the previous version.
        old_alias_list: Alias names defined in the previous version.

    Returns:
        Tuple ``(new_entry_list, new_alias_list)`` for the current document;
        entries in document order, aliases sorted.
    """
    def _extract_entries(xml_raw):
        # Collect the id of every <entry> and check its canonical units.
        entry_list = []
        for entry in re.finditer(rb'<entry id=\".+?\">.+?</entry>', xml_raw, re.S):
            e = re.search(rb'(?<=\").+?(?=\")', entry.group())
            std_name = e.group().decode("utf-8")
            entry_list.append(std_name)
            can_units = re.search(rb'(?<=_units>).+?(?=</canonical)', entry.group())
            if can_units:
                can_units = can_units.group().decode("utf-8")
                check_units(can_units, std_name)
        return entry_list

    def _extract_aliases(xml_raw):
        # Map every <alias> id to the content of its <entry_id> element.
        alias_dict = {}
        for alias in re.finditer(rb'<alias id=.+?</alias>', xml_raw, re.S):
            alias_from = re.search(rb'(?<=\").+?(?=\")', alias.group())
            alias_to = re.search(rb'(?<=entry_id>).+?(?=</entry_id)', alias.group())
            alias_to = alias_to.group().decode("utf-8")
            alias_from = alias_from.group().decode("utf-8")
            alias_dict[alias_from] = alias_to
        return alias_dict

    new_entry_list = _extract_entries(xml_raw)
    alias_dict = _extract_aliases(xml_raw)
    new_alias_list = sorted(alias_dict)

    # Sets built once so each membership test below is O(1).
    new_entries = set(new_entry_list)
    previously_known = set(old_entry_list) | set(old_alias_list)

    for alias_from, alias_to in alias_dict.items():
        if alias_from in new_entries:
            print(f"Both defining and aliasing standard name '{alias_from}' into '{alias_to}'")
        elif alias_from not in previously_known:
            print(f"Aliasing the undefined standard name '{alias_from}' into '{alias_to}'")
        elif alias_to not in new_entries:
            print(f"Aliasing standard name '{alias_from}' into the non-existing '{alias_to}'")

    discontinued = set(old_entry_list) - (new_entries | set(new_alias_list))
    for s in sorted(discontinued):
        print(f"Standard name '{s}' is discontinued")
    return new_entry_list, new_alias_list
107+
108+
109+
def do_the_work(version, severity, entry_list, alias_list):
    """Run the requested checks on one version of the standard name table.

    Args:
        version: Version number; selects the sub-directory below BASE_PATH.
        severity: 0 runs the XML schema check only, 1 runs the entry/alias
            checks only, any other non-zero value runs both.
        entry_list: Standard names of the previous version.
        alias_list: Alias names of the previous version.

    Returns:
        Tuple ``(entry_list, alias_list)``: the lists for this version when
        the entry/alias checks ran, otherwise the lists passed in.
    """
    xml_file = f"{BASE_PATH}{version}/src/cf-standard-name-table.xml"
    with open(xml_file, "rb") as fh:
        xml_raw = fh.read()
    print(xml_file)

    # Always parse, so that malformed XML is reported at every severity.
    xml_tree = parse_xml(xml_raw)
    if severity != 1:
        # The schema is only needed for the XML check, so it is only
        # downloaded when that check is going to run.
        schema = get_schema(xml_tree)
        find_xml_errors(xml_tree, schema, xml_raw)
    if severity:
        entry_list, alias_list = find_missing_and_duplicates(xml_raw, entry_list, alias_list)
    return entry_list, alias_list
122+
123+
124+
if __name__ == "__main__":
    # Command-line entry point: check one or a range of table versions.
    import sys

    parser = argparse.ArgumentParser(
        prog="LIST_ERRORS",
        description=("\nList (most) XML errors in standard name files.")
    )
    parser.add_argument("-v", "--version", type=int, default=0,
                        help="Check a specific version (default is 0 (='all').")
    parser.add_argument("-s", "--severity", type=int, default=0,
                        help="Level of error checks (0=xml (default), 1=CF, 2=both.")
    args = parser.parse_args()
    severity = args.severity

    if args.version == 0:
        version_list = range(1, 100)
    elif severity > 0:
        # The entry/alias checks compare each version with the previous one,
        # so every version up to the requested one has to be processed.
        version_list = range(1, args.version + 1)
    else:
        version_list = [args.version]

    entry_list = []
    alias_list = []
    for version in version_list:
        # NOTE(review): version 38 is skipped; presumably no table exists
        # for it -- confirm.
        if version == 38:
            continue
        print("\n")
        try:
            entry_list, alias_list = do_the_work(
                version, args.severity, entry_list, alias_list
            )
        except FileNotFoundError:
            # No table file for this version: nothing more to process.
            break
        except Exception as err:
            # Still stop here, but say why instead of ending silently.
            print(f"Stopped at version {version}: {err!r}", file=sys.stderr)
            break
    print()

ISSUE-457-TOOLS/STEP_1-2.py

+102
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import re
4+
from datetime import datetime, UTC
5+
from pathlib import Path
6+
7+
# Root directory of the local checkout (machine specific).
MY_PATH = "/home/a001257/CODE/"
# Directory holding one sub-directory per standard-name table version.
BASE_PATH = f"{MY_PATH}cf-conventions/cf-convention.github.io/Data/cf-standard-names/"
# Schema reference written into the XML files; the relative form is kept
# here, commented out, as the alternative to the absolute URL in use.
# NEW_XSD = b"../../schema-files/cf-standard-name-table-2.0.xsd"
NEW_XSD = (
    b"https://raw.githubusercontent.com/"
    b"larsbarring/cf-convention.github.io/test-all-issue-457/"
    b"Data/schema-files/cf-standard-name-table-2.0.xsd"
)
12+
13+
def fix_v1_datetime(xml_raw):
    """Add a <last_modified> line after the version-1 <version_number> line.

    Args:
        xml_raw: The XML document as bytes.

    Returns:
        The document with the date line inserted after every occurrence of
        ``>1</version_number>`` followed by a newline. The progress message
        is printed whether or not anything was inserted.
    """
    version_line = b">1</version_number>\n"
    date_line = b" <last_modified>2002-04-02T12:00:00Z</last_modified>\n"
    patched = xml_raw.replace(version_line, version_line + date_line)
    print("ADDED : DATETIME in version 1")
    return patched
19+
20+
21+
def fix_v71_datetime(xml_raw):
    """Add the missing seconds to the time stamp ``2020-02-04T12:00Z``.

    Args:
        xml_raw: The XML document as bytes.

    Returns:
        The document with every ``2020-02-04T12:00Z`` replaced by
        ``2020-02-04T12:00:00Z``; unchanged (and silent) when the short
        form does not occur.
    """
    short_stamp = b"2020-02-04T12:00Z"
    if short_stamp not in xml_raw:
        return xml_raw
    patched = xml_raw.replace(short_stamp, b"2020-02-04T12:00:00Z")
    print("FIXED : DATETIME in version 71")
    return patched
26+
27+
def fix_v12_duplicate_entry(xml_raw):
    """Remove the first 'sea_surface_height_above_reference_ellipsoid' entry.

    Only the first matching ``<entry>...</entry>`` element is removed,
    together with the newline and indentation in front of it.

    Args:
        xml_raw: The XML document as bytes.

    Returns:
        The document without that first entry. The progress message is
        printed whether or not an entry was found.
    """
    pat = rb'\n *<entry id="sea_surface_height_above_reference_ellipsoid">.+?</entry> *?(?=\n)'
    # count and flags are given by keyword: passing them positionally is
    # deprecated and makes it easy to mistake one for the other.
    xml_raw = re.sub(pat, b"", xml_raw, count=1, flags=re.S)
    print("FIXED : Removed first duplicate of 'sea_surface_height_above_reference_ellipsoid'")
    return xml_raw
32+
33+
34+
def add_modified_date(xml_raw):
35+
time_stamp = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8")
36+
modified = b"last_modified"
37+
modified_start = b"<" + modified + b">"
38+
modified_end = modified_start.replace(b"<", b"</")
39+
modified_element = modified_start + time_stamp + modified_end
40+
inst_text = b"<institution>"
41+
n = len( inst_text)
42+
inst = re.search((b"\n( *)" + inst_text), xml_raw)
43+
spaces = inst.group()[1: -n]
44+
position = inst.span()[0]
45+
xml_raw = xml_raw[:position] + b"\n" + spaces + modified_element + xml_raw[position:]
46+
print("ADDED : MODIFIED DATE")
47+
return xml_raw
48+
49+
50+
def do_the_work(version):
    """Rewrite one version of the standard name table in place.

    The untouched file is kept as ``...-table__SAVED.xml``. If that copy
    already exists it is used as the input, otherwise the current file is
    read and copied there first. The result of all fixes is written back to
    the original file name.

    Args:
        version: Version number; selects the sub-directory below BASE_PATH.
    """
    xml_original = f"{BASE_PATH}{version}/src/cf-standard-name-table.xml"
    xml_saved = xml_original.replace("-table", "-table__SAVED")
    original_path = Path(xml_original)
    backup_path = Path(xml_saved)

    if backup_path.is_file():
        # A pristine copy exists already: always start from that one.
        xml_raw = backup_path.read_bytes()
        print(f"READING SAVED ORIGINAL FILE: {xml_original}")
    else:
        # First run for this version: keep a pristine copy before changing
        # anything.
        xml_raw = original_path.read_bytes()
        backup_path.write_bytes(xml_raw)
        print(f"READING AND SAVING ORIGINAL FILE: {xml_original}")

    if not xml_raw.startswith(b"<?xml "):
        xml_raw = b'<?xml version="1.0"?>\n' + xml_raw
        print("ADDED : '<?xml ...>")

    legacy_xsd_names = (
        b"CFStandardNameTable-1.0.xsd",
        b"CFStandardNameTable-1.1.xsd",
        b"cf-standard-name-table-1.1.xsd",
    )
    for old_xsd in legacy_xsd_names:
        if old_xsd not in xml_raw:
            continue
        xml_raw = xml_raw.replace(old_xsd, NEW_XSD)
        print(f"CHANGED : XSD FILE NAME {old_xsd.decode('utf-8')} --> {NEW_XSD.decode('utf-8')}")

    # Fixes that apply to one specific version only.
    version_fix = {
        1: fix_v1_datetime,
        12: fix_v12_duplicate_entry,
        71: fix_v71_datetime,
    }.get(version)
    if version_fix is not None:
        xml_raw = version_fix(xml_raw)

    # The existing date becomes the first-published date; a fresh
    # last_modified element is added afterwards.
    xml_raw = xml_raw.replace(b"last_modified", b"first_published_date")
    print("CHANGED : 'last_modified' --> 'first_published_date'")

    xml_raw = add_modified_date(xml_raw)

    original_path.write_bytes(xml_raw)
93+
94+
95+
if __name__ == "__main__":
    # Script entry point: process the table versions in ascending order
    # until one cannot be found.
    import sys

    for version in range(1, 100):
        # NOTE(review): version 38 is skipped; presumably no table exists
        # for it -- confirm.
        if version == 38:
            continue
        print("\n")
        try:
            do_the_work(version)
        except FileNotFoundError:
            # No table file for this version: nothing more to process.
            break
        except Exception as err:
            # Still stop here, but say why instead of ending silently.
            print(f"Stopped at version {version}: {err!r}", file=sys.stderr)
            break

0 commit comments

Comments
 (0)