|
| 1 | +import os |
| 2 | +import re |
| 3 | +import argparse |
| 4 | +import urllib.request |
| 5 | +from io import BytesIO |
| 6 | + |
| 7 | +from cfunits import Units as cfUnits |
| 8 | +from cf_units import Unit as uuUnits |
| 9 | +from lxml import etree |
| 10 | + |
| 11 | + |
| 12 | + |
# Root directory of the local checkouts; hard-coded to one user's home, so it
# has to be edited before running the script elsewhere.
PATH0 = "/home/a001257/CODE/"
# Directory under which each table version is looked up as
# "<version>/src/cf-standard-name-table.xml".
BASE_PATH = PATH0 + "cf-conventions/cf-convention.github.io/Data/cf-standard-names/"
| 15 | + |
def parse_xml(xml_raw):
    """Parse raw XML bytes and return the resulting lxml element tree.

    If the document is not well-formed, a separator line of 100 colons and
    the first 1000 bytes of the input are printed, then the
    ``etree.XMLSyntaxError`` is re-raised to the caller.
    """
    stream = BytesIO(xml_raw)
    try:
        return etree.parse(stream)
    except etree.XMLSyntaxError:
        # Show the head of the offending document before propagating.
        print(f"{':'*100}\n{xml_raw[:1000]}")
        raise
| 23 | + |
| 24 | + |
def get_schema(xml_tree):
    """Download and compile the XML schema referenced by the document root.

    The schema location is read from the root element's
    ``xsi:noNamespaceSchemaLocation`` attribute. If that attribute is absent,
    the value of the first root attribute is used instead, as before.

    Parameters:
        xml_tree: parsed document whose root element references the schema.

    Returns:
        An ``etree.XMLSchema`` built from the downloaded XSD.

    Raises:
        ValueError: if the root element has no attributes at all.
        urllib.error.URLError: if the schema cannot be downloaded.
        etree.XMLSyntaxError: if the downloaded schema is not well-formed.
    """
    root = xml_tree.getroot()
    xsd_uri = root.get(
        "{http://www.w3.org/2001/XMLSchema-instance}noNamespaceSchemaLocation"
    )
    if xsd_uri is None:
        # Legacy behaviour: trust whatever attribute happens to come first.
        attribute_values = root.values()
        if not attribute_values:
            raise ValueError("Root element has no attribute naming an XML schema")
        xsd_uri = attribute_values[0]

    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(xsd_uri) as link:
        xsd_raw = link.read()

    xsd_tree = parse_xml(xsd_raw)
    return etree.XMLSchema(xsd_tree)
| 33 | + |
| 34 | + |
def find_xml_errors(xml_tree, schema, xml_raw):
    """Validate ``xml_tree`` against ``schema`` and print the outcome.

    A valid document produces a single confirmation line. Otherwise every
    entry of ``schema.error_log`` is printed. When the error message mentions
    ``( description`` or ``( canonical_units`` (presumably libxml2's
    "Expected is ( ... )" wording -- confirm), the standard name is read from
    the quoted id on the reported source line and a shorter "has no <...>"
    message is printed instead of the raw one.

    Parameters:
        xml_tree: parsed document to validate.
        schema: object providing ``assertValid(tree)`` and ``error_log``.
        xml_raw: the document as bytes, used to look up source lines.
    """
    try:
        schema.assertValid(xml_tree)
    except etree.DocumentInvalid:
        xml_lines = xml_raw.split(b"\n")
        for error in schema.error_log:
            missing = next(
                (
                    element
                    for element in ("description", "canonical_units")
                    if f"( {element}" in error.message
                ),
                None,
            )
            std_name = None
            if missing is not None:
                std_name = _std_name_on_line(xml_lines, error.line)
            if std_name is not None:
                print(
                    f"Line {error.line} : Standard name entry for "
                    f"'{std_name}' has no <{missing}>"
                )
            else:
                # Unrecognised error, or no quoted id on the reported line:
                # fall back to the validator's own message.
                print(f"Line {error.line} : {error.message}")
    else:
        print(" ---- Valid and Well-formed")


def _std_name_on_line(xml_lines, line_no):
    """Return the first double-quoted value on 1-based line ``line_no``, or None."""
    if not 1 <= line_no <= len(xml_lines):
        return None
    pieces = xml_lines[line_no - 1].split(b'"')
    if len(pieces) < 2:
        return None
    # Diagnostic output only, so undecodable bytes are replaced, not fatal.
    return pieces[1].decode("utf-8", errors="replace")
| 49 | + |
| 50 | + |
def check_units(can_units, std_name):
    """Print a diagnostic if ``can_units`` looks suspicious for ``std_name``.

    The unit string is checked in this order:

    1. ``cfUnits(can_units).isvalid`` is false: report it as not accepted.
    2. ``uuUnits(can_units)`` raises ``ValueError``: report it as a special
       CF unit.
    3. It contains ``" -"``: retry with the space removed and report either a
       spurious space (retry parses) or a "really weird" unit (retry fails).
    4. It contains ``"/"``: report the use of a slash.

    Nothing is printed when none of these apply. The function returns None.

    Parameters:
        can_units: canonical units string taken from the table.
        std_name: standard name the units belong to, used only in messages.
    """
    if not cfUnits(can_units).isvalid:
        print(f"Canonical units '{can_units}' is not accepted by CF-UNITS for '{std_name}'")
        return

    try:
        uuUnits(can_units)
    except ValueError:
        # Closing quote after the unit was missing in the original message.
        print(f"Canonical unit '{can_units}' is a special CF unit for '{std_name}'")
        return

    if " -" in can_units:
        try:
            uuUnits(can_units.replace(" -", "-"))
        except ValueError:
            print(f"Canonical unit '{can_units}' is really weird for '{std_name}'")
        else:
            print(f"Canonical units '{can_units}' has a spurious space for '{std_name}'")
    elif "/" in can_units:
        print(f"Canonical units '{can_units}' used '/' for '{std_name}'")
| 68 | + |
| 69 | + |
def find_missing_and_duplicates(xml_raw, old_entry_list, old_alias_list):
    """Cross-check entries and aliases of one table against the previous one.

    The raw XML is scanned with regular expressions (not the parsed tree).
    Each ``<entry>`` that has a ``<canonical_units>`` value on a single line
    is passed to ``check_units``. The following problems are printed:

    * an alias id that is also defined as an entry in this table;
    * an alias id that was neither an entry nor an alias in the old lists;
    * an alias whose target is not an entry in this table;
    * an ``<alias>`` element lacking a quoted id or an ``<entry_id>`` value;
    * old entries that are neither an entry nor an alias in this table
      ("discontinued").

    Parameters:
        xml_raw: the standard-name table as bytes.
        old_entry_list: entry names of the previously checked table.
        old_alias_list: alias names of the previously checked table.

    Returns:
        ``(new_entry_list, new_alias_list)``: entry names in document order
        and alias names sorted alphabetically.
    """
    quoted_re = re.compile(rb'(?<=\").+?(?=\")')

    def _extract_entries(xml_raw):
        """Collect entry ids in document order, checking units on the way."""
        units_re = re.compile(rb'(?<=_units>).+?(?=</canonical)')
        entry_list = []
        for entry in re.finditer(rb'<entry id=\".+?\">.+?</entry>', xml_raw, re.S):
            block = entry.group()
            std_name = quoted_re.search(block).group().decode("utf-8")
            entry_list.append(std_name)
            can_units = units_re.search(block)
            if can_units:
                check_units(can_units.group().decode("utf-8"), std_name)
        return entry_list

    def _extract_aliases(xml_raw):
        """Map each alias id to the entry id it points at."""
        target_re = re.compile(rb'(?<=entry_id>).+?(?=</entry_id)')
        alias_dict = {}
        for alias in re.finditer(rb'<alias id=.+?</alias>', xml_raw, re.S):
            block = alias.group()
            alias_from = quoted_re.search(block)
            alias_to = target_re.search(block)
            if alias_from is None or alias_to is None:
                # Report instead of failing with AttributeError on None.
                print(f"Malformed alias (no quoted id or no <entry_id>): {block[:80]!r}")
                continue
            alias_dict[alias_from.group().decode("utf-8")] = (
                alias_to.group().decode("utf-8")
            )
        return alias_dict

    new_entry_list = _extract_entries(xml_raw)
    alias_dict = _extract_aliases(xml_raw)
    new_alias_list = sorted(alias_dict)

    # Sets give constant-time membership tests inside the loop below.
    new_entries = set(new_entry_list)
    previously_known = set(old_entry_list) | set(old_alias_list)

    for alias_from, alias_to in alias_dict.items():
        if alias_from in new_entries:
            print(f"Both defining and aliasing standard name '{alias_from}' into '{alias_to}'")
        elif alias_from not in previously_known:
            print(f"Aliasing the undefined standard name '{alias_from}' into '{alias_to}'")
        elif alias_to not in new_entries:
            print(f"Aliasing standard name '{alias_from}' into the non-existing '{alias_to}'")

    discontinued = set(old_entry_list) - new_entries - set(new_alias_list)
    for name in sorted(discontinued):
        print(f"Standard name '{name}' is discontinued")

    return new_entry_list, new_alias_list
| 107 | + |
| 108 | + |
def do_the_work(version, severity, entry_list, alias_list):
    """Run the requested checks on one version of the standard-name table.

    The table is read from
    ``BASE_PATH/<version>/src/cf-standard-name-table.xml`` and its path is
    printed. Schema validation runs unless ``severity`` is 1; the entry/alias
    cross-check runs whenever ``severity`` is non-zero.

    Returns the entry and alias lists of this version when the cross-check
    ran, otherwise the lists that were passed in.
    """
    table_path = f"{BASE_PATH}{version}/src/cf-standard-name-table.xml"
    with open(table_path, "rb") as table_file:
        raw_bytes = table_file.read()
    print(table_path)

    tree = parse_xml(raw_bytes)
    xsd_schema = get_schema(tree)

    run_xml_check = severity != 1
    if run_xml_check:
        find_xml_errors(tree, xsd_schema, raw_bytes)

    if not severity:
        return entry_list, alias_list

    updated_entries, updated_aliases = find_missing_and_duplicates(
        raw_bytes, entry_list, alias_list
    )
    return updated_entries, updated_aliases
| 122 | + |
| 123 | + |
if __name__ == "__main__":
    # Command-line entry point: check one or all standard-name table versions.
    parser = argparse.ArgumentParser(
        prog="LIST_ERRORS",
        description="\nList (most) XML errors in standard name files.",
    )
    parser.add_argument("-v", "--version", type=int, default=0,
                        help="Check a specific version (default is 0 (='all')).")
    parser.add_argument("-s", "--severity", type=int, default=0,
                        help="Level of error checks (0=xml (default), 1=CF, 2=both).")
    args = parser.parse_args()
    severity = args.severity

    if args.version == 0:
        # No version given: walk upwards until a table file is missing.
        version_list = range(1, 100)
    elif severity > 0:
        # The CF checks compare each table with its predecessor, so every
        # earlier version has to be processed first.
        version_list = range(1, args.version + 1)
    else:
        version_list = [args.version]

    entry_list = []
    alias_list = []
    for version in version_list:
        # NOTE(review): version 38 is skipped entirely -- presumably no table
        # with that number exists; confirm against the data directory.
        if version == 38:
            continue
        print("\n")
        try:
            entry_list, alias_list = do_the_work(
                version, severity, entry_list, alias_list
            )
        except FileNotFoundError:
            # Ran past the last available version: normal end of the walk.
            break
        except Exception as err:
            # Still stop, but say why instead of hiding the failure.
            print(f"Stopping at version {version}: {err!r}")
            break
    print()
0 commit comments