Introducing schema in config yaml so that the user can supply a different schema vetted by MSD.

aradhakrishnanGFDL · aradhakrishnanGFDL · commit 3fc1bc6c9f5c · 2024-08-06T13:49:27.000-04:00
This is useful for MDTF, as MDTF may need tiny tweaks until we can stabilize and align completely.
diff --git a/catalogbuilder/cats/mdtf_template.json b/catalogbuilder/cats/mdtf_template.json
@@ -0,0 +1,139 @@
+{
+  "esmcat_version": "0.0.1",
+  "attributes": [
+    {
+      "column_name": "activity_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "institution_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "source_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "experiment_id",
+      "vocabulary": "",
+      "required": true
+    },
+    {
+      "column_name": "frequency",
+      "vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/CMIP6_frequency.json",
+      "required": true
+    },
+    {
+      "column_name": "realm",
+      "vocabulary": "",
+      "required": true
+    },
+    {
+      "column_name": "table_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "member_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "grid_label",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "variable_id",
+      "vocabulary": "",
+      "required": true
+    },
+    {
+      "column_name": "time_range",
+      "vocabulary": "",
+      "required": true
+    },
+    {
+      "column_name": "chunk_freq",
+      "required": false
+    },
+    {
+      "column_name":"platform",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name":"target",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+     "column_name": "cell_methods",
+      "vocabulary": "",
+      "required": "enhanced"
+    },
+    {
+      "column_name": "path",
+      "vocabulary": "",
+      "required": true
+    },
+    {
+      "column_name": "dimensions",
+      "vocabulary": "",
+      "required": "enhanced"
+    },
+    {
+      "column_name": "version_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "standard_name",
+      "vocabulary": "",
+      "required": "enhanced"
+    }
+  ],
+  "assets": {
+    "column_name": "path",
+    "format": "netcdf",
+    "format_column_name": null
+  },
+  "aggregation_control": {
+    "variable_column_name": "variable_id",
+    "groupby_attrs": [
+      "source_id",
+      "experiment_id",
+      "frequency",
+      "table_id",
+      "grid_label", 
+      "realm",
+      "member_id",
+      "chunk_freq",
+      "time_range"
+    ],
+    "aggregations": [
+      {
+        "type": "union",
+        "attribute_name": "variable_id",
+        "options": {}
+      },
+      {
+        "type": "join_existing",
+        "attribute_name": "time_range",
+        "options": {
+          "dim": "time",
+          "coords": "minimal",
+          "compat": "override"
+        }
+      }
+    ]
+  },
+  "id": "esm_catalog_ESM4",
+  "description": null,
+  "title": null,
+  "last_updated": "2023-05-07T16:35:52Z",
+  "catalog_file": "gfdl_autotest.csv"
+}
diff --git a/catalogbuilder/intakebuilder/configparser.py b/catalogbuilder/intakebuilder/configparser.py
@@ -30,4 +30,9 @@ def __init__(self, config):
             print("output_file_template :", self.output_file_template)
         except:
             raise KeyError("output_file_template does not exist in config")
+        try:
+            self.schema = configfile['schema']
+            print("schema:", self.schema)
+        except:
+            raise KeyError("schema does not exist in config")
 
diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py
@@ -27,7 +27,7 @@
         sys.exit("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")
 
 package_dir = os.path.dirname(os.path.abspath(__file__))
-template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
+#template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
 
 def create_catalog(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
          overwrite=False, append=False, slow = False):
@@ -42,7 +42,13 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
             
         input_path = configyaml.input_path
         output_path = configyaml.output_path
-
+        
+        if configyaml.schema is None or not configyaml.schema:
+            print("We will use catalog builder catalogbuilder/cats/gfdl_template.json as your json schema")
+            template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
+        else:
+            template_path = configyaml.schema
+            print("Using schema from config file", template_path)
     if not os.path.exists(input_path):
         sys.exit("Input path does not exist. Adjust configuration.")
     if not os.path.exists(Path(output_path).parent.absolute()):
diff --git a/catalogbuilder/tests/config-mdtf.yaml b/catalogbuilder/tests/config-mdtf.yaml
@@ -37,6 +37,6 @@ output_file_template: ['realm','time_range','variable_id']
 
 #######################################################
 
-json_template: "/home/a1r/git/forkCatalogBuilder-/catalogbuilder/cats/mdtf-template.json" #if your json schema is slighlty different but vetted with MSD, you may use your json schema here
+schema: "/home/a1r/git/forkCatalogBuilder-/catalogbuilder/cats/mdtf_template.json" #if your json schema is slightly different but vetted with MSD, you may use your json schema here
 input_path:  "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
-output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
+output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30_test" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path