Commit a221f38

Updated code comments.
1 parent 041d02f commit a221f38

4 files changed: +46 -56 lines changed

configs/data.yaml (+5 -3)
@@ -1,16 +1,16 @@
 # @package _global_
 
 # ! Dataset Preprocessing
+preprocess_device: gpu # Set to cpu if your GPU memory is below 32GB
 add_self_loop: false
 to_bidirected: true
 n_hops: 2
 
 # ! Train and Evaluation Dataset Lookup
 dataset: Debug
-preprocess_device: gpu # Set to cpu if your GPU memory is below 32GB
 train_datasets: ${oc.select:_dataset_lookup.${dataset}.train,${dataset}}
 eval_datasets: ${oc.select:_dataset_lookup.${dataset}.eval,${dataset}}
-_trans_datasets: [ Arxiv, Product, Cora, Wisconsin ]
+_trans_datasets: [ Arxiv, Product, Cora, Wisconsin ] # Used when identifying heldout datasets.
 
 _all_datasets: [
   Arxiv,
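
A note on the `oc.select` lines above: OmegaConf resolves `${oc.select:path,default}` to the value at `path` when it exists and to the default otherwise, so any dataset without a `_dataset_lookup` entry simply trains and evaluates on itself. A minimal sketch of that fallback behavior (the `Debug` lookup entry below is invented for illustration):

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    """
dataset: Debug
_dataset_lookup:
  Debug:  # hypothetical lookup entry, for illustration only
    train: [Cora]
    eval: [Cora, Wisconsin]
train_datasets: ${oc.select:_dataset_lookup.${dataset}.train,${dataset}}
eval_datasets: ${oc.select:_dataset_lookup.${dataset}.eval,${dataset}}
"""
)
print(cfg.train_datasets)  # -> the lookup entry ['Cora']
cfg.dataset = "Arxiv"      # no lookup entry -> falls back to the dataset name
print(cfg.eval_datasets)   # -> Arxiv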
@@ -101,17 +101,19 @@ _ds_meta_data:
   # WikiTraffic: Nodes represent web pages and edges represent hyperlinks between them. Node features represent several informative nouns in the Wikipedia pages. The task is to predict the average daily traffic of the web page.
   Chameleon: pyg, WikipediaNetwork.chameleon # 5201, 217073, 2089, 5
   Squirrel: pyg, WikipediaNetwork.squirrel # 2277, 36101, 2325, 5
-
+  # Airport traffic graphs
   AirBrazil: pyg, Airports.Brazil # 131 1,038 131 4
   AirUS: pyg, Airports.USA # 1,190 13,599 1190 4
   AirEU: pyg, Airports.Europe # 399 5,995 399 4
 
   # ! HeterophilousGraphDataset
+  # See https://arxiv.org/abs/2302.11640 for details
   Roman: pyg, HeterophilousGraphDataset.Roman-empire # 22,662 32,927 300 18
   AmzRatings: pyg, HeterophilousGraphDataset.Amazon-ratings # 24,492 93,050 300 5
   Minesweeper: pyg, HeterophilousGraphDataset.Minesweeper # 10,000 39,402 7 2
   Tolokers: pyg, HeterophilousGraphDataset.Tolokers # 11,758 519,000 10 2
   Questions: pyg, HeterophilousGraphDataset.Questions # 48,921 153,540 301 2
+
   # Each node corresponds to an actor, and the edge between two nodes denotes co-occurrence on the same Wikipedia page. Node features correspond to some keywords in the Wikipedia pages. The task is to classify the nodes into five categories in terms of words of actor’s Wikipedia.
   Actor: pyg, Actor # 7,600 30,019 932 5
graphany/data.py (+35 -40)
@@ -12,7 +12,6 @@
 import numpy as np
 import pytorch_lightning as pl
 import torch
-import torch.nn.functional as F
 from hydra.utils import instantiate
 from omegaconf import OmegaConf
 from scipy.spatial.distance import pdist, squareform
@@ -25,15 +24,13 @@
 from graphany.utils import logger, timer
 
 
-def get_entropy_normed_cond_gaussian_prob(
-    X, entropy, beta_list=None, metric="euclidean", use_cpython=False, return_beta=False
-):
+def get_entropy_normed_cond_gaussian_prob(X, entropy, metric="euclidean"):
     """
     Parameters
     ----------
     X: The matrix for pairwise similarity
     entropy: Perplexity of the conditional prob distribution
-    Returns conditional probability
+    Returns the entropy-normalized conditional gaussian probability based on distances.
     -------
     """
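
For readers unfamiliar with the function being simplified above: it computes t-SNE-style conditional Gaussian probabilities, calibrating a per-row precision beta by binary search until each row's distribution reaches a target entropy. A rough standalone sketch of that idea (names, tolerances, and the entropy convention are illustrative assumptions, not the repo's exact implementation):

import numpy as np
from scipy.spatial.distance import pdist, squareform

def cond_gaussian_prob_sketch(X, target_entropy, metric="euclidean", tol=1e-5, max_iter=50):
    # Squared pairwise distances between rows of X
    D = squareform(pdist(X, metric=metric)) ** 2
    n = X.shape[0]
    P = np.zeros((n, n))
    for i in range(n):
        d = np.delete(D[i], i)          # distances to all other points
        lo, hi, beta = 0.0, np.inf, 1.0
        for _ in range(max_iter):
            p = np.exp(-beta * d)
            p /= p.sum()
            h = -np.sum(p * np.log(p + 1e-12))  # entropy of P(.|i)
            if abs(h - target_entropy) < tol:
                break
            if h > target_entropy:      # too flat: raise the precision
                lo = beta
                beta = beta * 2 if hi == np.inf else (lo + hi) / 2
            else:                       # too peaked: lower the precision
                hi = beta
                beta = (lo + hi) / 2
        P[i, np.arange(n) != i] = p
    return P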

@@ -175,30 +172,28 @@ def val_dataloader(self):
         sub_dataloaders = {
             name: ds.val_dataloader() for name, ds in self.eval_ds_dict.items()
         }
-        return pl.utilities.combined_loader.CombinedLoader(
-            sub_dataloaders, "max_size"
-        )  # Use max_size instead of max_size_cycle to avoid duplicates
+        # Use max_size instead of max_size_cycle to avoid repeated evaluation on small datasets
+        return pl.utilities.combined_loader.CombinedLoader(sub_dataloaders, "max_size")
 
     def test_dataloader(self):
         sub_dataloaders = {
             name: ds.test_dataloader() for name, ds in self.eval_ds_dict.items()
         }
-        return pl.utilities.combined_loader.CombinedLoader(
-            sub_dataloaders, "max_size"
-        )  # Use max_size instead of max_size_cycle to avoid duplicates
+        # Use max_size instead of max_size_cycle to avoid repeated evaluation on small datasets
+        return pl.utilities.combined_loader.CombinedLoader(sub_dataloaders, "max_size")
 
 
 class GraphDataset(pl.LightningDataModule):
     def __init__(
-            self,
-            cfg,
-            ds_name,
-            cache_dir,
-            train_batch_size=256,
-            val_test_batch_size=256,
-            n_hops=1,
-            preprocess_device=torch.device("cpu"),
-            permute_label=False,
+        self,
+        cfg,
+        ds_name,
+        cache_dir,
+        train_batch_size=256,
+        val_test_batch_size=256,
+        n_hops=1,
+        preprocess_device=torch.device("cpu"),
+        permute_label=False,
     ):
         super().__init__()
         self.cfg = cfg
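
The "max_size" mode chosen above matters for evaluation: "max_size_cycle" restarts exhausted loaders until the longest one finishes, so small datasets would be evaluated several times per epoch, whereas "max_size" yields None for loaders that have run out. A toy illustration (assuming a recent PyTorch Lightning; the loaders are made up):

from torch.utils.data import DataLoader
from pytorch_lightning.utilities.combined_loader import CombinedLoader

loaders = {
    "small": DataLoader(range(2), batch_size=1),
    "large": DataLoader(range(4), batch_size=1),
}
combined = CombinedLoader(loaders, "max_size")
for batch, batch_idx, _ in combined:
    # After "small" runs out, its entry is None rather than a recycled batch
    print(batch_idx, batch["small"], batch["large"])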
@@ -369,9 +364,9 @@ def to_mask(indices):
         label = dataset.y
 
         if (
-                hasattr(dataset, "train_mask")
-                and hasattr(dataset, "val_mask")
-                and hasattr(dataset, "test_mask")
+            hasattr(dataset, "train_mask")
+            and hasattr(dataset, "val_mask")
+            and hasattr(dataset, "test_mask")
         ):
             train_mask, val_mask, test_mask = (
                 dataset.train_mask,
@@ -399,9 +394,8 @@ def to_mask(indices):
             # ! Multiple splits
             # Modified: Use the ${seed} split if not specified!
             split_index = self.data_init_args.get("split", self.cfg.seed)
-            self.split_index = split_index = (
-                split_index % train_mask.ndim
-            )  # Avoid invalid seed value
+            # Avoid invalid split index
+            self.split_index = split_index = (split_index % train_mask.ndim)
             train_mask = train_mask[:, split_index].squeeze()
             val_mask = val_mask[:, split_index].squeeze()
             if test_mask.ndim == 2:
@@ -422,29 +416,31 @@ def to_mask(indices):
         return g, label, feat, train_mask, val_mask, test_mask, num_class
 
     def compute_linear_gnn_logits(
-            self, features, n_per_label_examples, visible_nodes, bootstrap=False
+        self, features, n_per_label_examples, visible_nodes, bootstrap=False
     ):
+        # Compute and save LinearGNN logits into a dict. Note the computation is on CPU as torch does not support
+        # the gelss driver on GPU currently.
         preds = {}
         label, num_class, device = self.label, self.num_class, torch.device("cpu")
         label = label.to(device)
         visible_nodes = visible_nodes.to(device)
-        for channel, X in features.items():
-            X = X.to(device)
+        for channel, F in features.items():
+            F = F.to(device)
             if bootstrap:
                 ref_nodes = sample_k_nodes_per_label(
                     label, visible_nodes, n_per_label_examples, num_class
                 )
             else:
                 ref_nodes = visible_nodes
-            Y_L = F.one_hot(label[ref_nodes], num_class).float()
+            Y_L = torch.nn.functional.one_hot(label[ref_nodes], num_class).float()
             with timer(
-                    f"Solving with CPU driver (N={len(ref_nodes)}, d={X.shape[1]}, k={num_class})",
-                    logger.debug,
+                f"Solving with CPU driver (N={len(ref_nodes)}, d={F.shape[1]}, k={num_class})",
+                logger.debug,
             ):
                 W = torch.linalg.lstsq(
-                    X[ref_nodes.cpu()].cpu(), Y_L.cpu(), driver="gelss"
+                    F[ref_nodes.cpu()].cpu(), Y_L.cpu(), driver="gelss"
                 )[0]
-            preds[channel] = X @ W
+            preds[channel] = F @ W
 
         return preds
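
On the new comment about the gelss driver: torch.linalg.lstsq only supports this SVD-based (rank-deficient-safe) LAPACK driver for CPU tensors, which is why the solve above moves everything to CPU. A self-contained sketch of the same closed-form LinearGNN fit, with made-up shapes:

import torch
import torch.nn.functional as F

N, d, k = 500, 32, 5                  # nodes, feature dim, classes (made up)
X = torch.randn(N, d)                 # propagated node features for one channel
y = torch.randint(0, k, (N,))         # node labels
ref_nodes = torch.randperm(N)[:100]   # visible/reference nodes

Y_L = F.one_hot(y[ref_nodes], k).float()
# Least-squares fit of features to one-hot labels; gelss requires CPU tensors
W = torch.linalg.lstsq(X[ref_nodes], Y_L, driver="gelss").solution
logits = X @ W                        # (N, k) predictions for all nodes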

@@ -466,8 +462,8 @@ def prepare_prop_features_logits_and_dist_features(self, g, input_feats, n_hops)
         if not os.path.exists(self.cache_f_name):
             g = g.to(self.preprocess_device)
             with timer(
-                    f"Computing {self.name} message passing and normalized predictions to file {self.cache_f_name}",
-                    logger.info,
+                f"Computing {self.name} message passing and normalized predictions to file {self.cache_f_name}",
+                logger.info,
             ):
                 dim = input_feats.size(1)
                 LP = torch.zeros(n_hops, g.number_of_nodes(), dim).to(
@@ -504,9 +500,9 @@ def prepare_prop_features_logits_and_dist_features(self, g, input_feats, n_hops)
         features, unmasked_pred = torch.load(self.cache_f_name, map_location="cpu")
         if not os.path.exists(self.dist_f_name):
             with timer(
-                    f"Computing {self.name} conditional gaussian distances "
-                    f"to file {self.dist_f_name}",
-                    logger.info,
+                f"Computing {self.name} conditional gaussian distances "
+                f"and save to {self.dist_f_name}",
+                logger.info,
             ):
                 # y_feat: n_nodes, n_channels, n_labels
                 y_feat = np.stack(
@@ -532,7 +528,6 @@ def prepare_prop_features_logits_and_dist_features(self, g, input_feats, n_hops)
                         dist[:, pair_index] = cond_gaussian_prob[:, c, c_prime]
                         pair_index += 1
 
-            # Convert dist to a PyTorch tensor and move it to the same device as y_feat
             dist = torch.from_numpy(dist)
             torch.save(dist, self.dist_f_name)
         else:
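
For orientation, the (n_hops, n_nodes, dim) buffer LP allocated in the message-passing hunk above caches one feature matrix per propagation round before everything is written to self.cache_f_name. A dense sketch of that propagation step (the adjacency and shapes here are synthetic; the real code runs sparse message passing on the graph g):

import torch

def propagate_sketch(adj_norm, feats, n_hops):
    # adj_norm: (n_nodes, n_nodes) normalized adjacency; feats: (n_nodes, dim)
    LP = torch.zeros(n_hops, *feats.shape)
    h = feats
    for hop in range(n_hops):
        h = adj_norm @ h   # one round of message passing
        LP[hop] = h        # cache the features after hop+1 rounds
    return LP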

graphany/model.py (+3 -5)
@@ -37,10 +37,9 @@ def compute_dist(self, y_feat):
                 y_feat[i, :, :].cpu().numpy(), self.entropy
             )
 
-        # Create dist as a numpy array
+        # Compute pairwise distances between channels n_channels(n_channels-1)/2 total features
         dist = np.zeros((bsz, self.dist_feat_dim), dtype=np.float32)
 
-        # Compute pairwise distances between channels n_channels(n_channels-1)/2 total features
         pair_index = 0
         for c in range(n_channel):
             for c_prime in range(n_channel):
@@ -52,14 +51,13 @@ def compute_dist(self, y_feat):
         return dist
 
     def forward(self, logit_dict, dist=None, **kwargs):
-        # Label logits tensor of shape (batch_size, n_channels, * n_classes)
+        # logit_dict: key: channel, value: prediction of shape (batch_size, n_classes)
         y_feat = torch.stack([logit_dict[c] for c in self.feat_channels], dim=1)
         y_pred = torch.stack([logit_dict[c] for c in self.pred_channels], dim=1)
 
         # ! Fuse y_pred with attentions
-        # Compute attention of (batch_size, n_channels)
         dist = self.compute_dist(y_feat) if dist is None else dist
-        # Project pairwise differences to the attention scores via MLP
+        # Project pairwise differences to the attention scores (batch_size, n_channels)
         attention = self.mlp(dist)
         attention = th.softmax(attention / self.att_temperature, dim=-1)
         fused_y = th.sum(
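
Putting the rewritten comments of this forward pass together: per-channel logits are stacked, the channel-distance features drive an MLP, and a temperature-scaled softmax yields per-channel fusion weights. A condensed sketch (the helper name and signature are hypothetical; shapes follow the comments above):

import torch

def fuse_channel_predictions(y_pred, dist, mlp, att_temperature=1.0):
    # y_pred: (batch_size, n_channels, n_classes) stacked per-channel logits
    # dist:   (batch_size, dist_feat_dim) pairwise channel-distance features
    attention = torch.softmax(mlp(dist) / att_temperature, dim=-1)  # (batch, n_channels)
    # Attention-weighted sum over channels -> (batch_size, n_classes)
    return torch.sum(attention.unsqueeze(-1) * y_pred, dim=1)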

graphany/run.py (+3 -8)
@@ -18,7 +18,7 @@
 mean = lambda input: np.round(np.mean(input).item(), 2)
 
 
-class InductiveLabelPred(pl.LightningModule):
+class InductiveNodeClassification(pl.LightningModule):
     def __init__(self, cfg, combined_dataset, checkpoint=None):
         super().__init__()
         self.cfg = cfg
@@ -73,8 +73,6 @@ def get_metric_name(self, ds_name, split):
         return f"ind/{ds_name.lower()[:4]}_{split}_acc"
 
     def configure_optimizers(self):
-        num_devices = self.cfg.gpus if self.cfg.gpus > 0 else 1
-
         # start with all the candidate parameters
         param_dict = {pn: p for pn, p in self.named_parameters()}
         # filter out those that do not require grad
10199
else: # AdamW
102100
optimizer = torch.optim.AdamW(
103101
optim_groups,
104-
lr=self.cfg.lr * num_devices,
102+
lr=self.cfg.lr,
105103
weight_decay=self.cfg.weight_decay,
106104
)
107105
return optimizer
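
With the device-count scaling removed, AdamW now receives cfg.lr unchanged. The optim_groups built from param_dict in the unchanged lines typically separate decayed from non-decayed parameters; a sketch of that common pattern (the decay rule here is an assumption, not necessarily the repo's exact code):

import torch

def build_optim_groups(model, weight_decay):
    # start with all the candidate parameters, filter out frozen ones
    param_dict = {pn: p for pn, p in model.named_parameters() if p.requires_grad}
    # common convention: decay matrix-shaped weights, not biases/norm params
    decay = [p for p in param_dict.values() if p.dim() >= 2]
    no_decay = [p for p in param_dict.values() if p.dim() < 2]
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]

# optimizer = torch.optim.AdamW(build_optim_groups(model, 0.01), lr=1e-3)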
@@ -117,9 +115,6 @@ def move_metrics_to_device(self):
         for metrics_dict in self.metrics.values():
             for metric in metrics_dict.values():
                 metric.to(self.device)
-        # Example for a direct metric attribute
-        if hasattr(self, "accuracy"):
-            self.accuracy.to(self.device)
 
     def predict(self, ds, nodes, input, is_training=False):
         # Use preprocessed distance during evaluation
@@ -259,7 +254,7 @@ def construct_ds_dict(datasets):
 
     combined_dataset = CombinedDataset(train_ds_dict, eval_ds_dict, cfg)
 
-    model = InductiveLabelPred(cfg, combined_dataset, cfg.get("prev_ckpt"))
+    model = InductiveNodeClassification(cfg, combined_dataset, cfg.get("prev_ckpt"))
     # Set up the checkpoint callback to save only at the end of training
     checkpoint_callback = pl.callbacks.ModelCheckpoint(
         dirpath=cfg.dirs.output,  # specify where to save
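
Regarding the comment about saving only at the end of training: one way to get that behavior from Lightning's ModelCheckpoint is to disable metric-based top-k saving and keep only the last checkpoint. A possible configuration sketch (not necessarily the arguments the repo actually passes):

import pytorch_lightning as pl

checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath="output/",   # placeholder for cfg.dirs.output
    save_top_k=0,        # no metric-based intermediate checkpoints
    save_last=True,      # write last.ckpt when training ends
)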
