Commit b01fefe

Merge pull request #47 from HumanCompatibleAI/reward-classification
Reward classification
2 parents d9746eb + 324bb29

2 files changed: +22 -4 lines changed

src/reward_preprocessing/scripts/common/supervised.py

Lines changed: 21 additions & 4 deletions
@@ -29,6 +29,9 @@ def config():
     # Only evaluate test loss on 4 batches when you're in the middle of a train epoch.
     # Set to None to evaluate on the whole test set.
     test_subset_within_epoch = 4
+    # flag to train classification for whether reward is 0 or not, rather than
+    # regression.
+    classify = False
     # use adversarial training. below are configs to be set if adversarial is set to
     # True. for details, see documentation of SupervisedTrainer in
     # trainers/supervised_trainer.py
@@ -103,19 +106,33 @@ def make_trainer(
     limit_samples: int,
     test_subset_within_epoch: Optional[int],
     opt_kwargs: Optional[Mapping[str, Any]],
+    classify: bool,
     adversarial: bool,
     start_epoch: Optional[int],
     nonsense_reward: Optional[float],
     num_acts: Optional[int],
     vis_frac_per_epoch: Optional[float],
     gradient_clip_percentile: Optional[float],
+    device: str,
     debugging: Mapping,
 ) -> SupervisedTrainer:
     if not adversarial:
-        # MSE loss with mean reduction (the default)
-        # Mean reduction means every batch affects model updates the same, regardless of
-        # batch_size.
-        loss_fn = th.nn.MSELoss()
+        if not classify:
+            # MSE loss with mean reduction (the default)
+            # Mean reduction means every batch affects model updates the same,
+            # regardless of batch_size.
+            loss_fn = th.nn.MSELoss()
+        else:
+            # loss function takes outputs (interpreted as log-probability reward is
+            # zero), reward, and computes the cross-entropy loss.
+            def loss_fn(input, target):
+                if len(input.shape) == 1:
+                    input = input[:, None]
+                zeros = th.zeros(input.shape).to(device)
+                log_probs = th.cat((input, zeros), dim=1)
+                target_classes = (target != 0).long()
+                return th.nn.CrossEntropyLoss()(log_probs, target_classes)
+
     else:
         # Huber loss with mean reduction
         # When the prediction is within a distance of sqrt(3) of the regression target,
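
For readers skimming the diff, here is a minimal standalone sketch of the new classification loss. The tensor names (raw_output, rewards) and example values are illustrative, not taken from the repo. The model's single output per transition is treated as the logit of the "reward is zero" class, a constant zero logit stands in for the "reward is nonzero" class, and the pair is scored with CrossEntropyLoss against labels derived from whether the ground-truth reward is nonzero.

import torch as th

device = "cpu"  # the diff threads this in from the training script

def classify_loss(input, target):
    # Single score per transition -> column of logits for class 0
    # ("reward is zero"); a constant 0 logit represents class 1 ("nonzero").
    if len(input.shape) == 1:
        input = input[:, None]
    zeros = th.zeros(input.shape).to(device)
    log_probs = th.cat((input, zeros), dim=1)   # shape (batch, 2)
    target_classes = (target != 0).long()       # 0 = zero reward, 1 = nonzero
    return th.nn.CrossEntropyLoss()(log_probs, target_classes)

raw_output = th.randn(4)                        # e.g. reward-net outputs
rewards = th.tensor([0.0, 1.0, 0.0, -2.5])      # ground-truth rewards
loss = classify_loss(raw_output, rewards)

Since softmax over the logit pair (s, 0) gives sigmoid(s) as the predicted probability that the reward is zero, this two-class cross-entropy is equivalent to a binary cross-entropy on the single score, so no second output head is needed.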

src/reward_preprocessing/scripts/train_regression.py

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ def train_regression(supervised, checkpoint_epoch_interval: int):  # From ingred
         model=model,
         custom_logger=custom_logger,
         num_acts=num_acts,
+        device=device,
     )

     trainer.log_data_stats()
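
The one-line change above exists because the new loss builds zero logits on the fly; those must live on the same device as the model outputs, so the script now passes its device string through to make_trainer. A sketch of the failure mode this avoids (variable names are made up, not from the repo):

import torch as th

if th.cuda.is_available():
    preds = th.randn(4, 1, device="cuda")   # model outputs on the GPU
    zeros = th.zeros(preds.shape)           # defaults to CPU
    # th.cat((preds, zeros), dim=1) would raise a device-mismatch RuntimeError;
    # moving the zeros first, as the new loss_fn does, keeps the cat valid:
    log_probs = th.cat((preds, zeros.to("cuda")), dim=1)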
