
Commit 2875749

add Factorization Machines

1 parent 27de5e8

4 files changed (+133, −19 lines)

README.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -19,8 +19,8 @@ Download these datasets and unzip in the root of this directory:
 5. Personalized Ranking for Recommender Systems: [`utils.py`](utils.py). For now, only the BPR loss function is implemented.
 6. Neural Collaborative Filtering for Personalized Ranking: [`neumf.py`](neumf.py)
 7. Sequence-Aware Recommender Systems: [`caser.py`](caser.py)
-8. Feature-Rich Recommender Systems:
-9. Factorization Machines:
+8. Feature-Rich Recommender Systems: [`ctr.py`](ctr.py) contains dataloaders for the CTR dataset
+9. Factorization Machines: [`fm.py`](fm.py)
 10. Deep Factorization Machines:
 
 ## Usage
```

ctr.py

Lines changed: 21 additions & 15 deletions
```diff
@@ -17,7 +17,7 @@ def csv_reader(data_path):
 
 
 class CTRDataset(BaseDataset):
-    def __init__(self, data_dir, min_threshold=4):
+    def __init__(self, data_dir="./ctr", min_threshold=4):
         """Read CTR dataset from train.csv and test.csv
 
         Parameters
@@ -38,6 +38,7 @@ def __init__(self, data_dir, min_threshold=4):
             col: self.train_df[col].value_counts()
             for col in self.feat_cols}
         # Feature mapper maps each unique encoded value to an identifier,
+        # so every value is treated as categorical.
         # Unique values are filtered to those occurring at least min_threshold times.
         # A default value is assigned to values not defined in the feature mapper.
         self.feat_mapper = {}
@@ -53,38 +54,43 @@ def _constant_factory(v):
         # Feature dimension = number of unique values = number of values in mapper + defaults
         self.feat_dims = np.array([len(mapper) + 1
                                    for mapper in self.feat_mapper.values()])
-        # Offset is a value added to the whole field to discriminate column order
-        self.offsets = np.array((0, *np.cumsum(self.feat_dims).tolist()[:-1]))
+        # Offset is a value added to the whole field to discriminate values in different columns
+        self.offsets = np.array((0, *np.cumsum(self.feat_dims).tolist()[:-1])).astype(np.int32)
         # Map values in dataframe
         for col, mapper in self.feat_mapper.items():
             self.train_df[col] = self.train_df[col].map(mapper)
             self.test_df[col] = self.test_df[col].map(mapper)
         # Materialized arrays for each split
-        self.getitem_df = None
+        self.X = None
+        self.y = None
+
+    def build_items(self, train=True):
+        if train:
+            df = self.train_df
+        else:
+            df = self.test_df
+        self.X = df[self.feat_cols].values + self.offsets
+        self.y = df[0].values
 
     def split(self, *args, **kwargs) -> Tuple[BaseDataset, BaseDataset]:
         train_split = deepcopy(self)
-        del train_split.test_df
-        train_split.getitem_df = self.train_df
+        train_split.build_items(True)
 
         test_split = deepcopy(self)
-        del test_split.train_df
-        test_split.getitem_df = self.test_df
+        test_split.build_items(False)
+
         return train_split, test_split
 
     def __len__(self):
-        assert self.getitem_df is not None
-        return len(self.getitem_df)
+        assert self.X is not None and self.y is not None
+        return len(self.X)
 
     def __getitem__(self, idx):
-        assert self.getitem_df is not None
-        x = self.getitem_df[self.feat_cols].iloc[idx].values + self.offsets
-        y = self.getitem_df[0].iloc[idx]
-        return x, y
+        assert self.X is not None and self.y is not None
+        return self.X[idx], self.y[idx]
 
 
 if __name__ == "__main__":
     data = CTRDataset("ctr")
     train_split, test_split = data.split()
     print(train_split[0])
-    print(test_split[0])
```
fm.py

Lines changed: 108 additions & 0 deletions
```diff
@@ -0,0 +1,108 @@
+from argparse import ArgumentParser
+
+import numpy as np
+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorch_lightning.loggers import TensorBoardLogger
+from torchmetrics import Accuracy
+
+from lit_data import LitDataModule
+from lit_model import LitModel
+from ctr import CTRDataset
+
+
+class FactorizationMachine(nn.Module):
+    def __init__(self, feat_dims, embedding_dims):
+        super().__init__()
+        num_inputs = int(sum(feat_dims))
+        self.embedding = nn.Embedding(num_inputs, embedding_dims)
+        self.proj = nn.Embedding(num_inputs, 1)
+        self.fc = nn.Linear(1, 1)
+        for param in self.parameters():
+            # Xavier init needs 2+ dims; skip the 1-d bias parameters
+            if param.dim() > 1:
+                nn.init.xavier_normal_(param)
+
+    def forward(self, x):
+        v = self.embedding(x)
+        # Pairwise interactions in linear time: 1/2 * sum_f((sum_i v_if)^2 - sum_i v_if^2)
+        interaction = 1/2 * (v.sum(1)**2 - (v**2).sum(1)).sum(-1, keepdim=True)
+        proj = self.proj(x).sum(1)
+        logit = self.fc(proj + interaction)
+        return torch.sigmoid(logit).flatten()
+
+
+class LitFM(pl.LightningModule):
+    def __init__(self, lr=0.002, **kwargs):
+        super().__init__()
+        self.save_hyperparameters()
+        self.model = FactorizationMachine(**kwargs)
+        self.lr = lr
+        self.train_acc = Accuracy()
+        self.test_acc = Accuracy()
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), self.lr, weight_decay=1e-5)
+
+    def forward(self, x):
+        return self.model(x)
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        ypred = self(x)
+        loss = F.binary_cross_entropy(ypred, y.to(torch.float32))
+        self.train_acc.update(ypred, y)
+        return {"loss": loss}
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        ypred = self(x)
+        loss = F.binary_cross_entropy(ypred, y.to(torch.float32))
+        self.test_acc.update(ypred, y)
+        return {"loss": loss}
+
+    def training_epoch_end(self, outputs):
+        avg_loss = torch.stack([x["loss"] for x in outputs]).mean()
+        acc = self.train_acc.compute()
+        self.train_acc.reset()
+        self.logger.experiment.add_scalar(
+            "train/loss", avg_loss, self.current_epoch)
+        self.logger.experiment.add_scalar(
+            "train/acc", acc, self.current_epoch)
+
+    def validation_epoch_end(self, outputs):
+        avg_loss = torch.stack([x["loss"] for x in outputs]).mean()
+        acc = self.test_acc.compute()
+        self.test_acc.reset()
+        self.logger.experiment.add_scalar(
+            "val/loss", avg_loss, self.current_epoch)
+        self.logger.experiment.add_scalar(
+            "val/acc", acc, self.current_epoch)
+
+
+def main(args):
+    data = LitDataModule(
+        CTRDataset(),
+        batch_size=args.batch_size,
+        num_workers=3,
+        prefetch_factor=4)
+    data.setup()
+
+    model = LitFM(
+        feat_dims=data.dataset.feat_dims,
+        embedding_dims=args.embedding_dims)
+
+    logger = TensorBoardLogger("lightning_logs", name=f"FM_{args.embedding_dims}")
+    trainer = pl.Trainer.from_argparse_args(args, logger=logger)
+    trainer.fit(model, data)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--embedding_dims", type=int, default=20)
+    parser.add_argument("--batch_size", type=int, default=1024)
+    pl.Trainer.add_argparse_args(parser)
+    args = parser.parse_args()
+    main(args)
```
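`FactorizationMachine.forward` relies on the standard FM identity sum_{i<j} <v_i, v_j> = 1/2 * sum_f((sum_i v_if)^2 - sum_i v_if^2), which computes all pairwise feature interactions in time linear in the number of fields rather than quadratic. A quick standalone sanity check of that identity (not part of the commit):

```python
import torch

torch.manual_seed(0)
v = torch.randn(8, 4)  # 8 fields, embedding dimension 4

# Naive O(fields^2) sum over all pairs of field embeddings
naive = sum((v[i] * v[j]).sum() for i in range(8) for j in range(i + 1, 8))

# Linear-time reformulation used in forward()
fast = 0.5 * ((v.sum(0) ** 2) - (v ** 2).sum(0)).sum()

assert torch.allclose(naive, fast, atol=1e-5)
```

Assuming the usual PyTorch Lightning 1.x argparse integration, a training run would look something like `python fm.py --embedding_dims 20 --batch_size 1024 --max_epochs 20`; the exact `Trainer` flags depend on the installed `pytorch_lightning` version.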

lit_data.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -43,8 +43,8 @@ def __init__(self, dataset: BaseDataset,
         }
 
     def setup(self):
-        self.num_users = self.dataset.num_users
-        self.num_items = self.dataset.num_items
+        self.num_users = getattr(self.dataset, "num_users", None)
+        self.num_items = getattr(self.dataset, "num_items", None)
         self.train_split, self.test_split = self.dataset.split(
             self.train_ratio)
```
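The `getattr` fallback lets the same `LitDataModule` wrap datasets with no user/item notion, such as `CTRDataset`. A tiny illustration with a hypothetical stand-in class:

```python
class TabularOnlyDataset:
    """Hypothetical dataset exposing neither num_users nor num_items."""

ds = TabularOnlyDataset()
print(getattr(ds, "num_users", None))  # None, instead of an AttributeError
```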
