interface(nyz): add naive interfaces for grpo/rloo

PaParaZz1 · PaParaZz1 · commit 2e494371978f · 2025-02-13T14:43:17.000+08:00
diff --git a/ding/rl_utils/grpo.py b/ding/rl_utils/grpo.py
@@ -0,0 +1,18 @@
+from typing import Optional, Tuple
+from collections import namedtuple
+import torch
+import torch.nn as nn
+from torch.distributions import Independent, Normal
+
+grpo_policy_data = namedtuple('grpo_policy_data', ['logit_new', 'logit_old', 'logit_ref', 'action', 'adv', 'weight'])
+
+
+def grpo_policy_error(
+        data: namedtuple,
+        clip_ratio: float = 0.2,
+) -> Tuple[namedtuple, namedtuple]:
+    """
+    .. note::
+        Each element of the input ``data`` is a group of response samples from the same prompt.
+    """
+    raise NotImplementedError
diff --git a/ding/rl_utils/ppo.py b/ding/rl_utils/ppo.py
@@ -142,7 +142,7 @@ def ppo_policy_error(
         >>> loss, info = ppo_policy_error(data)
 
     .. note::
-        This function can be extended from `B` to more parallel dimensions, like `(B, S)`, where `S` is the 
+        This function can be extended from `B` to more parallel dimensions, like `(B, S)`, where `S` is the
         sequence length in LLM/VLM.
 
     .. note::
diff --git a/ding/rl_utils/rloo.py b/ding/rl_utils/rloo.py
@@ -0,0 +1,18 @@
+from typing import Optional, Tuple
+from collections import namedtuple
+import torch
+import torch.nn as nn
+from torch.distributions import Independent, Normal
+
+rloo_policy_data = namedtuple('rloo_policy_data', ['logit_new', 'logit_old', 'action', 'adv', 'weight'])
+
+
+def rloo_policy_error(
+        data: namedtuple,
+        clip_ratio: float = 0.2,
+) -> Tuple[namedtuple, namedtuple]:
+    """
+    .. note::
+        Each element of the input ``data`` is a group of response samples from the same prompt.
+    """
+    raise NotImplementedError