@@ -430,77 +430,76 @@ def rlencode(x: npt.NDArray[np.int32]) -> Tuple[npt.NDArray, npt.NDArray, npt.ND
430430 return indptr , lengths , values
431431
432432
433- def simulate_clicks (cv_data : RelDataCV ) -> ClickFold :
434- """Simulate click data using position biased model (PBM)."""
435-
436- def init_rank_score (
437- X : sparse .csr_matrix ,
438- y : npt .NDArray [np .int32 ],
439- qid : npt .NDArray [np .int32 ],
440- sample_rate : float = 0.01 ,
441- ) -> npt .NDArray [np .float32 ]:
442- """We use XGBoost to generate the initial score instead of SVMRank for
443- simplicity.
444-
445- """
446- # random sample
447- _rng = np .random .default_rng (1994 )
448- n_samples = int (X .shape [0 ] * sample_rate )
449- index = np .arange (0 , X .shape [0 ], dtype = np .uint64 )
450- _rng .shuffle (index )
451- index = index [:n_samples ]
452-
453- X_train = X [index ]
454- y_train = y [index ]
455- qid_train = qid [index ]
456-
457- # Sort training data based on query id, required by XGBoost.
458- sorted_idx = np .argsort (qid_train )
459- X_train = X_train [sorted_idx ]
460- y_train = y_train [sorted_idx ]
461- qid_train = qid_train [sorted_idx ]
462-
463- ltr = xgboost .XGBRanker (objective = "rank:ndcg" , tree_method = "hist" )
464- ltr .fit (X_train , y_train , qid = qid_train )
465-
466- # Use the original order of the data.
467- scores = ltr .predict (X )
468- return scores
469-
470- def simulate_one_fold (
471- fold : Tuple [sparse .csr_matrix , npt .NDArray [np .int32 ], npt .NDArray [np .int32 ]],
472- scores_fold : npt .NDArray [np .float32 ],
473- ) -> ClickFold :
474- """Simulate clicks for one fold."""
475- X_fold , y_fold , qid_fold = fold
476- assert qid_fold .dtype == np .int32
477-
478- qids = np .unique (qid_fold )
479-
480- position = np .empty ((y_fold .size ,), dtype = np .int64 )
481- clicks = np .empty ((y_fold .size ,), dtype = np .int32 )
482- pbm = PBM (eta = 1.0 )
483-
484- # Avoid grouping by qid as we want to preserve the original data partition by
485- # the dataset authors.
486- for q in qids :
487- qid_mask = q == qid_fold
488- query_scores = scores_fold [qid_mask ]
489- # Initial rank list, scores sorted to decreasing order
490- query_position = np .argsort (query_scores )[::- 1 ]
491- position [qid_mask ] = query_position
492- # get labels
493- relevance_degrees = y_fold [qid_mask ]
494- query_clicks = pbm .sample_clicks_for_query (
495- relevance_degrees , query_position
496- )
497- clicks [qid_mask ] = query_clicks
498-
499- assert X_fold .shape [0 ] == qid_fold .shape [0 ], (X_fold .shape , qid_fold .shape )
500- assert X_fold .shape [0 ] == clicks .shape [0 ], (X_fold .shape , clicks .shape )
501-
502- return ClickFold (X_fold , y_fold , qid_fold , scores_fold , clicks , position )
def init_rank_score(
    X: sparse.csr_matrix,
    y: npt.NDArray[np.int32],
    qid: npt.NDArray[np.int32],
    sample_rate: float = 0.01,
) -> npt.NDArray[np.float32]:
    """Generate initial relevance scores with XGBoost.

    We use XGBoost to generate the initial score instead of SVMRank for
    simplicity: a small random subsample of the rows is used to fit a
    ranker, which then scores the full dataset in its original order.
    """
    # Draw a fixed-seed random subsample of the rows.
    rng = np.random.default_rng(1994)
    n_rows = X.shape[0]
    subsample = np.arange(0, n_rows, dtype=np.uint64)
    rng.shuffle(subsample)
    subsample = subsample[: int(n_rows * sample_rate)]

    X_sub = X[subsample]
    y_sub = y[subsample]
    qid_sub = qid[subsample]

    # XGBoost requires training rows to be grouped (sorted) by query id.
    order = np.argsort(qid_sub)
    X_sub = X_sub[order]
    y_sub = y_sub[order]
    qid_sub = qid_sub[order]

    ranker = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
    ranker.fit(X_sub, y_sub, qid=qid_sub)

    # Score every row, preserving the original order of the data.
    return ranker.predict(X)
466+
467+
def simulate_one_fold(
    fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
    scores_fold: npt.NDArray[np.float32],
) -> ClickFold:
    """Simulate clicks for one fold using the position-biased model (PBM)."""
    X_fold, y_fold, qid_fold = fold
    assert qid_fold.dtype == np.int32

    n_rows = y_fold.size
    position = np.empty((n_rows,), dtype=np.int64)
    clicks = np.empty((n_rows,), dtype=np.int32)
    pbm = PBM(eta=1.0)

    # Iterate per query instead of regrouping by qid so that the original
    # data partition chosen by the dataset authors is preserved.
    for query in np.unique(qid_fold):
        mask = query == qid_fold
        # Initial rank list: document indices sorted by decreasing score.
        ranked = np.argsort(scores_fold[mask])[::-1]
        position[mask] = ranked
        # Sample clicks from the PBM given the relevance labels and ranks.
        clicks[mask] = pbm.sample_clicks_for_query(y_fold[mask], ranked)

    assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
    assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)

    return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)
499+
500+
501+ def simulate_clicks (cv_data : RelDataCV ) -> ClickFold : # pylint: disable=too-many-locals
502+ """Simulate click data using position biased model (PBM)."""
504503 X , y , qid = list (zip (cv_data .train , cv_data .test ))
505504
506505 indptr = np .array ([0 ] + [v .shape [0 ] for v in X ])
0 commit comments