Commit 26a261b

pylint.

1 parent 0fcff38

File tree

  • python-package/xgboost/testing

1 file changed: +69, -70 lines

python-package/xgboost/testing/data.py  (+69, -70)

@@ -430,77 +430,76 @@ def rlencode(x: npt.NDArray[np.int32]) -> Tuple[npt.NDArray, npt.NDArray, npt.ND
     return indptr, lengths, values


-def simulate_clicks(cv_data: RelDataCV) -> ClickFold:
-    """Simulate click data using position biased model (PBM)."""
-
-    def init_rank_score(
-        X: sparse.csr_matrix,
-        y: npt.NDArray[np.int32],
-        qid: npt.NDArray[np.int32],
-        sample_rate: float = 0.01,
-    ) -> npt.NDArray[np.float32]:
-        """We use XGBoost to generate the initial score instead of SVMRank for
-        simplicity.
-
-        """
-        # random sample
-        _rng = np.random.default_rng(1994)
-        n_samples = int(X.shape[0] * sample_rate)
-        index = np.arange(0, X.shape[0], dtype=np.uint64)
-        _rng.shuffle(index)
-        index = index[:n_samples]
-
-        X_train = X[index]
-        y_train = y[index]
-        qid_train = qid[index]
-
-        # Sort training data based on query id, required by XGBoost.
-        sorted_idx = np.argsort(qid_train)
-        X_train = X_train[sorted_idx]
-        y_train = y_train[sorted_idx]
-        qid_train = qid_train[sorted_idx]
-
-        ltr = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
-        ltr.fit(X_train, y_train, qid=qid_train)
-
-        # Use the original order of the data.
-        scores = ltr.predict(X)
-        return scores
-
-    def simulate_one_fold(
-        fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
-        scores_fold: npt.NDArray[np.float32],
-    ) -> ClickFold:
-        """Simulate clicks for one fold."""
-        X_fold, y_fold, qid_fold = fold
-        assert qid_fold.dtype == np.int32
-
-        qids = np.unique(qid_fold)
-
-        position = np.empty((y_fold.size,), dtype=np.int64)
-        clicks = np.empty((y_fold.size,), dtype=np.int32)
-        pbm = PBM(eta=1.0)
-
-        # Avoid grouping by qid as we want to preserve the original data partition by
-        # the dataset authors.
-        for q in qids:
-            qid_mask = q == qid_fold
-            query_scores = scores_fold[qid_mask]
-            # Initial rank list, scores sorted to decreasing order
-            query_position = np.argsort(query_scores)[::-1]
-            position[qid_mask] = query_position
-            # get labels
-            relevance_degrees = y_fold[qid_mask]
-            query_clicks = pbm.sample_clicks_for_query(
-                relevance_degrees, query_position
-            )
-            clicks[qid_mask] = query_clicks
-
-        assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
-        assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)
-
-        return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)
+def init_rank_score(
+    X: sparse.csr_matrix,
+    y: npt.NDArray[np.int32],
+    qid: npt.NDArray[np.int32],
+    sample_rate: float = 0.01,
+) -> npt.NDArray[np.float32]:
+    """We use XGBoost to generate the initial score instead of SVMRank for
+    simplicity.

+    """
+    # random sample
+    rng = np.random.default_rng(1994)
+    n_samples = int(X.shape[0] * sample_rate)
+    index = np.arange(0, X.shape[0], dtype=np.uint64)
+    rng.shuffle(index)
+    index = index[:n_samples]
+
+    X_train = X[index]
+    y_train = y[index]
+    qid_train = qid[index]
+
+    # Sort training data based on query id, required by XGBoost.
+    sorted_idx = np.argsort(qid_train)
+    X_train = X_train[sorted_idx]
+    y_train = y_train[sorted_idx]
+    qid_train = qid_train[sorted_idx]
+
+    ltr = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
+    ltr.fit(X_train, y_train, qid=qid_train)
+
+    # Use the original order of the data.
+    scores = ltr.predict(X)
+    return scores
+
+
+def simulate_one_fold(
+    fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
+    scores_fold: npt.NDArray[np.float32],
+) -> ClickFold:
+    """Simulate clicks for one fold."""
+    X_fold, y_fold, qid_fold = fold
+    assert qid_fold.dtype == np.int32
+
+    qids = np.unique(qid_fold)
+
+    position = np.empty((y_fold.size,), dtype=np.int64)
+    clicks = np.empty((y_fold.size,), dtype=np.int32)
+    pbm = PBM(eta=1.0)
+
+    # Avoid grouping by qid as we want to preserve the original data partition by
+    # the dataset authors.
+    for q in qids:
+        qid_mask = q == qid_fold
+        query_scores = scores_fold[qid_mask]
+        # Initial rank list, scores sorted to decreasing order
+        query_position = np.argsort(query_scores)[::-1]
+        position[qid_mask] = query_position
+        # get labels
+        relevance_degrees = y_fold[qid_mask]
+        query_clicks = pbm.sample_clicks_for_query(relevance_degrees, query_position)
+        clicks[qid_mask] = query_clicks
+
+    assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
+    assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)
+
+    return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)
+
+
+def simulate_clicks(cv_data: RelDataCV) -> ClickFold:  # pylint: disable=too-many-locals
+    """Simulate click data using position biased model (PBM)."""
     X, y, qid = list(zip(cv_data.train, cv_data.test))

     indptr = np.array([0] + [v.shape[0] for v in X])
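
The change is purely structural: the helpers that used to be nested inside simulate_clicks are hoisted to module level, and simulate_clicks picks up a # pylint: disable=too-many-locals marker. Below is a minimal, hypothetical sketch of how the hoisted helpers could be exercised on a synthetic fold. It assumes the post-commit layout of xgboost.testing.data (with PBM and ClickFold defined in the same module) and that xgboost and scipy are installed; the synthetic data shapes and the sample_rate=1.0 choice are illustrative, not taken from the commit.

# Hypothetical usage sketch (not part of the commit): drive the hoisted helpers on a
# tiny synthetic learning-to-rank fold.
import numpy as np
from scipy import sparse

from xgboost.testing.data import init_rank_score, simulate_one_fold

rng = np.random.default_rng(2023)
n_samples, n_features, n_queries = 256, 16, 8

# Synthetic fold: sparse features, graded relevance labels, sorted int32 query ids.
X = sparse.csr_matrix(rng.random((n_samples, n_features)))
y = rng.integers(0, 4, size=n_samples).astype(np.int32)
qid = np.sort(rng.integers(0, n_queries, size=n_samples)).astype(np.int32)

# Initial ranking scores from a quick XGBRanker fit; sample_rate=1.0 keeps every row,
# which is safer than the 1% default on such a small synthetic set.
scores = init_rank_score(X, y, qid, sample_rate=1.0)

# Simulate position-biased clicks for this single fold; the result bundles the
# features, labels, qids, scores, clicks, and positions into a ClickFold.
fold = simulate_one_fold((X, y, qid), scores)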

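For readers unfamiliar with the model named in the docstring: a position biased model assumes a user clicks a result only if they examine it, and the probability of examination decays with the result's rank. The sketch below is a generic, self-contained illustration of that idea; it is not the PBM class used in data.py, whose internals are outside this diff. The 1 / (rank + 1) ** eta examination curve and the relevance-to-click mapping are assumptions chosen for illustration only.

# Illustrative position-biased click model (generic sketch, not xgboost's PBM class).
# Assumed model: P(click) = P(examine | rank) * P(click | examine, relevance), with
# P(examine | rank) = 1 / (rank + 1) ** eta and P(click | examine) = relevance / max_rel.
import numpy as np

_RNG = np.random.default_rng(1994)


def sample_pbm_clicks(relevance, positions, eta=1.0, max_rel=4):
    """Return a 0/1 click vector for one query under a position-biased model."""
    relevance = np.asarray(relevance, dtype=np.float64)
    positions = np.asarray(positions, dtype=np.float64)
    p_examine = 1.0 / np.power(positions + 1.0, eta)  # examination decays with rank
    p_click = p_examine * (relevance / max_rel)       # more relevant => more clicks
    return (_RNG.random(relevance.shape[0]) < p_click).astype(np.int32)


# Three documents for one query; the most relevant one is ranked first (position 0).
print(sample_pbm_clicks([3, 1, 0], [0, 1, 2], eta=1.0))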
0 commit comments