@@ -430,77 +430,76 @@ def rlencode(x: npt.NDArray[np.int32]) -> Tuple[npt.NDArray, npt.NDArray, npt.ND
430430 return indptr , lengths , values
431431
432432
433- def simulate_clicks (cv_data : RelDataCV ) -> ClickFold :
434- """Simulate click data using position biased model (PBM)."""
435-
436- def init_rank_score (
437- X : sparse .csr_matrix ,
438- y : npt .NDArray [np .int32 ],
439- qid : npt .NDArray [np .int32 ],
440- sample_rate : float = 0.01 ,
441- ) -> npt .NDArray [np .float32 ]:
442- """We use XGBoost to generate the initial score instead of SVMRank for
443- simplicity.
444-
445- """
446- # random sample
447- _rng = np .random .default_rng (1994 )
448- n_samples = int (X .shape [0 ] * sample_rate )
449- index = np .arange (0 , X .shape [0 ], dtype = np .uint64 )
450- _rng .shuffle (index )
451- index = index [:n_samples ]
452-
453- X_train = X [index ]
454- y_train = y [index ]
455- qid_train = qid [index ]
456-
457- # Sort training data based on query id, required by XGBoost.
458- sorted_idx = np .argsort (qid_train )
459- X_train = X_train [sorted_idx ]
460- y_train = y_train [sorted_idx ]
461- qid_train = qid_train [sorted_idx ]
462-
463- ltr = xgboost .XGBRanker (objective = "rank:ndcg" , tree_method = "hist" )
464- ltr .fit (X_train , y_train , qid = qid_train )
465-
466- # Use the original order of the data.
467- scores = ltr .predict (X )
468- return scores
469-
470- def simulate_one_fold (
471- fold : Tuple [sparse .csr_matrix , npt .NDArray [np .int32 ], npt .NDArray [np .int32 ]],
472- scores_fold : npt .NDArray [np .float32 ],
473- ) -> ClickFold :
474- """Simulate clicks for one fold."""
475- X_fold , y_fold , qid_fold = fold
476- assert qid_fold .dtype == np .int32
477-
478- qids = np .unique (qid_fold )
479-
480- position = np .empty ((y_fold .size ,), dtype = np .int64 )
481- clicks = np .empty ((y_fold .size ,), dtype = np .int32 )
482- pbm = PBM (eta = 1.0 )
483-
484- # Avoid grouping by qid as we want to preserve the original data partition by
485- # the dataset authors.
486- for q in qids :
487- qid_mask = q == qid_fold
488- query_scores = scores_fold [qid_mask ]
489- # Initial rank list, scores sorted to decreasing order
490- query_position = np .argsort (query_scores )[::- 1 ]
491- position [qid_mask ] = query_position
492- # get labels
493- relevance_degrees = y_fold [qid_mask ]
494- query_clicks = pbm .sample_clicks_for_query (
495- relevance_degrees , query_position
496- )
497- clicks [qid_mask ] = query_clicks
498-
499- assert X_fold .shape [0 ] == qid_fold .shape [0 ], (X_fold .shape , qid_fold .shape )
500- assert X_fold .shape [0 ] == clicks .shape [0 ], (X_fold .shape , clicks .shape )
501-
502- return ClickFold (X_fold , y_fold , qid_fold , scores_fold , clicks , position )
def init_rank_score(
    X: sparse.csr_matrix,
    y: npt.NDArray[np.int32],
    qid: npt.NDArray[np.int32],
    sample_rate: float = 0.01,
) -> npt.NDArray[np.float32]:
    """Generate initial relevance scores with XGBoost.

    We use XGBoost to generate the initial score instead of SVMRank for
    simplicity: a small random subsample of the rows is used to fit a
    ranker, which then scores the full dataset in its original order.
    """
    # Draw a fixed-seed random subsample of the rows.
    rng = np.random.default_rng(1994)
    n_rows = X.shape[0]
    subsample = np.arange(0, n_rows, dtype=np.uint64)
    rng.shuffle(subsample)
    subsample = subsample[: int(n_rows * sample_rate)]

    X_sub = X[subsample]
    y_sub = y[subsample]
    qid_sub = qid[subsample]

    # XGBoost requires training rows to be grouped (sorted) by query id.
    order = np.argsort(qid_sub)
    X_sub = X_sub[order]
    y_sub = y_sub[order]
    qid_sub = qid_sub[order]

    ranker = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
    ranker.fit(X_sub, y_sub, qid=qid_sub)

    # Score every row, preserving the original order of the data.
    return ranker.predict(X)
466+
467+
def simulate_one_fold(
    fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
    scores_fold: npt.NDArray[np.float32],
) -> ClickFold:
    """Simulate clicks for one fold using the position-biased model (PBM)."""
    X_fold, y_fold, qid_fold = fold
    assert qid_fold.dtype == np.int32

    n_rows = y_fold.size
    position = np.empty((n_rows,), dtype=np.int64)
    clicks = np.empty((n_rows,), dtype=np.int32)
    pbm = PBM(eta=1.0)

    # Iterate per query instead of regrouping by qid so that the original
    # data partition chosen by the dataset authors is preserved.
    for query in np.unique(qid_fold):
        mask = query == qid_fold
        # Initial rank list: document indices sorted by decreasing score.
        ranked = np.argsort(scores_fold[mask])[::-1]
        position[mask] = ranked
        # Sample clicks from the PBM given the relevance labels and ranks.
        clicks[mask] = pbm.sample_clicks_for_query(y_fold[mask], ranked)

    assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
    assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)

    return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)
499+
500+
501+ def simulate_clicks (cv_data : RelDataCV ) -> ClickFold : # pylint: disable=too-many-locals
502+ """Simulate click data using position biased model (PBM)."""
504503 X , y , qid = list (zip (cv_data .train , cv_data .test ))
505504
506505 indptr = np .array ([0 ] + [v .shape [0 ] for v in X ])
0 commit comments