@@ -430,77 +430,76 @@ def rlencode(x: npt.NDArray[np.int32]) -> Tuple[npt.NDArray, npt.NDArray, npt.NDArray]:
    return indptr, lengths, values


-def simulate_clicks(cv_data: RelDataCV) -> ClickFold:
-    """Simulate click data using position biased model (PBM)."""
-
-    def init_rank_score(
-        X: sparse.csr_matrix,
-        y: npt.NDArray[np.int32],
-        qid: npt.NDArray[np.int32],
-        sample_rate: float = 0.01,
-    ) -> npt.NDArray[np.float32]:
-        """We use XGBoost to generate the initial score instead of SVMRank for
-        simplicity.
-
-        """
-        # random sample
-        _rng = np.random.default_rng(1994)
-        n_samples = int(X.shape[0] * sample_rate)
-        index = np.arange(0, X.shape[0], dtype=np.uint64)
-        _rng.shuffle(index)
-        index = index[:n_samples]
-
-        X_train = X[index]
-        y_train = y[index]
-        qid_train = qid[index]
-
-        # Sort training data based on query id, required by XGBoost.
-        sorted_idx = np.argsort(qid_train)
-        X_train = X_train[sorted_idx]
-        y_train = y_train[sorted_idx]
-        qid_train = qid_train[sorted_idx]
-
-        ltr = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
-        ltr.fit(X_train, y_train, qid=qid_train)
-
-        # Use the original order of the data.
-        scores = ltr.predict(X)
-        return scores
-
-    def simulate_one_fold(
-        fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
-        scores_fold: npt.NDArray[np.float32],
-    ) -> ClickFold:
-        """Simulate clicks for one fold."""
-        X_fold, y_fold, qid_fold = fold
-        assert qid_fold.dtype == np.int32
-
-        qids = np.unique(qid_fold)
-
-        position = np.empty((y_fold.size,), dtype=np.int64)
-        clicks = np.empty((y_fold.size,), dtype=np.int32)
-        pbm = PBM(eta=1.0)
-
-        # Avoid grouping by qid as we want to preserve the original data partition by
-        # the dataset authors.
-        for q in qids:
-            qid_mask = q == qid_fold
-            query_scores = scores_fold[qid_mask]
-            # Initial rank list, scores sorted to decreasing order
-            query_position = np.argsort(query_scores)[::-1]
-            position[qid_mask] = query_position
-            # get labels
-            relevance_degrees = y_fold[qid_mask]
-            query_clicks = pbm.sample_clicks_for_query(
-                relevance_degrees, query_position
-            )
-            clicks[qid_mask] = query_clicks
-
-        assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
-        assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)
-
-        return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)
-
+def init_rank_score(
+    X: sparse.csr_matrix,
+    y: npt.NDArray[np.int32],
+    qid: npt.NDArray[np.int32],
+    sample_rate: float = 0.01,
+) -> npt.NDArray[np.float32]:
+    """We use XGBoost to generate the initial score instead of SVMRank for
+    simplicity.
+
+    """
+    # random sample
+    rng = np.random.default_rng(1994)
+    n_samples = int(X.shape[0] * sample_rate)
+    index = np.arange(0, X.shape[0], dtype=np.uint64)
+    rng.shuffle(index)
+    index = index[:n_samples]
+
+    X_train = X[index]
+    y_train = y[index]
+    qid_train = qid[index]
+
+    # Sort training data based on query id, required by XGBoost.
+    sorted_idx = np.argsort(qid_train)
+    X_train = X_train[sorted_idx]
+    y_train = y_train[sorted_idx]
+    qid_train = qid_train[sorted_idx]
+
+    ltr = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
+    ltr.fit(X_train, y_train, qid=qid_train)
+
+    # Use the original order of the data.
+    scores = ltr.predict(X)
+    return scores
+
+
+def simulate_one_fold(
+    fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
+    scores_fold: npt.NDArray[np.float32],
+) -> ClickFold:
+    """Simulate clicks for one fold."""
+    X_fold, y_fold, qid_fold = fold
+    assert qid_fold.dtype == np.int32
+
+    qids = np.unique(qid_fold)
+
+    position = np.empty((y_fold.size,), dtype=np.int64)
+    clicks = np.empty((y_fold.size,), dtype=np.int32)
+    pbm = PBM(eta=1.0)
+
+    # Avoid grouping by qid as we want to preserve the original data partition by
+    # the dataset authors.
+    for q in qids:
+        qid_mask = q == qid_fold
+        query_scores = scores_fold[qid_mask]
+        # Initial rank list, scores sorted to decreasing order
+        query_position = np.argsort(query_scores)[::-1]
+        position[qid_mask] = query_position
+        # get labels
+        relevance_degrees = y_fold[qid_mask]
+        query_clicks = pbm.sample_clicks_for_query(relevance_degrees, query_position)
+        clicks[qid_mask] = query_clicks
+
+    assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
+    assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)
+
+    return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)
+
+
+def simulate_clicks(cv_data: RelDataCV) -> ClickFold:  # pylint: disable=too-many-locals
+    """Simulate click data using position biased model (PBM)."""
    X, y, qid = list(zip(cv_data.train, cv_data.test))

    indptr = np.array([0] + [v.shape[0] for v in X])
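Note: the `PBM` class used by `simulate_one_fold` is defined elsewhere in this file and is not part of the hunk above. As a rough, non-authoritative sketch of what a position biased click model with this interface could look like (the `PBMSketch` name, the `1 / (rank + 1) ** eta` examination probability, and the relevance-to-attractiveness mapping are all assumptions for illustration, not the project's implementation):

```python
# Hypothetical sketch only -- the repository's actual PBM class may differ.
import numpy as np
import numpy.typing as npt


class PBMSketch:
    """Position biased model: P(click) = P(examined | rank) * P(attractive | label)."""

    def __init__(self, eta: float = 1.0, seed: int = 1994) -> None:
        self.eta = eta  # Assumed: larger eta makes examination decay faster with rank.
        self._rng = np.random.default_rng(seed)

    def sample_clicks_for_query(
        self,
        relevance_degrees: npt.NDArray[np.int32],
        positions: npt.NDArray[np.int64],
    ) -> npt.NDArray[np.int32]:
        # Assumed examination probability: 1 / (rank + 1) ** eta, with rank starting at 0.
        p_examine = 1.0 / np.power(positions.astype(np.float64) + 1.0, self.eta)
        # Assumed attractiveness: map graded relevance (0..4) into [0.1, 1.0].
        p_attract = 0.1 + 0.9 * relevance_degrees.astype(np.float64) / 4.0
        p_click = p_examine * p_attract
        # One Bernoulli draw per document: 1 = clicked, 0 = not clicked.
        return (self._rng.uniform(size=p_click.shape) < p_click).astype(np.int32)
```

The only contract `simulate_one_fold` relies on is that `sample_clicks_for_query(relevance_degrees, positions)` returns a 0/1 click array of the same length as its inputs.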
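For context, a hedged sketch of how the simulated clicks might be consumed downstream (this helper is hypothetical and not part of the commit): the click column stands in for the editorial labels when fitting a ranker, and the rows must again be sorted so that each query's documents are contiguous, just as `init_rank_score` does before calling `fit`.

```python
# Hypothetical helper, not part of this commit: fit a ranker on simulated clicks
# instead of the editorial relevance labels.
import numpy as np
import numpy.typing as npt
import xgboost
from scipy import sparse


def train_on_clicks(
    X: sparse.csr_matrix,
    clicks: npt.NDArray[np.int32],
    qid: npt.NDArray[np.int32],
) -> xgboost.XGBRanker:
    # XGBoost requires query groups to be contiguous, so sort by query id first.
    sorted_idx = np.argsort(qid)
    ranker = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
    ranker.fit(X[sorted_idx], clicks[sorted_idx], qid=qid[sorted_idx])
    return ranker
```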