Commit a16d53b

Merge branch 'master' into docs/install-py

2 parents bb39ff5 + e61bcbe

12 files changed: +233 -1215 lines

Diff for: .ci/conda-envs/ci-core-py38.txt

+4
@@ -39,6 +39,10 @@ pytest=8.2.*
 # pinned here to help speed up solves
 bokeh=3.1.*
 fsspec=2024.5.*
+# pinning 'libabseil' and 'libre2' to specific build numbers for pyarrow compatibility:
+# ref: https://github.com/microsoft/LightGBM/issues/6772
+libabseil=20240722.0=*_1
+libre2-11=2024.07.02=*_1
 msgpack-python=1.0.*
 pluggy=1.5.*
 pyparsing=3.1.4

Diff for: .github/workflows/lock.yml

+4-2
@@ -2,8 +2,10 @@ name: 'Lock Inactive Threads'
 
 on:
   schedule:
-    # midnight UTC, every Wednesday
+    # midnight UTC, every Wednesday, for Issues
     - cron: '0 0 * * 3'
+    # midnight UTC, every Thursday, for PRs
+    - cron: '0 0 * * 4'
   # allow manual triggering from GitHub UI
   workflow_dispatch:
 
@@ -42,4 +44,4 @@ jobs:
           # what should the locking status be?
           issue-lock-reason: 'resolved'
           pr-lock-reason: 'resolved'
-          process-only: 'issues, prs'
+          process-only: ${{ github.event.schedule == '0 0 * * 3' && 'issues' || 'prs' }}
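The new process-only value is just a conditional on which cron schedule fired: the Wednesday run locks Issues, the Thursday run locks PRs. A minimal Python sketch of what the GitHub Actions "&& ... || ..." expression evaluates to (illustration only, not part of the workflow):

def process_only(schedule: str) -> str:
    # mirrors: ${{ github.event.schedule == '0 0 * * 3' && 'issues' || 'prs' }}
    return "issues" if schedule == "0 0 * * 3" else "prs"

assert process_only("0 0 * * 3") == "issues"  # Wednesday run
assert process_only("0 0 * * 4") == "prs"     # Thursday run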

Diff for: .pre-commit-config.yaml

+5
@@ -17,6 +17,11 @@ repos:
     hooks:
       - id: end-of-file-fixer
       - id: trailing-whitespace
+  - repo: https://github.com/adrienverge/yamllint
+    rev: v1.35.1
+    hooks:
+      - id: yamllint
+        args: ["--strict"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
     rev: v0.8.3

Diff for: .yamllint.yml

+14
@@ -0,0 +1,14 @@
+# default config: https://yamllint.readthedocs.io/en/stable/configuration.html#default-configuration
+extends: default
+
+rules:
+  document-start: disable
+  line-length:
+    max: 999  # temporarily increase allowed line length
+  truthy:
+    # prevent treating GitHub Workflow "on" key as boolean value
+    check-keys: false
+
+  # temporarily disabled rules
+  indentation: disable
+  comments-indentation: disable

Diff for: CMakeLists.txt

-56
@@ -252,54 +252,6 @@ if(USE_CUDA)
         set(CMAKE_CUDA_STANDARD 11)
         set(CMAKE_CUDA_STANDARD_REQUIRED ON)
     endif()
-
-    set(
-        BASE_DEFINES
-        -DPOWER_FEATURE_WORKGROUPS=12
-        -DUSE_CONSTANT_BUF=0
-    )
-    set(
-        ALLFEATS_DEFINES
-        ${BASE_DEFINES}
-        -DENABLE_ALL_FEATURES
-    )
-    set(
-        FULLDATA_DEFINES
-        ${ALLFEATS_DEFINES}
-        -DIGNORE_INDICES
-    )
-
-    message(STATUS "ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}")
-    message(STATUS "FULLDATA_DEFINES: ${FULLDATA_DEFINES}")
-
-    function(add_histogram hsize hname hadd hconst hdir)
-        add_library(histo${hsize}${hname} OBJECT src/treelearner/kernels/histogram${hsize}.cu)
-        set_target_properties(
-            histo${hsize}${hname}
-            PROPERTIES
-            CUDA_SEPARABLE_COMPILATION ON
-            CUDA_ARCHITECTURES ${CUDA_ARCHS}
-        )
-        if(hadd)
-            list(APPEND histograms histo${hsize}${hname})
-            set(histograms ${histograms} PARENT_SCOPE)
-        endif()
-        target_compile_definitions(
-            histo${hsize}${hname}
-            PRIVATE
-            -DCONST_HESSIAN=${hconst}
-            ${hdir}
-        )
-    endfunction()
-
-    foreach(hsize _16_64_256)
-        add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}")
-        add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}")
-        add_histogram("${hsize}" "-allfeats_sp_const" "False" "1" "${ALLFEATS_DEFINES}")
-        add_histogram("${hsize}" "-allfeats_sp" "False" "0" "${ALLFEATS_DEFINES}")
-        add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}")
-        add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}")
-    endforeach()
 endif()
 
 include(CheckCXXSourceCompiles)

@@ -634,14 +586,6 @@ if(USE_CUDA)
             CUDA_RESOLVE_DEVICE_SYMBOLS ON
         )
     endif()
-
-    # histograms are list of object libraries. Linking object library to other
-    # object libraries only gets usage requirements, the linked objects won't be
-    # used. Thus we have to call target_link_libraries on final targets here.
-    if(BUILD_CLI)
-        target_link_libraries(lightgbm PRIVATE ${histograms})
-    endif()
-    target_link_libraries(_lightgbm PRIVATE ${histograms})
 endif()
 
 if(WIN32)

Diff for: python-package/lightgbm/basic.py

+3-2
@@ -1248,7 +1248,7 @@ def predict(
         if pred_leaf:
             preds = preds.astype(np.int32)
         is_sparse = isinstance(preds, (list, scipy.sparse.spmatrix))
-        if not is_sparse and preds.size != nrow:
+        if not is_sparse and (preds.size != nrow or pred_leaf or pred_contrib):
             if preds.size % nrow == 0:
                 preds = preds.reshape(nrow, -1)
             else:

@@ -2126,6 +2126,8 @@ def _lazy_init(
                 categorical_feature=categorical_feature,
                 pandas_categorical=self.pandas_categorical,
             )
+        elif _is_pyarrow_table(data) and feature_name == "auto":
+            feature_name = data.column_names
 
         # process for args
         params = {} if params is None else params

@@ -2185,7 +2187,6 @@ def _lazy_init(
             self.__init_from_np2d(data, params_str, ref_dataset)
         elif _is_pyarrow_table(data):
             self.__init_from_pyarrow_table(data, params_str, ref_dataset)
-            feature_name = data.column_names
         elif isinstance(data, list) and len(data) > 0:
             if _is_list_of_numpy_arrays(data):
                 self.__init_from_list_np2d(data, params_str, ref_dataset)
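The net effect of the _lazy_init hunks is that feature names for pyarrow tables are now resolved before the Dataset is constructed, instead of being overwritten afterwards. A standalone sketch of where those names come from (assumes pyarrow is installed; the table below is made up for illustration):

import pyarrow as pa

table = pa.table({"feat_1": [1.0, 2.0, 3.0], "feat_2": [0.1, 0.2, 0.3]})

feature_name = "auto"
if feature_name == "auto":
    # pyarrow.Table.column_names returns the column names as a list of str
    feature_name = table.column_names
print(feature_name)  # ['feat_1', 'feat_2']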

Diff for: python-package/lightgbm/callback.py

+38-34
@@ -71,6 +71,14 @@ class CallbackEnv:
     evaluation_result_list: Optional[_ListOfEvalResultTuples]
 
 
+def _is_using_cv(env: CallbackEnv) -> bool:
+    """Check if model in callback env is a CVBooster."""
+    # this import is here to avoid a circular import
+    from .engine import CVBooster
+
+    return isinstance(env.model, CVBooster)
+
+
 def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str:
     """Format metric string."""
     dataset_name, metric_name, metric_value, *_ = value

@@ -143,16 +151,13 @@ def _init(self, env: CallbackEnv) -> None:
             )
         self.eval_result.clear()
         for item in env.evaluation_result_list:
-            if len(item) == 4:  # regular train
-                data_name, eval_name = item[:2]
-            else:  # cv
-                data_name, eval_name = item[1].split()
-            self.eval_result.setdefault(data_name, OrderedDict())
+            dataset_name, metric_name, *_ = item
+            self.eval_result.setdefault(dataset_name, OrderedDict())
             if len(item) == 4:
-                self.eval_result[data_name].setdefault(eval_name, [])
+                self.eval_result[dataset_name].setdefault(metric_name, [])
             else:
-                self.eval_result[data_name].setdefault(f"{eval_name}-mean", [])
-                self.eval_result[data_name].setdefault(f"{eval_name}-stdv", [])
+                self.eval_result[dataset_name].setdefault(f"{metric_name}-mean", [])
+                self.eval_result[dataset_name].setdefault(f"{metric_name}-stdv", [])
 
     def __call__(self, env: CallbackEnv) -> None:
         if env.iteration == env.begin_iteration:

@@ -163,15 +168,16 @@ def __call__(self, env: CallbackEnv) -> None:
                 "Please report it at https://github.com/microsoft/LightGBM/issues"
             )
         for item in env.evaluation_result_list:
+            # for cv(), 'metric_value' is actually a mean of metric values over all CV folds
+            dataset_name, metric_name, metric_value, *_ = item
             if len(item) == 4:
-                data_name, eval_name, result = item[:3]
-                self.eval_result[data_name][eval_name].append(result)
+                # train()
+                self.eval_result[dataset_name][metric_name].append(metric_value)
             else:
-                data_name, eval_name = item[1].split()
-                res_mean = item[2]
-                res_stdv = item[4]  # type: ignore[misc]
-                self.eval_result[data_name][f"{eval_name}-mean"].append(res_mean)
-                self.eval_result[data_name][f"{eval_name}-stdv"].append(res_stdv)
+                # cv()
+                metric_std_dev = item[4]  # type: ignore[misc]
+                self.eval_result[dataset_name][f"{metric_name}-mean"].append(metric_value)
+                self.eval_result[dataset_name][f"{metric_name}-stdv"].append(metric_std_dev)
 
 
 def record_evaluation(eval_result: Dict[str, Dict[str, List[Any]]]) -> Callable:

@@ -304,15 +310,15 @@ def _gt_delta(self, curr_score: float, best_score: float, delta: float) -> bool:
     def _lt_delta(self, curr_score: float, best_score: float, delta: float) -> bool:
         return curr_score < best_score - delta
 
-    def _is_train_set(self, ds_name: str, eval_name: str, env: CallbackEnv) -> bool:
+    def _is_train_set(self, dataset_name: str, env: CallbackEnv) -> bool:
         """Check, by name, if a given Dataset is the training data."""
         # for lgb.cv() with eval_train_metric=True, evaluation is also done on the training set
         # and those metrics are considered for early stopping
-        if ds_name == "cv_agg" and eval_name == "train":
+        if _is_using_cv(env) and dataset_name == "train":
            return True
 
         # for lgb.train(), it's possible to pass the training data via valid_sets with any eval_name
-        if isinstance(env.model, Booster) and ds_name == env.model._train_data_name:
+        if isinstance(env.model, Booster) and dataset_name == env.model._train_data_name:
             return True
 
         return False

@@ -327,11 +333,13 @@ def _init(self, env: CallbackEnv) -> None:
             _log_warning("Early stopping is not available in dart mode")
             return
 
+        # get details of the first dataset
+        first_dataset_name, first_metric_name, *_ = env.evaluation_result_list[0]
+
         # validation sets are guaranteed to not be identical to the training data in cv()
         if isinstance(env.model, Booster):
             only_train_set = len(env.evaluation_result_list) == 1 and self._is_train_set(
-                ds_name=env.evaluation_result_list[0][0],
-                eval_name=env.evaluation_result_list[0][1].split(" ")[0],
+                dataset_name=first_dataset_name,
                 env=env,
             )
             if only_train_set:

@@ -370,8 +378,7 @@ def _init(self, env: CallbackEnv) -> None:
                 _log_info(f"Using {self.min_delta} as min_delta for all metrics.")
             deltas = [self.min_delta] * n_datasets * n_metrics
 
-        # split is needed for "<dataset type> <metric>" case (e.g. "train l1")
-        self.first_metric = env.evaluation_result_list[0][1].split(" ")[-1]
+        self.first_metric = first_metric_name
         for eval_ret, delta in zip(env.evaluation_result_list, deltas):
             self.best_iter.append(0)
             if eval_ret[3]:  # greater is better

@@ -381,15 +388,15 @@ def _init(self, env: CallbackEnv) -> None:
                 self.best_score.append(float("inf"))
                 self.cmp_op.append(partial(self._lt_delta, delta=delta))
 
-    def _final_iteration_check(self, env: CallbackEnv, eval_name_splitted: List[str], i: int) -> None:
+    def _final_iteration_check(self, *, env: CallbackEnv, metric_name: str, i: int) -> None:
         if env.iteration == env.end_iteration - 1:
             if self.verbose:
                 best_score_str = "\t".join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]])
                 _log_info(
                     "Did not meet early stopping. " f"Best iteration is:\n[{self.best_iter[i] + 1}]\t{best_score_str}"
                 )
                 if self.first_metric_only:
-                    _log_info(f"Evaluated only: {eval_name_splitted[-1]}")
+                    _log_info(f"Evaluated only: {metric_name}")
             raise EarlyStopException(self.best_iter[i], self.best_score_list[i])
 
     def __call__(self, env: CallbackEnv) -> None:

@@ -405,21 +412,18 @@ def __call__(self, env: CallbackEnv) -> None:
         # self.best_score_list is initialized to an empty list
         first_time_updating_best_score_list = self.best_score_list == []
         for i in range(len(env.evaluation_result_list)):
-            score = env.evaluation_result_list[i][2]
-            if first_time_updating_best_score_list or self.cmp_op[i](score, self.best_score[i]):
-                self.best_score[i] = score
+            dataset_name, metric_name, metric_value, *_ = env.evaluation_result_list[i]
+            if first_time_updating_best_score_list or self.cmp_op[i](metric_value, self.best_score[i]):
+                self.best_score[i] = metric_value
                 self.best_iter[i] = env.iteration
                 if first_time_updating_best_score_list:
                     self.best_score_list.append(env.evaluation_result_list)
                 else:
                     self.best_score_list[i] = env.evaluation_result_list
-            # split is needed for "<dataset type> <metric>" case (e.g. "train l1")
-            eval_name_splitted = env.evaluation_result_list[i][1].split(" ")
-            if self.first_metric_only and self.first_metric != eval_name_splitted[-1]:
+            if self.first_metric_only and self.first_metric != metric_name:
                 continue  # use only the first metric for early stopping
             if self._is_train_set(
-                ds_name=env.evaluation_result_list[i][0],
-                eval_name=eval_name_splitted[0],
+                dataset_name=dataset_name,
                 env=env,
             ):
                 continue  # train data for lgb.cv or sklearn wrapper (underlying lgb.train)

@@ -430,9 +434,9 @@ def __call__(self, env: CallbackEnv) -> None:
                 )
                 _log_info(f"Early stopping, best iteration is:\n[{self.best_iter[i] + 1}]\t{eval_result_str}")
                 if self.first_metric_only:
-                    _log_info(f"Evaluated only: {eval_name_splitted[-1]}")
+                    _log_info(f"Evaluated only: {metric_name}")
                 raise EarlyStopException(self.best_iter[i], self.best_score_list[i])
-            self._final_iteration_check(env, eval_name_splitted, i)
+            self._final_iteration_check(env=env, metric_name=metric_name, i=i)
 
 
 def _should_enable_early_stopping(stopping_rounds: Any) -> bool:
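The recurring pattern in these hunks is tuple unpacking with a star target, which works for both tuple shapes the callbacks receive: train() produces 4-element (dataset, metric, value, is_higher_better) tuples, while cv() produces 5-element tuples whose value is already the mean over folds and whose fifth element is the standard deviation. A minimal sketch (the names and values below are made up for illustration):

train_item = ("valid_0", "l2", 0.25, False)    # from train(): 4 elements
cv_item = ("valid", "l2", 0.25, False, 0.01)   # from cv(): mean + std dev

for item in (train_item, cv_item):
    dataset_name, metric_name, metric_value, *_ = item
    print(dataset_name, metric_name, metric_value)
    if len(item) == 5:
        metric_std_dev = item[4]
        print("stdv:", metric_std_dev)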

Diff for: python-package/lightgbm/engine.py

+27-11
@@ -581,15 +581,31 @@ def _agg_cv_result(
     raw_results: List[List[_LGBM_BoosterEvalMethodResultType]],
 ) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]:
     """Aggregate cross-validation results."""
-    cvmap: Dict[str, List[float]] = OrderedDict()
-    metric_type: Dict[str, bool] = {}
+    # build up 2 maps, of the form:
+    #
+    #   OrderedDict{
+    #     (<dataset_name>, <metric_name>): <is_higher_better>
+    #   }
+    #
+    #   OrderedDict{
+    #     (<dataset_name>, <metric_name>): list[<metric_value>]
+    #   }
+    #
+    metric_types: Dict[Tuple[str, str], bool] = OrderedDict()
+    metric_values: Dict[Tuple[str, str], List[float]] = OrderedDict()
     for one_result in raw_results:
-        for one_line in one_result:
-            key = f"{one_line[0]} {one_line[1]}"
-            metric_type[key] = one_line[3]
-            cvmap.setdefault(key, [])
-            cvmap[key].append(one_line[2])
-    return [("cv_agg", k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()]
+        for dataset_name, metric_name, metric_value, is_higher_better in one_result:
+            key = (dataset_name, metric_name)
+            metric_types[key] = is_higher_better
+            metric_values.setdefault(key, [])
+            metric_values[key].append(metric_value)
+
+    # turn that into a list of tuples of the form:
+    #
+    #   [
+    #     (<dataset_name>, <metric_name>, mean(<values>), <is_higher_better>, std_dev(<values>))
+    #   ]
+    return [(k[0], k[1], float(np.mean(v)), metric_types[k], float(np.std(v))) for k, v in metric_values.items()]
 
 
 def cv(

@@ -812,9 +828,9 @@ def cv(
         )
         cvbooster.update(fobj=fobj)  # type: ignore[call-arg]
         res = _agg_cv_result(cvbooster.eval_valid(feval))  # type: ignore[call-arg]
-        for _, key, mean, _, std in res:
-            results[f"{key}-mean"].append(mean)
-            results[f"{key}-stdv"].append(std)
+        for dataset_name, metric_name, metric_mean, _, metric_std_dev in res:
+            results[f"{dataset_name} {metric_name}-mean"].append(metric_mean)
+            results[f"{dataset_name} {metric_name}-stdv"].append(metric_std_dev)
         try:
             for cb in callbacks_after_iter:
                 cb(
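To see the aggregation end to end, here is a self-contained sketch of what the rewritten _agg_cv_result computes: per-fold (dataset, metric, value, is_higher_better) tuples are grouped by (dataset_name, metric_name) and reduced to a mean and a standard deviation. The fold values below are made up for illustration.

import numpy as np

raw_results = [
    [("valid", "l2", 0.30, False)],  # fold 1
    [("valid", "l2", 0.20, False)],  # fold 2
]

metric_types = {}
metric_values = {}
for one_result in raw_results:
    for dataset_name, metric_name, metric_value, is_higher_better in one_result:
        key = (dataset_name, metric_name)
        metric_types[key] = is_higher_better
        metric_values.setdefault(key, []).append(metric_value)

agg = [(k[0], k[1], float(np.mean(v)), metric_types[k], float(np.std(v))) for k, v in metric_values.items()]
print(agg)  # [('valid', 'l2', 0.25, False, 0.05)]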
