# PAYN configuration file (config.yaml, repository SPP2363/PAYN)
# General settings
general:
  # User or team name
  user: "FBS&JSP"
  experiment_id: "001"
  random_seed: 42
  verbose: 200
  # Options: "training" or "inference"
  usage_mode: "training"

# Input data location and column layout
dataset:
  file_path: "./datasets/Dreher_and_Doyle_input_data.xlsx"
  sheet_name: "FullCV_01"
  # List specific feature columns if needed (empty = auto-detect)
  input_columns:
    - "Ligand"
    - "Additive"
    - "Base"
    - "Aryl halide"
  # Values to treat as missing/null (example entry: 'ValueNotFound')
  absence_flag: []
  # Name of the yield column (e.g., "Output", "Yield")
  target_column: "Output"
  # Max yield value (100 for percentage, 1.0 for fraction)
  yield_limit: 100
  # Threshold to binarize yield (0.2 = 20%)
  yield_classification_threshold: 0.2

# Featurisation settings
featurisation:
  method: "ecfp"
  ECFP_bit_length: 2048
  ECFP_radius: 2
  condense_bits: true                     # Boolean flag, written in canonical lowercase form
  existing_feature_columns: []            # List of columns containing pre-calculated features (e.g., DFT)
  combined_features_column_name: "FP_combined"

meta_columns:                                     # Internal column names used by the framework for tracking data roles
  meta_true_label_bin: "true_bin"                 # Ground truth binary label
  meta_data_point_role: "true_role"               # Role in Outer Split (train/val/test)
  meta_mod_label_bin: "spy_inf_bin"               # Modified label after Spy infiltration
  meta_mod_data_point_role: "spy_inf_role"        # Role in Inner/Spy Split (spy/unlabeled)
  meta_mod_probability_1: "spy_inf_prob_1"        # Probability of belonging to positive class after Spy infiltration
  meta_mod_prediction_class: "spy_inf_pred_label" # Predicted class after Spy infiltration
  meta_augmented_bin: "augm_bin"                  # Binary label after Augmentation
  meta_augmented_role: "augm_role"                # Role in Regression (known_positive/aug_neg)
  meta_augmented_target: "augm_yield"             # Target variable for regression

# Outer data split settings
splitting:
  cross_validation_folds: 5
  # Ignored for scaffold splits, approximated for Butina
  test_size: 0.1
  # Share of training data to use as validation set
  # (given as a fraction; presumably 0.1 = 10% - confirm)
  validation_size: 0.1

# Spy split settings
spy_splitting:
  # Share of positive training data to use as "Spies"
  # (given as a fraction; presumably 0.2 = 20% - confirm)
  spy_rate: 0.2
  # Tolerance for spies being detected as positives in the Spy Model predictions
  spy_tolerance: 0.05
  # Target ratio of Positives / Unlabeled
  ratio_positives_to_unlabeled: 1.0

# Spy Model settings
spy_model:
  # Metric to optimise in Spy Model
  eval_metric: "MCC"
  all_metrics:
    - "Accuracy"
    - "BalancedAccuracy"
    - "AUC"
    - "Logloss"
    - "Recall"
    - "Precision"
    - "F1"
    - "TotalF1"
    - "MCC"
  # Same string as meta_columns.meta_mod_label_bin
  training_target_column_name: "spy_inf_bin"
  # Same string as meta_columns.meta_true_label_bin
  validation_target_column_name: "true_bin"
  # Metric to boost in augmented negatives (None, Recall, Precision)
  metric_manipulation: "Recall"
  # Target value for the manipulated metric
  target_value: 0.5

# Regression Model settings
reg_model:
  # Metric to optimise in Regression Model
  eval_metric: "MAE"
  all_metrics:
    - "MAE"
    - "R2"
    - "RMSE"
  # Same string as meta_columns.meta_augmented_target
  training_target_column_name: "augm_yield"
  # Same string as dataset.target_column
  validation_target_column_name: "Output"

# Optimisation settings
optimisation:
  type: "Bayesian"
  # Parameters included in the search
  search_space:
    - "depth"
    - "learning_rate"
  iterations: 50

# CatBoost-related limits.
# NOTE(review): the max_*/min_* key names suggest these are bounds (e.g. for the
# optimisation search space) rather than fixed values - confirm against the consumer.
catboost:
  max_depth: 12
  max_iterations: 10000
  min_learning_rate: 0.00001
  max_bin: 254