forked from GloriusGroup/PAYN
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml
More file actions
70 lines (60 loc) · 3.54 KB
/
config.yaml
File metadata and controls
70 lines (60 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Run-wide settings: who ran the experiment, how it is identified, and how
# it is executed.
general:
  # User or team name
  user: 'FBS&JSP'
  # Kept as a quoted string so the leading zeros are preserved
  experiment_id: '001'
  random_seed: 42
  # NOTE(review): the meaning of the verbosity value is defined by the
  # consumer and is not visible in this file
  verbose: 200
  # Options: "training" or "inference"
  usage_mode: 'training'
# Where the input data lives and which columns carry features and target.
dataset:
  file_path: './datasets/Dreher_and_Doyle_input_data.xlsx'
  sheet_name: 'FullCV_01'
  # Specific feature columns to use, if needed (empty list = auto-detect)
  input_columns:
    - 'Ligand'
    - 'Additive'
    - 'Base'
    - 'Aryl halide'
  # Values to treat as missing/null (example value: 'ValueNotFound')
  absence_flag: []
  # Name of the yield column (e.g. "Output", "Yield")
  target_column: 'Output'
  # Max yield value (100 for percentage, 1.0 for fraction)
  yield_limit: 100
  # Threshold used to binarize the yield (0.2 = 20%)
  yield_classification_threshold: 0.2
# How input columns are turned into model features.
featurisation:
  # Featurisation method; "ecfp" is the only value visible in this file
  method: "ecfp"
  ECFP_bit_length: 2048
  ECFP_radius: 2
  # Canonical lowercase boolean (was `True`), so YAML 1.1 and 1.2 loaders
  # agree on the type.
  # NOTE(review): what "condensing" does is defined by the consumer — confirm
  condense_bits: true
  # Columns that already contain pre-calculated features (e.g., DFT)
  existing_feature_columns: []
  combined_features_column_name: "FP_combined"
# Internal column names used by the framework for tracking data roles
meta_columns:
  # Ground truth binary label
  meta_true_label_bin: 'true_bin'
  # Role in the outer split (train/val/test)
  meta_data_point_role: 'true_role'
  # Modified label after Spy infiltration
  meta_mod_label_bin: 'spy_inf_bin'
  # Role in the inner/Spy split (spy/unlabeled)
  meta_mod_data_point_role: 'spy_inf_role'
  # Probability of belonging to the positive class after Spy infiltration
  meta_mod_probability_1: 'spy_inf_prob_1'
  # Predicted class after Spy infiltration
  meta_mod_prediction_class: 'spy_inf_pred_label'
  # Binary label after augmentation
  meta_augmented_bin: 'augm_bin'
  # Role in regression (known_positive or aug_neg)
  meta_augmented_role: 'augm_role'
  # Target variable for regression
  meta_augmented_target: 'augm_yield'
# Outer data split: cross-validation folds plus test/validation shares.
splitting:
  cross_validation_folds: 5
  # Ignored for scaffold splits, approximated for Butina
  test_size: 0.1
  # Share of the training data to use as validation set.
  # NOTE(review): originally described as a "percentage", but the value is
  # written as 0.1 — presumably a fraction (0.1 = 10%); confirm
  validation_size: 0.1
# Inner (Spy) split settings.
spy_splitting:
  # Share of the positive training data to use as "Spies".
  # NOTE(review): originally described as a "percentage", but the value is
  # written as 0.2 — presumably a fraction (0.2 = 20%); confirm
  spy_rate: 0.2
  # Tolerance for spies being detected as positives in the Spy Model
  # predictions
  spy_tolerance: 0.05
  # Target ratio of Positives / Unlabeled
  ratio_positives_to_unlabeled: 1.0
# Settings for the Spy (classification) model.
# These keys must be nested under `spy_model`: left at the top level they
# collide with the identically named regression-model keys, and most YAML
# loaders silently keep only the last duplicate.
spy_model:
  # Metric to optimise in the Spy Model
  eval_metric: 'MCC'
  all_metrics:
    - 'Accuracy'
    - 'BalancedAccuracy'
    - 'AUC'
    - 'Logloss'
    - 'Recall'
    - 'Precision'
    - 'F1'
    - 'TotalF1'
    - 'MCC'
  # Same value as `meta_mod_label_bin` in this file
  training_target_column_name: 'spy_inf_bin'
  # Same value as `meta_true_label_bin` in this file
  validation_target_column_name: 'true_bin'
  # Metric to boost in augmented negatives (None, Recall, Precision).
  # NOTE(review): whether "None" is expected as a string or as YAML null is
  # not visible here — confirm against the loader
  metric_manipulation: 'Recall'
  # Target value for the manipulated metric
  target_value: 0.5
# Settings for the regression model.
# These keys must be nested under `reg_model`: left at the top level they
# duplicate the identically named Spy-model keys, and most YAML loaders
# silently keep only the last duplicate.
reg_model:
  # Metric to optimise in the Regression Model
  eval_metric: 'MAE'
  all_metrics:
    - 'MAE'
    - 'R2'
    - 'RMSE'
  # Same value as `meta_augmented_target` in this file
  training_target_column_name: 'augm_yield'
  # Same value as `target_column` in this file
  # NOTE(review): presumably has to stay in sync with it — confirm
  validation_target_column_name: 'Output'
# Hyperparameter optimisation settings.
optimisation:
  type: 'Bayesian'
  # Names of the hyperparameters included in the search
  search_space:
    - 'depth'
    - 'learning_rate'
  # NOTE(review): presumably the number of optimisation rounds — confirm
  iterations: 50
# CatBoost-related limits.
# NOTE(review): the max_*/min_* names suggest bounds for the model or the
# hyperparameter search; how each value is consumed is not visible here.
catboost:
  max_depth: 12
  max_iterations: 10000
  # Written in plain decimal notation (not exponent form) on purpose
  min_learning_rate: 0.00001
  max_bin: 254