2
2
3
3
import adaa .analytics .rules .logic .representation .*;
4
4
import com .rapidminer .example .Attribute ;
5
- import com .rapidminer .example .Example ;
6
5
import com .rapidminer .example .ExampleSet ;
7
- import org .apache .lucene .search .FieldComparator ;
8
- import org .jetbrains .annotations .NotNull ;
9
6
10
- import java .io .IOException ;
11
7
import java .util .*;
12
8
import java .util .concurrent .ExecutionException ;
13
9
import java .util .concurrent .Future ;
@@ -21,7 +17,8 @@ public class ApproximateClassificationFinder extends ClassificationFinder {
21
17
static class ConditionCandidate extends ElementaryCondition {
22
18
23
19
public double quality = -Double .MAX_VALUE ;
24
- public double covered = 0 ;
20
+ public double p = 0 ;
21
+ public double n = 0 ;
25
22
public boolean opposite = false ;
26
23
public int blockId = -1 ;
27
24
@@ -98,8 +95,22 @@ public ExampleSet preprocess(ExampleSet dataset) {
98
95
determineBins (dataset , attr , descriptions [ia ], mappings [ia ], bins_begins [ia ], ruleRanges [ia ]);
99
96
100
97
arrayCopies .put ("ruleRanges" , (Object )Arrays .stream (ruleRanges ).map (int []::clone ).toArray (int [][]::new ));
98
+
99
+ if (attr .isNominal ()) {
100
+ // get orders
101
+ Integer [] valuesOrder = new Integer [attr .getMapping ().size ()];
102
+ List <String > labels = new ArrayList <>();
103
+ labels .addAll (attr .getMapping ().getValues ());
104
+ Collections .sort (labels );
105
+ for (int j = 0 ; j < labels .size (); ++j ) {
106
+ valuesOrder [j ] = attr .getMapping ().getIndex (labels .get (j ));
107
+ }
108
+ attributeValuesOrder .put (attr , valuesOrder );
109
+ }
101
110
}
102
111
112
+
113
+
103
114
return dataset ;
104
115
}
105
116
@@ -274,11 +285,6 @@ protected ElementaryCondition induceCondition(
274
285
Set <Attribute > allowedAttributes ,
275
286
Object ... extraParams ) {
276
287
277
-
278
- if (rule .getPremise ().getSubconditions ().size () == 41 ) {
279
- //return null;
280
- }
281
-
282
288
if (allowedAttributes .size () == 0 ) {
283
289
return null ;
284
290
}
@@ -342,26 +348,26 @@ class Stats {
342
348
}
343
349
}
344
350
345
- int first_bid = ruleRanges [attribute_id ][0 ];
346
- int last_bid = ruleRanges [attribute_id ][1 ];
347
-
348
- // omit empty bins from the beginning and from the end
349
- while (first_bid < last_bid && (cur_positives [first_bid ] + cur_negatives [first_bid ] == 0 )) {
350
- ++first_bid ;
351
- }
352
-
353
- while (first_bid < last_bid && (cur_positives [last_bid - 1 ] + cur_negatives [last_bid - 1 ] == 0 )) {
354
- --last_bid ;
355
- }
356
-
357
351
Stats [] stats = new Stats [2 ];
358
- stats [0 ] = new Stats (cur_positives [first_bid ], cur_negatives [first_bid ], cur_newPositives [first_bid ]);
359
- stats [1 ] = new Stats (finalCovered_p - stats [0 ].p , finalCovered_n - stats [0 ].n , finalCovered_new_p - stats [0 ].p_new );
360
352
361
353
// numerical attribute
362
354
if (attr .isNumerical ()) {
363
- // iterate over blocks
355
+ int first_bid = ruleRanges [attribute_id ][0 ];
356
+ int last_bid = ruleRanges [attribute_id ][1 ];
357
+
358
+ // omit empty bins from the beginning and from the end
359
+ while (first_bid < last_bid && (cur_positives [first_bid ] + cur_negatives [first_bid ] == 0 )) {
360
+ ++first_bid ;
361
+ }
364
362
363
+ while (first_bid < last_bid && (cur_positives [last_bid - 1 ] + cur_negatives [last_bid - 1 ] == 0 )) {
364
+ --last_bid ;
365
+ }
366
+
367
+ stats [0 ] = new Stats (cur_positives [first_bid ], cur_negatives [first_bid ], cur_newPositives [first_bid ]);
368
+ stats [1 ] = new Stats (finalCovered_p - stats [0 ].p , finalCovered_n - stats [0 ].n , finalCovered_new_p - stats [0 ].p_new );
369
+
370
+ // iterate over blocks
365
371
for (int bid = first_bid + 1 ; bid < last_bid ; ++bid ) {
366
372
// omit conditions:
367
373
// - preceding empty bins - they may appear as coverage drops
@@ -380,8 +386,8 @@ class Stats {
380
386
if (prec > apriori_prec && stats [c ].p_new > 0 ) {
381
387
double quality = params .getInductionMeasure ().calculate (stats [c ].p , stats [c ].n , P , N );
382
388
383
- // better then current best
384
- if (quality > best .quality || (quality == best .quality && stats [c ].p > best .covered )) {
389
+ // better than current best
390
+ if (quality > best .quality || (quality == best .quality && stats [c ].p > best .p )) {
385
391
386
392
int left_id = (int ) (cur_descriptions [cur_begins [bid ] - 1 ] & MASK_IDENTIFIER );
387
393
int right_id = (int ) (cur_descriptions [cur_begins [bid ]] & MASK_IDENTIFIER );
@@ -397,7 +403,8 @@ class Stats {
397
403
//Logger.log("\tCurrent best: " + candidate + " (p=" + stats[c].p + ", n=" + stats[c].n + ", new_p=" + (double) stats[c].p_new + ", quality=" + quality + ")\n", Level.FINEST);
398
404
best = candidate ;
399
405
best .quality = quality ;
400
- best .covered = stats [c ].p ;
406
+ best .p = stats [c ].p ;
407
+ best .n = stats [c ].n ;
401
408
best .opposite = (c == 1 );
402
409
best .blockId = bid ;
403
410
}
@@ -417,17 +424,36 @@ class Stats {
417
424
}
418
425
} else { // nominal attribute
419
426
420
- for (int bid = 1 ; bid < cur_positives .length ; ++bid ) {
427
+ // they will be reassigned anyway
428
+ stats [0 ] = new Stats (0 , 0 , 0 );
429
+ stats [1 ] = new Stats (finalCovered_p - stats [0 ].p , finalCovered_n - stats [0 ].n , finalCovered_new_p - stats [0 ].p_new );
430
+
431
+ for (int j = 0 ; j < attr .getMapping ().size (); ++j ) {
432
+ int bid = attributeValuesOrder .get (attr )[j ];
433
+
434
+ // update stats
435
+ stats [0 ].p = cur_positives [bid ];
436
+ stats [0 ].n = cur_negatives [bid ];
437
+ stats [0 ].p_new = cur_newPositives [bid ];
438
+
439
+ stats [1 ].p = finalCovered_p - stats [0 ].p ;
440
+ stats [1 ].n = finalCovered_n - stats [0 ].n ;
441
+ stats [1 ].p_new = finalCovered_new_p - stats [0 ].p_new ;
442
+
421
443
// evaluate both conditions
422
444
for (int c = 0 ; c < 2 ; ++c ) {
423
445
double prec = stats [c ].p / (stats [c ].p + stats [c ].n );
424
446
425
447
if (prec > apriori_prec && stats [c ].p_new > 0 ) {
426
448
double quality = params .getInductionMeasure ().calculate (stats [c ].p , stats [c ].n , P , N );
427
449
428
- // better then current best
429
- if (quality > best .quality || (quality == best .quality && stats [c ].p > best .covered )) {
430
- IValueSet interval = (c == 0 )
450
+ boolean opposite = (c == 1 );
451
+
452
+ // better than current best
453
+ if (quality > best .quality || (quality == best .quality && (stats [c ].p > best .p ||
454
+ (stats [c ].p == best .p && best .opposite && !opposite )))) {
455
+
456
+ IValueSet interval = !opposite
431
457
? new SingletonSet ((double ) bid , attr .getMapping ().getValues ())
432
458
: new SingletonSetComplement ((double ) bid , attr .getMapping ().getValues ());
433
459
@@ -436,23 +462,14 @@ class Stats {
436
462
//Logger.log("\tCurrent best: " + candidate + " (p=" + stats[c].p + ", n=" + stats[c].n + ", new_p=" + (double) stats[c].p_new + ", quality=" + quality + ")\n", Level.FINEST);
437
463
best = candidate ;
438
464
best .quality = quality ;
439
- best .covered = stats [c ].p ;
440
- best .opposite = (c == 1 );
465
+ best .p = stats [c ].p ;
466
+ best .n = stats [c ].n ;
467
+ best .opposite = opposite ;
441
468
best .blockId = bid ;
442
469
}
443
470
}
444
471
}
445
472
}
446
-
447
- // update stats
448
- stats [0 ].p = cur_positives [bid ];
449
- stats [0 ].n = cur_negatives [bid ];
450
- stats [0 ].p_new = cur_newPositives [bid ];
451
-
452
- stats [1 ].p = finalCovered_p - stats [0 ].p ;
453
- stats [1 ].n = finalCovered_n - stats [0 ].n ;
454
- stats [1 ].p_new = finalCovered_new_p - stats [0 ].p_new ;
455
-
456
473
}
457
474
}
458
475
@@ -469,15 +486,16 @@ class Stats {
469
486
ConditionCandidate current = (ConditionCandidate )f .get ();
470
487
471
488
if (current != null && current .getAttribute () != null ) {
472
- Logger .log ("\t Attribute best: " + current + ", quality=" + current .quality , Level .FINEST );
489
+ Logger .log ("\t Attribute best: " + current + ", quality=" +
490
+ current .quality + ", p=" + current .p + ", n=" + current .n , Level .FINEST );
473
491
Attribute attr = dataset .getAttributes ().get (current .getAttribute ());
474
492
if (attr .isNumerical ()) {
475
493
updateMidpoint (dataset , current );
476
494
}
477
495
Logger .log (", adjusted: " + current + "\n " , Level .FINEST );
478
496
}
479
497
480
- if (best == null || current .quality > best .quality || (current .quality == best .quality && current .covered > best .covered )) {
498
+ if (best == null || current .quality > best .quality || (current .quality == best .quality && current .p > best .p )) {
481
499
best = current ;
482
500
}
483
501
}
0 commit comments