-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTREX_pairwise_predictor.pl
149 lines (148 loc) · 6 KB
/
TREX_pairwise_predictor.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/perl
#
# TREX pairwise ("double knockout") predictor evaluation.
#
# Usage:
#     perl TREX_pairwise_predictor.pl TRAIN.arff TEST.arff I,J
#
# I and J are the 1-based positions of the two predictors to remove, counted
# in the order of the @ATTRIBUTE lines of TRAIN.arff (the "class" attribute
# is not counted).
#
# Steps:
#   1. Copy TRAIN.arff and TEST.arff into temp_datasets/ without the two
#      predictors and train a Weka RandomForest on the reduced training set
#      (the model is saved under trained_models/).
#   2. Evaluate the original model (trained_models/<train>.model, which must
#      already exist) and the reduced model against the test set.
#   3. For every remaining predictor, shuffle that column of the reduced test
#      set and evaluate the reduced model again.
#
# Output: one tab-separated line in ./doubleknockout_deltaacc_<P1><P2>:
#   reduced accuracy, "P1,P2", original minus reduced accuracy, then one
#   field per test-set predictor (reduced accuracy minus the accuracy with
#   that predictor shuffled, or "NA" for the two removed predictors).
#
# Only data rows whose last field (the class label) is 1 or 2 are recognised;
# every other line is copied through unchanged.  The temp_datasets/ and
# trained_models/ directories must already exist in the current directory,
# and java must be able to find Weka on its class path.
use strict;
use warnings;
use Cwd qw( getcwd );
use List::Util qw( shuffle );

# Patterns shared by every pass over an ARFF file or a Weka report.
my $ATTRIBUTE_RE = qr/^\@ATT.*?\s(.*?)\s.*/;               # $1 = attribute name
my $DATA_ROW_RE  = qr/^(.+?),([12])$/;                     # $1 = features, $2 = label
my $ACCURACY_RE  = qr/^Correctly.*?\d+\s+(\d.*?)\s+%$/;    # $1 = percent correct
my @WEKA         = ( 'java', 'weka.classifiers.trees.RandomForest' );

# Rebuild a file stem from a file name: split on "_" and ".", drop the last
# piece (the extension) and join the rest with "_".  A name with a single
# piece is returned as is.
sub file_stem {
    my ($name) = @_;
    my @parts = split /[_\.]/, $name;
    return ''        unless @parts;
    return $parts[0] if @parts == 1;
    return join '_', @parts[ 0 .. $#parts - 1 ];
}

# Return the chomped lines of a text file; dies if it cannot be read.
sub read_lines {
    my ($file) = @_;
    open my $in, '<', $file or die "Cannot read '$file': $!\n";
    chomp( my @lines = <$in> );
    close $in;
    return @lines;
}

# Write the given lines (newline-terminated) to a file.  The handle is closed
# before returning so that Weka never reads a partially flushed file.
sub write_lines {
    my ( $file, @lines ) = @_;
    open my $out, '>', $file or die "Cannot write '$file': $!\n";
    print {$out} "$_\n" for @lines;
    close $out or die "Cannot finish writing '$file': $!\n";
    return;
}

# Classify ARFF lines.  Returns one hash per input line:
#   attribute: { kind, text, name }      row: { kind, values, label }
#   anything else: { kind => 'other', text }
sub parse_arff {
    my (@lines) = @_;
    my @records;
    for my $line (@lines) {
        if ( $line =~ $ATTRIBUTE_RE ) {
            my $name = $1;
            push @records, { kind => 'attribute', text => $line, name => $name };
        }
        elsif ( $line =~ $DATA_ROW_RE ) {
            # Copy the captures before split() gets a chance to touch them.
            my ( $features, $label ) = ( $1, $2 );
            push @records,
                { kind => 'row', values => [ split /,/, $features ], label => $label };
        }
        else {
            push @records, { kind => 'other', text => $line };
        }
    }
    return @records;
}

# Names of all attributes except "class", in file order.
sub predictor_names {
    my (@records) = @_;
    my @names;
    for my $record (@records) {
        next unless $record->{kind} eq 'attribute';
        push @names, $record->{name} unless $record->{name} eq 'class';
    }
    return @names;
}

# Transpose the data rows: element N of the result holds the values of
# feature column N, one per data row, in file order.
sub feature_columns {
    my (@records) = @_;
    my @columns;
    for my $record (@records) {
        next unless $record->{kind} eq 'row';
        my $values = $record->{values};
        push @{ $columns[$_] }, $values->[$_] for 0 .. $#{$values};
    }
    return @columns;
}

# Turn parsed records back into ARFF lines.
#   $drop_name    hashref: attribute names whose header line is omitted
#   $drop_column  hashref: 0-based feature columns omitted from data rows
#   $swap_column  optional 0-based column whose values are replaced ...
#   $swap_values  ... by this arrayref, indexed by 0-based data-row number
sub render_arff {
    my ( $records, $drop_name, $drop_column, $swap_column, $swap_values ) = @_;
    my @lines;
    my $row_index = 0;
    for my $record ( @{$records} ) {
        if ( $record->{kind} eq 'attribute' ) {
            push @lines, $record->{text} unless $drop_name->{ $record->{name} };
        }
        elsif ( $record->{kind} eq 'row' ) {
            my $values = $record->{values};
            my $line   = '';
            for my $column ( 0 .. $#{$values} ) {
                next if $drop_column->{$column};
                my $value =
                    ( defined $swap_column && $column == $swap_column )
                    ? $swap_values->[$row_index]
                    : $values->[$column];
                $line .= "$value,";
            }
            push @lines, $line . $record->{label};
            $row_index++;
        }
        else {
            push @lines, $record->{text};
        }
    }
    return @lines;
}

# Run Weka's RandomForest with the given arguments and return its standard
# output.  The list form of open() bypasses the shell, so paths containing
# spaces or shell metacharacters are passed through intact.
sub run_random_forest {
    my (@args) = @_;
    open my $weka, '-|', @WEKA, @args
        or die "Cannot start java: $!\n";
    my @report = <$weka>;
    close $weka
        or warn "RandomForest (@args) did not exit cleanly (status $?)\n";
    return @report;
}

# Evaluate a saved model against a test ARFF, save Weka's report in
# $report_file and return every "Correctly Classified" percentage found in
# the report, in order of appearance.
sub evaluate_model {
    my ( $model, $test_arff, $report_file ) = @_;
    my @report = run_random_forest( '-l', $model, '-T', $test_arff );

    open my $out, '>', $report_file or die "Cannot write '$report_file': $!\n";
    print {$out} @report;
    close $out or die "Cannot finish writing '$report_file': $!\n";

    my @accuracies;
    for my $line (@report) {
        chomp( my $text = $line );
        push @accuracies, $1 if $text =~ $ACCURACY_RE;
    }
    return @accuracies;
}

# ---------------------------------------------------------------------------
# Main program
# ---------------------------------------------------------------------------
my $usage = "Usage: $0 TRAIN.arff TEST.arff I,J\n"
    . "  I,J  1-based numbers of the two predictors to exclude\n";
die $usage unless @ARGV >= 3;
my ( $train_file, $test_file, $exclude_spec ) = @ARGV;

my $path       = getcwd();
my $train_stem = file_stem($train_file);
my $test_stem  = file_stem($test_file);

my @train_records    = parse_arff( read_lines($train_file) );
my @train_predictors = predictor_names(@train_records);

# Exactly two predictor numbers, both inside the training header.
my @exclude = split /,/, $exclude_spec;
my @invalid = grep { !/^\d+$/ } @exclude;
die $usage if @exclude != 2 || @invalid;
for my $position (@exclude) {
    die "Predictor number $position is out of range (1-"
        . scalar(@train_predictors) . ")\n"
        if $position < 1 || $position > @train_predictors;
}

my @drop_index  = map { $_ - 1 } @exclude;
my @drop_names  = @train_predictors[@drop_index];
my %drop_column = map { $_ => 1 } @drop_index;
my %drop_name   = map { $_ => 1 } @drop_names;
my $tag         = "minus$drop_names[0]$drop_names[1]";

my $temp_dir         = "$path/temp_datasets";
my $model_dir        = "$path/trained_models";
my $knock_train_arff = "$temp_dir/${train_stem}_${tag}.arff";
my $knock_test_arff  = "$temp_dir/${test_stem}_${tag}.arff";
my $knock_model      = "$model_dir/${train_stem}_${tag}.model";
my $summary_file     = "doubleknockout_deltaacc_$drop_names[0]$drop_names[1]";

open my $summary, '>', $summary_file
    or die "Cannot write '$summary_file': $!\n";

# 1. Reduced training set and double-knockout model.
write_lines( $knock_train_arff,
    render_arff( \@train_records, \%drop_name, \%drop_column ) );
run_random_forest( qw( -I 100 -K 0 -S 1 -num-slots 32 ),
    '-t', $knock_train_arff, '-d', $knock_model );

# 2. Reduced test set.  The columns are kept for the shuffling step below.
my @test_records    = parse_arff( read_lines($test_file) );
my @test_predictors = predictor_names(@test_records);
my @test_columns    = feature_columns(@test_records);
write_lines( $knock_test_arff,
    render_arff( \@test_records, \%drop_name, \%drop_column ) );

# Original model against the original test set (no randomisation).  The
# model must have been trained beforehand, e.g. with
#   java weka.classifiers.trees.RandomForest -I 100 -K 0 -S 1 -num-slots 32
#        -t TRAIN.arff -d trained_models/<train>.model
my @orig_accuracies = evaluate_model(
    "$model_dir/${train_stem}.model",
    "$path/${test_stem}.arff",
    "$temp_dir/orig_teststat",
);
my $orig_accuracy = 0;
if (@orig_accuracies) {
    $orig_accuracy = $orig_accuracies[-1];
}
else {
    warn "No accuracy found in $temp_dir/orig_teststat; using 0\n";
}

# Double-knockout model against the reduced test set (no randomisation).
my $knock_accuracy   = 0;
my @knock_accuracies = evaluate_model( $knock_model, $knock_test_arff, "$temp_dir/$tag" );
warn "No accuracy found in $temp_dir/$tag\n" unless @knock_accuracies;
for my $accuracy (@knock_accuracies) {
    $knock_accuracy = sprintf '%.2f', $accuracy;
    my $delta_orig = sprintf '%.2f', $orig_accuracy - $accuracy;
    print {$summary} "$knock_accuracy\t$drop_names[0]", ',',
        "$drop_names[1]\t", $delta_orig, "\t";
}

# 3. Shuffle each remaining predictor in turn and measure the accuracy drop.
for my $column ( 0 .. $#test_columns ) {
    if ( $drop_column{$column} ) {
        print {$summary} "NA\t";
        next;
    }

    my $name = defined $test_predictors[$column] ? $test_predictors[$column] : '';
    my $values = $test_columns[$column];

    # A true permutation of the column: row N receives the value of a
    # randomly chosen row, and every value is used exactly once.
    my @shuffled = @{$values}[ shuffle 0 .. $#{$values} ];

    my $random_arff   = "$temp_dir/${test_stem}_${tag}_random${name}.arff";
    my $random_report = "$temp_dir/${tag}_random${name}";
    write_lines( $random_arff,
        render_arff( \@test_records, \%drop_name, \%drop_column, $column, \@shuffled ) );

    for my $accuracy ( evaluate_model( $knock_model, $random_arff, $random_report ) ) {
        my $delta = sprintf '%.2f', $knock_accuracy - $accuracy;
        print {$summary} $delta, "\t";
    }
}
print {$summary} "\n";
close $summary or die "Cannot finish writing '$summary_file': $!\n";