from vlmeval.smp import *


# Define valid modes
- MODES = ('dlist', 'mlist', 'missing', 'circular', 'localize', 'check', 'run', 'eval', 'merge_pkl')
+ MODES = ('dlist', 'mlist', 'missing', 'circular', 'localize', 'check', 'run', 'eval', 'merge_pkl', 'scan')


CLI_HELP_MSG = \
    f"""
        vlmutil eval [dataset_name] [prediction_file]
    9. Merge pkl files:
        vlmutil merge_pkl [pkl_dir] [world_size]
-
+     10. Scan evaluation results and detect API failures:
+         vlmutil scan --model [model_list.txt or model_names] --data [dataset_names] --root [root_dir]

    GitHub: https://github.com/open-compass/VLMEvalKit
    """  # noqa: E501
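# Illustrative usage of the new 'scan' mode (a sketch, not part of the patch); the
# file, model, and dataset names below are placeholders. An invocation such as
#
#     vlmutil scan --model model_list.txt GPT4V --data MMVet MathVista_MINI
#
# expands every model name (read from .txt lists and/or given on the command line)
# against every dataset and calls SCAN(root, model, dataset) for each pair, printing
# which inference records and judge evaluations failed.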
@@ -395,6 +396,15 @@ def parse_args_eval():
    return args


+ def parse_args_scan():
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--model', type=str, nargs='+')
+     parser.add_argument('--data', type=str, nargs='+')
+     parser.add_argument('--root', type=str, default=None)
+     args, unknownargs = parser.parse_known_args()
+     return args, unknownargs
+
+
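# Note on the parser above (a sketch, not part of the patch): the scan branch of
# cli() calls parse_args_scan() while sys.argv still contains the leading 'scan'
# token, so parse_known_args() is used to keep unrecognized arguments instead of
# raising. Roughly, for `vlmutil scan --model GPT4V --data MMVet`:
#
#     args, unknown = parse_args_scan()
#     args.model  -> ['GPT4V']
#     args.data   -> ['MMVet']
#     args.root   -> None
#     unknown     -> ['scan']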
def MERGE_PKL(pkl_dir, world_size=1):
    prefs = []
    for ws in list(range(1, 9)):
@@ -416,6 +426,53 @@ def MERGE_PKL(pkl_dir, world_size=1):
        dump(res_all[k], f'{pkl_dir}/{pf}{k}')
        print(f'Merged {len(res_all[k])} records into {pkl_dir}/{dump_prefs[0]}{k}')

+
+ def SCAN(root, model, dataset):
+     from termcolor import colored
+     FAIL_MSG = 'Failed to obtain answer via API.'
+     root = osp.join(root, model)
+     fname = f'{model}_{dataset}.xlsx'
+     pth = osp.join(root, fname)
+     if osp.exists(pth):
+         data = load(pth)
+         # Detect failed API calls in the raw predictions
+         assert 'prediction' in data
+         data['prediction'] = [str(x) for x in data['prediction']]
+         fail = [FAIL_MSG in x for x in data['prediction']]
+         if sum(fail):
+             nfail = sum(fail)
+             ntot = len(fail)
+             print(colored(f'Model {model} x Dataset {dataset} Inference: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ', 'light_red'))
+
+         # Collect judge (OpenAI / GPT) evaluation outputs for this model-dataset pair
+         eval_files = ls(root, match=f'{model}_{dataset}_')
+         eval_files = [x for x in eval_files if listinstr([f'{dataset}_openai', f'{dataset}_gpt'], x) and x.endswith('.xlsx')]
+
+         if len(eval_files) == 0:
+             return
+
+         for eval_file in eval_files:
+             data = load(eval_file)
+             # Different datasets record judge failures in different columns
+             if 'MMVet' in dataset:
+                 bad = [x for x in data['log'] if 'All 5 retries failed.' in str(x)]
+                 if len(bad):
+                     print(f'Evaluation ({eval_file}): {len(bad)} out of {len(data)} failed.')
+             elif 'MathVista' in dataset:
+                 bad = [x for x in data['res'] if FAIL_MSG in str(x)]
+                 if len(bad):
+                     print(f'Evaluation ({eval_file}): {len(bad)} out of {len(data)} failed.')
+             elif dataset == 'LLaVABench':
+                 sub = data[data['gpt4_score'] == -1]
+                 sub = sub[sub['gpt4_score'] == -1]
+                 if len(sub):
+                     print(f'Evaluation ({eval_file}): {len(sub)} out of {len(data)} failed.')
+             else:
+                 bad = [x for x in data['log'] if FAIL_MSG in str(x)]
+                 if len(bad):
+                     print(f'Evaluation ({eval_file}): {len(bad)} out of {len(data)} failed.')
+     else:
+         print(colored(f'Model {model} x Dataset {dataset} Inference Result Missing! ', 'red'))
+
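# Illustrative direct call to SCAN (a sketch, not part of the patch); the root path
# and model name are placeholders. SCAN expects predictions at
# <root>/<model>/<model>_<dataset>.xlsx and also checks any judge output in the same
# folder whose name contains '<dataset>_openai' or '<dataset>_gpt' and ends in .xlsx:
#
#     SCAN(osp.join(osp.expanduser('~'), 'mmeval'), 'llava_v1.5_7b', 'MMVet')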

def cli():
    logger = get_logger('VLMEvalKit Tools')
@@ -491,6 +548,25 @@ def extract_dataset(file_name):
        args[2] = int(args[2])
        assert args[2] in [1, 2, 4, 8]
        MERGE_PKL(args[1], args[2])
+     elif args[0].lower() == 'scan':
+         args, unknownargs = parse_args_scan()
+         # The default root is only for maintainer usage
+         root = args.root if args.root is not None else osp.join(osp.expanduser('~'), 'mmeval')
+         models = []
+         for m in args.model:
+             if osp.exists(m) and m.endswith('.txt'):
+                 lines = mrlines(m)
+                 models.extend([x.split()[0] for x in lines if len(x.split()) >= 1])
+             else:
+                 models.append(m)
+         datasets = args.data
+         assert len(datasets)
+         for m in models:
+             if not osp.exists(osp.join(root, m)):
+                 warnings.warn(f'Model {m} not found in {root}')
+                 continue
+             for d in datasets:
+                 SCAN(root, m, d)
    else:
        logger.error('WARNING: command error!')
        logger.info(CLI_HELP_MSG)
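# Illustrative model-list format (a sketch, not part of the patch); file and model
# names are placeholders. The scan branch keeps only the first whitespace-separated
# token of each line, so trailing notes are ignored:
#
#     $ cat model_list.txt
#     llava_v1.5_7b        finished
#     GPT4V
#     qwen_chat            rerun MMVet
#
#     $ vlmutil scan --model model_list.txt --data MMVet MathVista_MINI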