From 94fec43fa5501460d00ec037c42b815105decead Mon Sep 17 00:00:00 2001 From: Gu Wang Date: Wed, 24 Nov 2021 22:53:04 +0800 Subject: [PATCH 1/3] use lite for ddp --- configs/gdrn/lm/a6_cPnP_lm13_lite.py | 70 +++ configs/gdrn/lm/a6_cPnP_lm13_lite_2gpus.py | 70 +++ core/gdrn_modeling/datasets/lm_blender.py | 4 +- core/gdrn_modeling/datasets/lm_dataset_d2.py | 4 +- core/gdrn_modeling/datasets/lm_pbr.py | 4 +- core/gdrn_modeling/datasets/lm_syn_imgn.py | 4 +- core/gdrn_modeling/datasets/ycbv_bop_test.py | 4 +- core/gdrn_modeling/datasets/ycbv_d2.py | 4 +- core/gdrn_modeling/datasets/ycbv_pbr.py | 4 +- core/gdrn_modeling/engine.py | 515 +++++++++---------- core/gdrn_modeling/main_gdrn.py | 112 ++-- core/gdrn_modeling/test_utils.py | 5 +- core/utils/default_args_setup.py | 12 +- core/utils/my_comm.py | 169 ------ lib/utils/mask_utils.py | 6 - lib/utils/setup_logger.py | 333 ++++++------ scripts/format_code.sh | 2 +- scripts/install_deps.sh | 5 +- 18 files changed, 625 insertions(+), 702 deletions(-) create mode 100644 configs/gdrn/lm/a6_cPnP_lm13_lite.py create mode 100644 configs/gdrn/lm/a6_cPnP_lm13_lite_2gpus.py diff --git a/configs/gdrn/lm/a6_cPnP_lm13_lite.py b/configs/gdrn/lm/a6_cPnP_lm13_lite.py new file mode 100644 index 00000000..8e1bbf04 --- /dev/null +++ b/configs/gdrn/lm/a6_cPnP_lm13_lite.py @@ -0,0 +1,70 @@ +_base_ = ["../../_base_/gdrn_base.py"] + +OUTPUT_DIR = "output/gdrn/lm/a6_cPnP_lm13_lite" +INPUT = dict( + DZI_PAD_SCALE=1.5, + COLOR_AUG_PROB=0.0, + COLOR_AUG_TYPE="code", + COLOR_AUG_CODE=( + "Sequential([" + "Sometimes(0.4, CoarseDropout( p=0.1, size_percent=0.05) )," + # "Sometimes(0.5, Affine(scale=(1.0, 1.2)))," + "Sometimes(0.5, GaussianBlur(np.random.rand()))," + "Sometimes(0.5, Add((-20, 20), per_channel=0.3))," + "Sometimes(0.4, Invert(0.20, per_channel=True))," + "Sometimes(0.5, Multiply((0.7, 1.4), per_channel=0.8))," + "Sometimes(0.5, Multiply((0.7, 1.4)))," + "Sometimes(0.5, ContrastNormalization((0.5, 2.0), per_channel=0.3))" + "], 
random_order=False)" + ), +) + +SOLVER = dict( + IMS_PER_BATCH=24, + LR_SCHEDULER_NAME="flat_and_anneal", + ANNEAL_METHOD="cosine", # "cosine" + ANNEAL_POINT=0.72, + # REL_STEPS=(0.3125, 0.625, 0.9375), + OPTIMIZER_CFG=dict(_delete_=True, type="Ranger", lr=1e-4, weight_decay=0), + WEIGHT_DECAY=0.0, + WARMUP_FACTOR=0.001, + WARMUP_ITERS=1000, +) + +DATASETS = dict( + TRAIN=("lm_13_train", "lm_imgn_13_train_1k_per_obj"), + TEST=("lm_13_test",), + DET_FILES_TEST=("datasets/BOP_DATASETS/lm/test/test_bboxes/bbox_faster_all.json",), +) + +MODEL = dict( + LOAD_DETS_TEST=True, + PIXEL_MEAN=[0.0, 0.0, 0.0], + PIXEL_STD=[255.0, 255.0, 255.0], + CDPN=dict( + ROT_HEAD=dict( + FREEZE=False, + ROT_CLASS_AWARE=False, + MASK_CLASS_AWARE=False, + XYZ_LW=1.0, + REGION_CLASS_AWARE=False, + NUM_REGIONS=64, + ), + PNP_NET=dict( + R_ONLY=False, + REGION_ATTENTION=True, + WITH_2D_COORD=True, + ROT_TYPE="allo_rot6d", + TRANS_TYPE="centroid_z", + PM_NORM_BY_EXTENT=True, + PM_R_ONLY=True, + CENTROID_LOSS_TYPE="L1", + CENTROID_LW=1.0, + Z_LOSS_TYPE="L1", + Z_LW=1.0, + ), + TRANS_HEAD=dict(FREEZE=True), + ), +) + +TEST = dict(EVAL_PERIOD=0, VIS=False, TEST_BBOX_TYPE="est") # gt | est diff --git a/configs/gdrn/lm/a6_cPnP_lm13_lite_2gpus.py b/configs/gdrn/lm/a6_cPnP_lm13_lite_2gpus.py new file mode 100644 index 00000000..44784ceb --- /dev/null +++ b/configs/gdrn/lm/a6_cPnP_lm13_lite_2gpus.py @@ -0,0 +1,70 @@ +_base_ = ["../../_base_/gdrn_base.py"] + +OUTPUT_DIR = "output/gdrn/lm/a6_cPnP_lm13_lite_2gpus" +INPUT = dict( + DZI_PAD_SCALE=1.5, + COLOR_AUG_PROB=0.0, + COLOR_AUG_TYPE="code", + COLOR_AUG_CODE=( + "Sequential([" + "Sometimes(0.4, CoarseDropout( p=0.1, size_percent=0.05) )," + # "Sometimes(0.5, Affine(scale=(1.0, 1.2)))," + "Sometimes(0.5, GaussianBlur(np.random.rand()))," + "Sometimes(0.5, Add((-20, 20), per_channel=0.3))," + "Sometimes(0.4, Invert(0.20, per_channel=True))," + "Sometimes(0.5, Multiply((0.7, 1.4), per_channel=0.8))," + "Sometimes(0.5, Multiply((0.7, 1.4)))," + 
"Sometimes(0.5, ContrastNormalization((0.5, 2.0), per_channel=0.3))" + "], random_order=False)" + ), +) + +SOLVER = dict( + IMS_PER_BATCH=48, + LR_SCHEDULER_NAME="flat_and_anneal", + ANNEAL_METHOD="cosine", # "cosine" + ANNEAL_POINT=0.72, + # REL_STEPS=(0.3125, 0.625, 0.9375), + OPTIMIZER_CFG=dict(_delete_=True, type="Ranger", lr=1e-4, weight_decay=0), + WEIGHT_DECAY=0.0, + WARMUP_FACTOR=0.001, + WARMUP_ITERS=1000, +) + +DATASETS = dict( + TRAIN=("lm_13_train", "lm_imgn_13_train_1k_per_obj"), + TEST=("lm_13_test",), + DET_FILES_TEST=("datasets/BOP_DATASETS/lm/test/test_bboxes/bbox_faster_all.json",), +) + +MODEL = dict( + LOAD_DETS_TEST=True, + PIXEL_MEAN=[0.0, 0.0, 0.0], + PIXEL_STD=[255.0, 255.0, 255.0], + CDPN=dict( + ROT_HEAD=dict( + FREEZE=False, + ROT_CLASS_AWARE=False, + MASK_CLASS_AWARE=False, + XYZ_LW=1.0, + REGION_CLASS_AWARE=False, + NUM_REGIONS=64, + ), + PNP_NET=dict( + R_ONLY=False, + REGION_ATTENTION=True, + WITH_2D_COORD=True, + ROT_TYPE="allo_rot6d", + TRANS_TYPE="centroid_z", + PM_NORM_BY_EXTENT=True, + PM_R_ONLY=True, + CENTROID_LOSS_TYPE="L1", + CENTROID_LW=1.0, + Z_LOSS_TYPE="L1", + Z_LW=1.0, + ), + TRANS_HEAD=dict(FREEZE=True), + ), +) + +TEST = dict(EVAL_PERIOD=0, VIS=False, TEST_BBOX_TYPE="est") # gt | est diff --git a/core/gdrn_modeling/datasets/lm_blender.py b/core/gdrn_modeling/datasets/lm_blender.py index 3a3310d3..2f36f458 100644 --- a/core/gdrn_modeling/datasets/lm_blender.py +++ b/core/gdrn_modeling/datasets/lm_blender.py @@ -480,7 +480,7 @@ def test_vis(): python -m core.datasets.lm_blender dataset_name """ from lib.vis_utils.image import grid_show - from lib.utils.setup_logger import setup_my_logger + from lib.utils.setup_logger import setup_logger import detectron2.data.datasets # noqa # add pre-defined metadata from lib.vis_utils.image import vis_image_mask_bbox_cv2 @@ -488,7 +488,7 @@ def test_vis(): from core.utils.data_utils import read_image_cv2 print("sys.argv:", sys.argv) - logger = setup_my_logger(name="core") + 
setup_logger() register_with_name_cfg(sys.argv[1]) print("dataset catalog: ", DatasetCatalog.list()) test_vis() diff --git a/core/gdrn_modeling/datasets/lm_dataset_d2.py b/core/gdrn_modeling/datasets/lm_dataset_d2.py index ee27d7f7..51f43d7f 100644 --- a/core/gdrn_modeling/datasets/lm_dataset_d2.py +++ b/core/gdrn_modeling/datasets/lm_dataset_d2.py @@ -713,7 +713,7 @@ def test_vis(): python this_file.py dataset_name """ from lib.vis_utils.image import grid_show - from lib.utils.setup_logger import setup_my_logger + from lib.utils.setup_logger import setup_logger import detectron2.data.datasets # noqa # add pre-defined metadata from lib.vis_utils.image import vis_image_mask_bbox_cv2 @@ -721,7 +721,7 @@ def test_vis(): from core.utils.data_utils import read_image_cv2 print("sys.argv:", sys.argv) - logger = setup_my_logger(name="core") + setup_logger() register_with_name_cfg(sys.argv[1]) print("dataset catalog: ", DatasetCatalog.list()) diff --git a/core/gdrn_modeling/datasets/lm_pbr.py b/core/gdrn_modeling/datasets/lm_pbr.py index cdd4880a..cf86e372 100644 --- a/core/gdrn_modeling/datasets/lm_pbr.py +++ b/core/gdrn_modeling/datasets/lm_pbr.py @@ -513,7 +513,7 @@ def test_vis(): python -m core.datasets.lm_pbr dataset_name """ from lib.vis_utils.image import grid_show - from lib.utils.setup_logger import setup_my_logger + from lib.utils.setup_logger import setup_logger import detectron2.data.datasets # noqa # add pre-defined metadata from lib.vis_utils.image import vis_image_mask_bbox_cv2 @@ -521,7 +521,7 @@ def test_vis(): from core.utils.data_utils import read_image_cv2 print("sys.argv:", sys.argv) - logger = setup_my_logger(name="core") + setup_logger() register_with_name_cfg(sys.argv[1]) print("dataset catalog: ", DatasetCatalog.list()) diff --git a/core/gdrn_modeling/datasets/lm_syn_imgn.py b/core/gdrn_modeling/datasets/lm_syn_imgn.py index 2c3bc58d..67873a7a 100644 --- a/core/gdrn_modeling/datasets/lm_syn_imgn.py +++ b/core/gdrn_modeling/datasets/lm_syn_imgn.py 
@@ -446,7 +446,7 @@ def test_vis(): python -m core.datasets.lm_syn_imgn dataset_name """ from lib.vis_utils.image import grid_show - from lib.utils.setup_logger import setup_my_logger + from lib.utils.setup_logger import setup_logger import detectron2.data.datasets # noqa # add pre-defined metadata from lib.vis_utils.image import vis_image_mask_bbox_cv2 @@ -454,7 +454,7 @@ def test_vis(): from core.utils.data_utils import read_image_cv2 print("sys.argv:", sys.argv) - logger = setup_my_logger(name="core") + setup_logger() register_with_name_cfg(sys.argv[1]) print("dataset catalog: ", DatasetCatalog.list()) test_vis() diff --git a/core/gdrn_modeling/datasets/ycbv_bop_test.py b/core/gdrn_modeling/datasets/ycbv_bop_test.py index 58634f50..63e3f9e8 100644 --- a/core/gdrn_modeling/datasets/ycbv_bop_test.py +++ b/core/gdrn_modeling/datasets/ycbv_bop_test.py @@ -414,14 +414,14 @@ def test_vis(): python -m core.datasets.ycbv_bop_test dataset_name """ from lib.vis_utils.image import grid_show - from lib.utils.setup_logger import setup_my_logger + from lib.utils.setup_logger import setup_logger import detectron2.data.datasets # noqa # add pre-defined metadata from core.utils.data_utils import read_image_cv2 from lib.vis_utils.image import vis_image_mask_bbox_cv2 print("sys.argv:", sys.argv) - logger = setup_my_logger(name="core") + setup_logger() register_with_name_cfg(sys.argv[1]) print("dataset catalog: ", DatasetCatalog.list()) diff --git a/core/gdrn_modeling/datasets/ycbv_d2.py b/core/gdrn_modeling/datasets/ycbv_d2.py index 19af3b5a..4f235e37 100755 --- a/core/gdrn_modeling/datasets/ycbv_d2.py +++ b/core/gdrn_modeling/datasets/ycbv_d2.py @@ -618,7 +618,7 @@ def test_vis(): "dataset_name" can be any pre-registered ones """ from lib.vis_utils.image import grid_show - from lib.utils.setup_logger import setup_my_logger + from lib.utils.setup_logger import setup_logger import detectron2.data.datasets # noqa # add pre-defined metadata from lib.vis_utils.image import 
vis_image_mask_bbox_cv2 @@ -626,7 +626,7 @@ def test_vis(): from core.utils.data_utils import read_image_cv2 print("sys.argv:", sys.argv) - logger = setup_my_logger(name="core") + setup_logger() register_with_name_cfg(sys.argv[1]) print("dataset catalog: ", DatasetCatalog.list()) diff --git a/core/gdrn_modeling/datasets/ycbv_pbr.py b/core/gdrn_modeling/datasets/ycbv_pbr.py index 03c764bf..b6399e52 100644 --- a/core/gdrn_modeling/datasets/ycbv_pbr.py +++ b/core/gdrn_modeling/datasets/ycbv_pbr.py @@ -448,7 +448,7 @@ def test_vis(): python -m core/datasets/ycbv_pbr.py ycbv_pbr_train """ from lib.vis_utils.image import grid_show - from lib.utils.setup_logger import setup_my_logger + from lib.utils.setup_logger import setup_logger import detectron2.data.datasets # noqa # add pre-defined metadata from lib.vis_utils.image import vis_image_mask_bbox_cv2 @@ -456,7 +456,7 @@ def test_vis(): from core.utils.data_utils import read_image_cv2 print("sys.argv:", sys.argv) - logger = setup_my_logger(name="core") + setup_logger() register_with_name_cfg(sys.argv[1]) print("dataset catalog: ", DatasetCatalog.list()) diff --git a/core/gdrn_modeling/engine.py b/core/gdrn_modeling/engine.py index 2f24797e..e18b15e0 100644 --- a/core/gdrn_modeling/engine.py +++ b/core/gdrn_modeling/engine.py @@ -2,7 +2,6 @@ import os import os.path as osp import torch -from torch.cuda.amp import autocast, GradScaler import mmcv import time import cv2 @@ -20,11 +19,11 @@ LVISEvaluator, PascalVOCDetectionEvaluator, SemSegEvaluator, - print_csv_format, ) from detectron2.data.common import AspectRatioGroupedDataset from detectron2.data import MetadataCatalog +from pytorch_lightning.lite import LightningLite # import LightningLite from lib.utils.utils import dprint, iprint, get_time_str @@ -40,224 +39,206 @@ from .gdrn_custom_evaluator import GDRN_EvaluatorCustom import ref -try: - import horovod.torch as hvd -except ImportError: - print("You requested to import horovod which is missing or not supported for 
your OS.") - logger = logging.getLogger(__name__) -def get_evaluator(cfg, dataset_name, output_folder=None): - """Create evaluator(s) for a given dataset. - - This uses the special metadata "evaluator_type" associated with each - builtin dataset. For your own dataset, you can simply create an - evaluator manually in your script and do not have to worry about the - hacky if-else logic here. - """ - if output_folder is None: - output_folder = osp.join(cfg.OUTPUT_DIR, "inference") - evaluator_list = [] - evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type - if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: - evaluator_list.append( - SemSegEvaluator( - dataset_name, - distributed=True, - num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, - ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, - output_dir=output_folder, +class GDRN_Lite(LightningLite): + def get_evaluator(self, cfg, dataset_name, output_folder=None): + """Create evaluator(s) for a given dataset. + + This uses the special metadata "evaluator_type" associated with + each builtin dataset. For your own dataset, you can simply + create an evaluator manually in your script and do not have to + worry about the hacky if-else logic here. 
+ """ + if output_folder is None: + output_folder = osp.join(cfg.OUTPUT_DIR, "inference") + evaluator_list = [] + evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type + if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: + evaluator_list.append( + SemSegEvaluator( + dataset_name, + distributed=True, + num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, + ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + output_dir=output_folder, + ) ) - ) - if evaluator_type in ["coco", "coco_panoptic_seg"]: - evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder)) - if evaluator_type == "coco_panoptic_seg": - evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) - if evaluator_type == "cityscapes_instance": - assert ( - torch.cuda.device_count() >= comm.get_rank() - ), "CityscapesEvaluator currently do not work with multiple machines." - return CityscapesInstanceEvaluator(dataset_name) - if evaluator_type == "cityscapes_sem_seg": - assert ( - torch.cuda.device_count() >= comm.get_rank() - ), "CityscapesEvaluator currently do not work with multiple machines." 
- return CityscapesSemSegEvaluator(dataset_name) - if evaluator_type == "pascal_voc": - return PascalVOCDetectionEvaluator(dataset_name) - if evaluator_type == "lvis": - return LVISEvaluator(dataset_name, cfg, True, output_folder) - - _distributed = comm.get_world_size() > 1 - dataset_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]) - train_obj_names = dataset_meta.objs - if evaluator_type == "bop": - if cfg.VAL.get("USE_BOP", False): - return GDRN_Evaluator( - cfg, dataset_name, distributed=_distributed, output_dir=output_folder, train_objs=train_obj_names + if evaluator_type in ["coco", "coco_panoptic_seg"]: + evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder)) + if evaluator_type == "coco_panoptic_seg": + evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) + if evaluator_type == "cityscapes_instance": + assert ( + torch.cuda.device_count() >= self.global_rank + ), "CityscapesEvaluator currently do not work with multiple machines." + return CityscapesInstanceEvaluator(dataset_name) + if evaluator_type == "cityscapes_sem_seg": + assert ( + torch.cuda.device_count() >= self.global_rank + ), "CityscapesEvaluator currently do not work with multiple machines." 
+ return CityscapesSemSegEvaluator(dataset_name) + if evaluator_type == "pascal_voc": + return PascalVOCDetectionEvaluator(dataset_name) + if evaluator_type == "lvis": + return LVISEvaluator(dataset_name, cfg, True, output_folder) + + _distributed = self.world_size > 1 + dataset_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]) + train_obj_names = dataset_meta.objs + if evaluator_type == "bop": + if cfg.VAL.get("USE_BOP", False): + return GDRN_Evaluator( + cfg, dataset_name, distributed=_distributed, output_dir=output_folder, train_objs=train_obj_names + ) + else: + return GDRN_EvaluatorCustom( + cfg, dataset_name, distributed=_distributed, output_dir=output_folder, train_objs=train_obj_names + ) + + if len(evaluator_list) == 0: + raise NotImplementedError( + "no Evaluator for the dataset {} with the type {}".format(dataset_name, evaluator_type) ) + if len(evaluator_list) == 1: + return evaluator_list[0] + return DatasetEvaluators(evaluator_list) + + def get_tbx_event_writer(self, out_dir, backup=False): + tb_logdir = osp.join(out_dir, "tb") + mmcv.mkdir_or_exist(tb_logdir) + if backup and self.is_global_zero: + old_tb_logdir = osp.join(out_dir, "tb_old") + mmcv.mkdir_or_exist(old_tb_logdir) + os.system("mv -v {} {}".format(osp.join(tb_logdir, "events.*"), old_tb_logdir)) + + tbx_event_writer = MyTensorboardXWriter(tb_logdir, backend="tensorboardX") + return tbx_event_writer + + def do_test(self, cfg, model, epoch=None, iteration=None): + results = OrderedDict() + model_name = osp.basename(cfg.MODEL.WEIGHTS).split(".")[0] + for dataset_name in cfg.DATASETS.TEST: + if epoch is not None and iteration is not None: + evaluator = self.get_evaluator( + cfg, + dataset_name, + osp.join(cfg.OUTPUT_DIR, f"inference_epoch_{epoch}_iter_{iteration}", dataset_name), + ) + else: + evaluator = self.get_evaluator( + cfg, dataset_name, osp.join(cfg.OUTPUT_DIR, f"inference_{model_name}", dataset_name) + ) + data_loader = build_gdrn_test_loader(cfg, dataset_name, 
train_objs=evaluator.train_objs) + data_loader = self.setup_dataloaders(data_loader, replace_sampler=False, move_to_device=False) + results_i = gdrn_inference_on_dataset(cfg, model, data_loader, evaluator, amp_test=cfg.TEST.AMP_TEST) + results[dataset_name] = results_i + + if len(results) == 1: + results = list(results.values())[0] + return results + + def do_train(self, cfg, args, model, optimizer, resume=False): + model.train() + + # some basic settings ========================= + dataset_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]) + data_ref = ref.__dict__[dataset_meta.ref_key] + obj_names = dataset_meta.objs + + # load data =================================== + train_dset_names = cfg.DATASETS.TRAIN + data_loader = build_gdrn_train_loader(cfg, train_dset_names) + data_loader_iter = iter(data_loader) + + # load 2nd train dataloader if needed + train_2_dset_names = cfg.DATASETS.get("TRAIN2", ()) + train_2_ratio = cfg.DATASETS.get("TRAIN2_RATIO", 0.0) + if train_2_ratio > 0.0 and len(train_2_dset_names) > 0: + data_loader_2 = build_gdrn_train_loader(cfg, train_2_dset_names) + data_loader_2_iter = iter(data_loader_2) else: - return GDRN_EvaluatorCustom( - cfg, dataset_name, distributed=_distributed, output_dir=output_folder, train_objs=train_obj_names - ) + data_loader_2 = None + data_loader_2_iter = None + + images_per_batch = cfg.SOLVER.IMS_PER_BATCH + if isinstance(data_loader, AspectRatioGroupedDataset): + dataset_len = len(data_loader.dataset.dataset) + if data_loader_2 is not None: + dataset_len += len(data_loader_2.dataset.dataset) + iters_per_epoch = dataset_len // images_per_batch + else: + dataset_len = len(data_loader.dataset) + if data_loader_2 is not None: + dataset_len += len(data_loader_2.dataset) + iters_per_epoch = dataset_len // images_per_batch + max_iter = cfg.SOLVER.TOTAL_EPOCHS * iters_per_epoch + dprint("images_per_batch: ", images_per_batch) + dprint("dataset length: ", dataset_len) + dprint("iters per epoch: ", iters_per_epoch) + 
dprint("total iters: ", max_iter) + + data_loader = self.setup_dataloaders(data_loader, replace_sampler=False, move_to_device=False) + if data_loader_2 is not None: + data_loader_2 = self.setup_dataloaders(data_loader_2, replace_sampler=False, move_to_device=False) + + scheduler = solver_utils.build_lr_scheduler(cfg, optimizer, total_iters=max_iter) - if len(evaluator_list) == 0: - raise NotImplementedError( - "no Evaluator for the dataset {} with the type {}".format(dataset_name, evaluator_type) + # resume or load model =================================== + extra_ckpt_dict = dict( + optimizer=optimizer, + scheduler=scheduler, ) - if len(evaluator_list) == 1: - return evaluator_list[0] - return DatasetEvaluators(evaluator_list) - - -def do_test(cfg, model, epoch=None, iteration=None): - results = OrderedDict() - model_name = osp.basename(cfg.MODEL.WEIGHTS).split(".")[0] - for dataset_name in cfg.DATASETS.TEST: - if epoch is not None and iteration is not None: - evaluator = get_evaluator( - cfg, dataset_name, osp.join(cfg.OUTPUT_DIR, f"inference_epoch_{epoch}_iter_{iteration}", dataset_name) - ) + if hasattr(self._precision_plugin, "scaler"): + extra_ckpt_dict["gradscaler"] = self._precision_plugin.scaler + checkpointer = MyCheckpointer( + model, + cfg.OUTPUT_DIR, + save_to_disk=self.is_global_zero, + **extra_ckpt_dict, + ) + start_iter = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 + + if cfg.SOLVER.CHECKPOINT_BY_EPOCH: + ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD * iters_per_epoch else: - evaluator = get_evaluator( - cfg, dataset_name, osp.join(cfg.OUTPUT_DIR, f"inference_{model_name}", dataset_name) - ) - data_loader = build_gdrn_test_loader(cfg, dataset_name, train_objs=evaluator.train_objs) - results_i = gdrn_inference_on_dataset(cfg, model, data_loader, evaluator, amp_test=cfg.TEST.AMP_TEST) - results[dataset_name] = results_i - # if comm.is_main_process(): - # logger.info("Evaluation results for {} in csv 
format:".format(dataset_name)) - # print_csv_format(results_i) - if len(results) == 1: - results = list(results.values())[0] - return results - - -def get_tbx_event_writer(out_dir, backup=False): - tb_logdir = osp.join(out_dir, "tb") - mmcv.mkdir_or_exist(tb_logdir) - if backup: - old_tb_logdir = osp.join(out_dir, "tb_old") - mmcv.mkdir_or_exist(old_tb_logdir) - os.system("mv -v {} {}".format(osp.join(tb_logdir, "events.*"), old_tb_logdir)) - - tbx_event_writer = MyTensorboardXWriter(tb_logdir, backend="tensorboardX") - return tbx_event_writer - - -def do_train(cfg, args, model, optimizer, resume=False): - model.train() - - # some basic settings ========================= - dataset_meta = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]) - data_ref = ref.__dict__[dataset_meta.ref_key] - obj_names = dataset_meta.objs - - # load data =================================== - train_dset_names = cfg.DATASETS.TRAIN - data_loader = build_gdrn_train_loader(cfg, train_dset_names) - data_loader_iter = iter(data_loader) - - # load 2nd train dataloader if needed - train_2_dset_names = cfg.DATASETS.get("TRAIN2", ()) - train_2_ratio = cfg.DATASETS.get("TRAIN2_RATIO", 0.0) - if train_2_ratio > 0.0 and len(train_2_dset_names) > 0: - data_loader_2 = build_gdrn_train_loader(cfg, train_2_dset_names) - data_loader_2_iter = iter(data_loader_2) - else: - data_loader_2 = None - data_loader_2_iter = None - - images_per_batch = cfg.SOLVER.IMS_PER_BATCH - if isinstance(data_loader, AspectRatioGroupedDataset): - dataset_len = len(data_loader.dataset.dataset) - if data_loader_2 is not None: - dataset_len += len(data_loader_2.dataset.dataset) - iters_per_epoch = dataset_len // images_per_batch - else: - dataset_len = len(data_loader.dataset) - if data_loader_2 is not None: - dataset_len += len(data_loader_2.dataset) - iters_per_epoch = dataset_len // images_per_batch - max_iter = cfg.SOLVER.TOTAL_EPOCHS * iters_per_epoch - dprint("images_per_batch: ", images_per_batch) - dprint("dataset length: ", 
dataset_len) - dprint("iters per epoch: ", iters_per_epoch) - dprint("total iters: ", max_iter) - scheduler = solver_utils.build_lr_scheduler(cfg, optimizer, total_iters=max_iter) - - AMP_ON = cfg.SOLVER.AMP.ENABLED - logger.info(f"AMP enabled: {AMP_ON}") - grad_scaler = GradScaler() - - # resume or load model =================================== - checkpointer = MyCheckpointer( - model, - cfg.OUTPUT_DIR, - optimizer=optimizer, - scheduler=scheduler, - gradscaler=grad_scaler, - save_to_disk=comm.is_main_process(), - ) - start_iter = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 - - if comm._USE_HVD: # hvd may be not available, so do not use the one in args - # not needed - # start_iter = hvd.broadcast(torch.tensor(start_iter), root_rank=0, name="start_iter").item() - - # Horovod: broadcast parameters & optimizer state. - hvd.broadcast_parameters(model.state_dict(), root_rank=0) - hvd.broadcast_optimizer_state(optimizer, root_rank=0) - # Horovod: (optional) compression algorithm. 
- compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - optimizer = hvd.DistributedOptimizer( - optimizer, - named_parameters=model.named_parameters(), - op=hvd.Adasum if args.use_adasum else hvd.Average, - compression=compression, - ) # device_dense='/cpu:0' - - if cfg.SOLVER.CHECKPOINT_BY_EPOCH: - ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD * iters_per_epoch - else: - ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD - periodic_checkpointer = PeriodicCheckpointer( - checkpointer, ckpt_period, max_iter=max_iter, max_to_keep=cfg.SOLVER.MAX_TO_KEEP - ) - - # build writers ============================================== - tbx_event_writer = get_tbx_event_writer(cfg.OUTPUT_DIR, backup=not cfg.get("RESUME", False)) - tbx_writer = tbx_event_writer._writer # NOTE: we want to write some non-scalar data - writers = ( - [MyCommonMetricPrinter(max_iter), MyJSONWriter(osp.join(cfg.OUTPUT_DIR, "metrics.json")), tbx_event_writer] - if comm.is_main_process() - else [] - ) - - # compared to "train_net.py", we do not support accurate timing and - # precise BN here, because they are not trivial to implement - logger.info("Starting training from iteration {}".format(start_iter)) - iter_time = None - with EventStorage(start_iter) as storage: - # for data, iteration in zip(data_loader, range(start_iter, max_iter)): - for iteration in range(start_iter, max_iter): - storage.iter = iteration - epoch = iteration // dataset_len + 1 - - if np.random.rand() < train_2_ratio: - data = next(data_loader_2_iter) - else: - data = next(data_loader_iter) + ckpt_period = cfg.SOLVER.CHECKPOINT_PERIOD + periodic_checkpointer = PeriodicCheckpointer( + checkpointer, ckpt_period, max_iter=max_iter, max_to_keep=cfg.SOLVER.MAX_TO_KEEP + ) - if iter_time is not None: - storage.put_scalar("time", time.perf_counter() - iter_time) - iter_time = time.perf_counter() + # build writers ============================================== + tbx_event_writer = 
self.get_tbx_event_writer(cfg.OUTPUT_DIR, backup=not cfg.get("RESUME", False)) + tbx_writer = tbx_event_writer._writer # NOTE: we want to write some non-scalar data + writers = ( + [MyCommonMetricPrinter(max_iter), MyJSONWriter(osp.join(cfg.OUTPUT_DIR, "metrics.json")), tbx_event_writer] + if self.is_global_zero + else [] + ) + + # compared to "train_net.py", we do not support accurate timing and + # precise BN here, because they are not trivial to implement + logger.info("Starting training from iteration {}".format(start_iter)) + iter_time = None + with EventStorage(start_iter) as storage: + for iteration in range(start_iter, max_iter): + storage.iter = iteration + epoch = iteration // dataset_len + 1 + + if np.random.rand() < train_2_ratio: + data = next(data_loader_2_iter) + else: + data = next(data_loader_iter) + + if iter_time is not None: + storage.put_scalar("time", time.perf_counter() - iter_time) + iter_time = time.perf_counter() + + # forward ============================================================ + batch = batch_data(cfg, data) - # forward ============================================================ - batch = batch_data(cfg, data) - with autocast(enabled=AMP_ON): out_dict, loss_dict = model( batch["roi_img"], gt_xyz=batch.get("roi_xyz", None), @@ -287,70 +268,58 @@ def do_train(cfg, args, model, optimizer, resume=False): losses = sum(loss_dict.values()) assert torch.isfinite(losses).all(), loss_dict - loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()} - losses_reduced = sum(loss for loss in loss_dict_reduced.values()) - if comm.is_main_process(): - storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) - - optimizer.zero_grad() - if AMP_ON: - grad_scaler.scale(losses).backward() - - # # Unscales the gradients of optimizer's assigned params in-place - # grad_scaler.unscale_(optimizer) - # # Since the gradients of optimizer's assigned params are unscaled, clips as usual: - # 
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) - if comm._USE_HVD: - optimizer.synchronize() - with optimizer.skip_synchronize(): - grad_scaler.step(optimizer) - grad_scaler.update() - else: - grad_scaler.step(optimizer) - grad_scaler.update() - else: - losses.backward() + loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()} + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + if self.is_global_zero: + storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) + + optimizer.zero_grad(set_to_none=True) + self.backward(losses) optimizer.step() - storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) - scheduler.step() - - if cfg.TEST.EVAL_PERIOD > 0 and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter - 1: - do_test(cfg, model, epoch=epoch, iteration=iteration) - # Compared to "train_net.py", the test results are not dumped to EventStorage - comm.synchronize() - - if iteration - start_iter > 5 and ( - (iteration + 1) % cfg.TRAIN.PRINT_FREQ == 0 or iteration == max_iter - 1 or iteration < 100 - ): - for writer in writers: - writer.write() - # visualize some images ======================================== - if cfg.TRAIN.VIS_IMG: - with torch.no_grad(): - vis_i = 0 - roi_img_vis = batch["roi_img"][vis_i].cpu().numpy() - roi_img_vis = denormalize_image(roi_img_vis, cfg).transpose(1, 2, 0).astype("uint8") - tbx_writer.add_image("input_image", roi_img_vis, iteration) - - out_coor_x = out_dict["coor_x"].detach() - out_coor_y = out_dict["coor_y"].detach() - out_coor_z = out_dict["coor_z"].detach() - out_xyz = get_out_coor(cfg, out_coor_x, out_coor_y, out_coor_z) - - out_xyz_vis = out_xyz[vis_i].cpu().numpy().transpose(1, 2, 0) - out_xyz_vis = get_emb_show(out_xyz_vis) - tbx_writer.add_image("out_xyz", out_xyz_vis, iteration) - - gt_xyz_vis = batch["roi_xyz"][vis_i].cpu().numpy().transpose(1, 2, 0) - gt_xyz_vis = get_emb_show(gt_xyz_vis) - 
tbx_writer.add_image("gt_xyz", gt_xyz_vis, iteration) - - out_mask = out_dict["mask"].detach() - out_mask = get_out_mask(cfg, out_mask) - out_mask_vis = out_mask[vis_i, 0].cpu().numpy() - tbx_writer.add_image("out_mask", out_mask_vis, iteration) - - gt_mask_vis = batch["roi_mask"][vis_i].detach().cpu().numpy() - tbx_writer.add_image("gt_mask", gt_mask_vis, iteration) - periodic_checkpointer.step(iteration, epoch=epoch) + storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) + scheduler.step() + + if ( + cfg.TEST.EVAL_PERIOD > 0 + and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0 + and iteration != max_iter - 1 + ): + self.do_test(cfg, model, epoch=epoch, iteration=iteration) + # Compared to "train_net.py", the test results are not dumped to EventStorage + self.barrier() + + if iteration - start_iter > 5 and ( + (iteration + 1) % cfg.TRAIN.PRINT_FREQ == 0 or iteration == max_iter - 1 or iteration < 100 + ): + for writer in writers: + writer.write() + # visualize some images ======================================== + if cfg.TRAIN.VIS_IMG: + with torch.no_grad(): + vis_i = 0 + roi_img_vis = batch["roi_img"][vis_i].cpu().numpy() + roi_img_vis = denormalize_image(roi_img_vis, cfg).transpose(1, 2, 0).astype("uint8") + tbx_writer.add_image("input_image", roi_img_vis, iteration) + + out_coor_x = out_dict["coor_x"].detach() + out_coor_y = out_dict["coor_y"].detach() + out_coor_z = out_dict["coor_z"].detach() + out_xyz = get_out_coor(cfg, out_coor_x, out_coor_y, out_coor_z) + + out_xyz_vis = out_xyz[vis_i].cpu().numpy().transpose(1, 2, 0) + out_xyz_vis = get_emb_show(out_xyz_vis) + tbx_writer.add_image("out_xyz", out_xyz_vis, iteration) + + gt_xyz_vis = batch["roi_xyz"][vis_i].cpu().numpy().transpose(1, 2, 0) + gt_xyz_vis = get_emb_show(gt_xyz_vis) + tbx_writer.add_image("gt_xyz", gt_xyz_vis, iteration) + + out_mask = out_dict["mask"].detach() + out_mask = get_out_mask(cfg, out_mask) + out_mask_vis = out_mask[vis_i, 0].cpu().numpy() + 
tbx_writer.add_image("out_mask", out_mask_vis, iteration) + + gt_mask_vis = batch["roi_mask"][vis_i].detach().cpu().numpy() + tbx_writer.add_image("gt_mask", gt_mask_vis, iteration) + periodic_checkpointer.step(iteration, epoch=epoch) diff --git a/core/gdrn_modeling/main_gdrn.py b/core/gdrn_modeling/main_gdrn.py index 452ffa39..bf27a6f1 100644 --- a/core/gdrn_modeling/main_gdrn.py +++ b/core/gdrn_modeling/main_gdrn.py @@ -1,25 +1,22 @@ import logging +from loguru import logger as loguru_logger import os import os.path as osp import sys from setproctitle import setproctitle import torch -from torch.nn.parallel import DistributedDataParallel -from detectron2.engine import default_setup, launch + from mmcv import Config import cv2 +from pytorch_lightning import seed_everything +from pytorch_lightning.lite import LightningLite # import LightningLite cv2.setNumThreads(0) # pytorch issue 1355: possible deadlock in dataloader # OpenCL may be enabled by default in OpenCV3; disable it because it's not # thread safe and causes unwanted GPU memory allocations. 
cv2.ocl.setUseOpenCL(False) -try: - import horovod.torch as hvd -except ImportError: - print("You requested to import horovod which is missing or not supported for your OS.") - cur_dir = osp.dirname(osp.abspath(__file__)) sys.path.insert(0, osp.join(cur_dir, "../../")) from core.utils.default_args_setup import my_default_argument_parser, my_default_setup @@ -28,12 +25,11 @@ from core.utils import my_comm as comm from lib.utils.utils import iprint -from lib.utils.setup_logger import setup_my_logger from lib.utils.time_utils import get_time_str from core.gdrn_modeling.dataset_factory import register_datasets_in_cfg -from core.gdrn_modeling.engine import do_test, do_train -from core.gdrn_modeling.models import GDRN +from core.gdrn_modeling.engine import GDRN_Lite +from core.gdrn_modeling.models import GDRN # noqa logger = logging.getLogger("detectron2") @@ -60,13 +56,15 @@ def setup(args): iprint("Disable AMP for older GPUs") cfg.SOLVER.AMP.ENABLED = False - # NOTE: pop some unwanterd configs in detectron2 + # NOTE: pop some unwanted configs in detectron2 + # --------------------------------------------------------- cfg.SOLVER.pop("STEPS", None) cfg.SOLVER.pop("MAX_ITER", None) # NOTE: get optimizer from string cfg dict if cfg.SOLVER.OPTIMIZER_CFG != "": if isinstance(cfg.SOLVER.OPTIMIZER_CFG, str): optim_cfg = eval(cfg.SOLVER.OPTIMIZER_CFG) + cfg.SOLVER.OPTIMIZER_CFG = optim_cfg else: optim_cfg = cfg.SOLVER.OPTIMIZER_CFG iprint("optimizer_cfg:", optim_cfg) @@ -74,6 +72,7 @@ def setup(args): cfg.SOLVER.BASE_LR = optim_cfg["lr"] cfg.SOLVER.MOMENTUM = optim_cfg.get("momentum", 0.9) cfg.SOLVER.WEIGHT_DECAY = optim_cfg.get("weight_decay", 1e-4) + # ------------------------------------------------------------------------- if cfg.get("DEBUG", False): iprint("DEBUG") args.num_gpus = 1 @@ -94,33 +93,53 @@ def setup(args): cfg.EXP_ID = exp_id cfg.RESUME = args.resume #################################### - my_default_setup(cfg, args) - # Setup logger - 
setup_for_distributed(is_master=comm.is_main_process()) - setup_my_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="core") - setup_my_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="lib") return cfg -def main(args): - cfg = setup(args) +class Lite(GDRN_Lite): + def set_my_env(self, args, cfg): + my_default_setup(cfg, args) # will set os.environ["PYTHONHASHSEED"] + seed_everything(int(os.environ["PYTHONHASHSEED"])) + setup_for_distributed(is_master=self.is_global_zero) - logger.info(f"Used CDPN module name: {cfg.MODEL.CDPN.NAME}") - model, optimizer = eval(cfg.MODEL.CDPN.NAME).build_model_optimizer(cfg) - logger.info("Model:\n{}".format(model)) + def run(self, args, cfg): + self.set_my_env(args, cfg) - if args.eval_only: - MyCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume) - return do_test(cfg, model) + logger.info(f"Used GDRN module name: {cfg.MODEL.CDPN.NAME}") + model, optimizer = eval(cfg.MODEL.CDPN.NAME).build_model_optimizer(cfg) + logger.info("Model:\n{}".format(model)) + + # don't forget to call `setup` to prepare for model / optimizer for distributed training. + # the model is moved automatically to the right device. 
+ model, optimizer = self.setup(model, optimizer) + + if True: + # sum(p.numel() for p in model.parameters() if p.requires_grad) + params = sum(p.numel() for p in model.parameters()) / 1e6 + logger.info("{}M params".format(params)) - distributed = comm.get_world_size() > 1 - if distributed and not args.use_hvd: - model = DistributedDataParallel( - model, device_ids=[comm.get_local_rank()], broadcast_buffers=False, find_unused_parameters=True - ) + if args.eval_only: + MyCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume) + return self.do_test(cfg, model) - do_train(cfg, args, model, optimizer, resume=args.resume) - return do_test(cfg, model) + self.do_train(cfg, args, model, optimizer, resume=args.resume) + return self.do_test(cfg, model) + + +@loguru_logger.catch +def main(args): + cfg = setup(args) + + logger.info(f"start to train with {args.num_machines} nodes and {args.num_gpus} GPUs") + if args.num_gpus > 1 and args.strategy is None: + args.strategy = "ddp" + Lite( + accelerator="gpu", + strategy=args.strategy, + devices=args.num_gpus, + num_nodes=args.num_machines, + precision=16 if cfg.SOLVER.AMP.ENABLED else 32, + ).run(args, cfg) if __name__ == "__main__": @@ -133,28 +152,17 @@ def main(args): iprint("soft limit: ", soft_limit, "hard limit: ", hard_limit) resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) - args = my_default_argument_parser().parse_args() - iprint("Command Line Args:", args) + parser = my_default_argument_parser() + parser.add_argument( + "--strategy", + default=None, + type=str, + help="the strategy for parallel training: dp | ddp | ddp_spawn | deepspeed | ddp_sharded", + ) + args = parser.parse_args() + iprint("Command Line Args: {}".format(args)) if args.eval_only: torch.multiprocessing.set_sharing_strategy("file_system") - USE_HVD = False - if args.use_hvd: - if comm.HVD_AVAILABLE: - iprint("Using horovod") - comm.init_hvd() - USE_HVD = True - main(args) - else: - 
iprint("horovod is not available. Fall back to default setting.") - - if not USE_HVD: - launch( - main, - args.num_gpus, - num_machines=args.num_machines, - machine_rank=args.machine_rank, - dist_url=args.dist_url, - args=(args,), - ) + main(args) diff --git a/core/gdrn_modeling/test_utils.py b/core/gdrn_modeling/test_utils.py index 1d55161d..d5d6437d 100644 --- a/core/gdrn_modeling/test_utils.py +++ b/core/gdrn_modeling/test_utils.py @@ -346,7 +346,7 @@ def load_and_print_val_scores_tab( if __name__ == "__main__": import argparse from mmcv import Config, DictAction - from lib.utils.setup_logger import setup_my_logger + from lib.utils.setup_logger import setup_logger parser = argparse.ArgumentParser(description="wrapper functions to evaluate with bop toolkit") parser.add_argument( @@ -381,8 +381,7 @@ def load_and_print_val_scores_tab( else: obj_ids = args.obj_ids result_dir = args.result_dir - setup_my_logger(name="core") - setup_my_logger(name="__main__") + setup_logger() result_names_str = args.result_names if "," not in result_names_str: result_names = [result_names_str] diff --git a/core/utils/default_args_setup.py b/core/utils/default_args_setup.py index 04f261f0..91c9cbbc 100644 --- a/core/utils/default_args_setup.py +++ b/core/utils/default_args_setup.py @@ -1,15 +1,17 @@ +import logging import argparse import os import os.path as osp import sys +import mmcv from mmcv import DictAction import torch - +from loguru import logger from detectron2.utils.env import seed_all_rng from fvcore.common.file_io import PathManager from detectron2.utils.collect_env import collect_env_info -from detectron2.utils.logger import setup_logger from core.utils import my_comm as comm +from lib.utils.setup_logger import setup_logger def my_default_argument_parser(epilog=None): @@ -78,11 +80,11 @@ def my_default_setup(cfg, args): """ output_dir = cfg.OUTPUT_DIR if comm.is_main_process() and output_dir: - PathManager.mkdirs(output_dir) + mmcv.mkdir_or_exist(output_dir) rank = 
comm.get_rank() - setup_logger(output_dir, distributed_rank=rank, name="fvcore") - logger = setup_logger(output_dir, distributed_rank=rank) + setup_logger(output_dir, distributed_rank=rank) + logging.getLogger("PIL").setLevel(logging.INFO) logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size())) logger.info("Environment info:\n" + collect_env_info()) diff --git a/core/utils/my_comm.py b/core/utils/my_comm.py index 1d055897..ac5950d7 100644 --- a/core/utils/my_comm.py +++ b/core/utils/my_comm.py @@ -4,177 +4,20 @@ import logging import pickle -try: - import horovod.torch as hvd -except (ModuleNotFoundError, ImportError): - HVD_AVAILABLE = False -else: - HVD_AVAILABLE = True - - -global _USE_HVD -_USE_HVD = False - def reduce_dict(input_dict, average=True): - global _USE_HVD - if _USE_HVD: - return reduce_dict_hvd(input_dict, average=average) return comm.reduce_dict(input_dict, average=average) -def reduce_dict_hvd(input_dict, average=True): - """ - Args: - input_dict (dict): all the values will be reduced - average (bool): whether to do average or sum - Reduce the values in the dictionary from all processes so that all processes - have the averaged results. Returns a dict with the same fields as - input_dict, after reduction. 
- """ - global _USE_HVD - world_size = get_world_size() - if world_size < 2: - return input_dict - with torch.no_grad(): - names = [] - values = [] - # sort the keys so that they are consistent across processes - for k in sorted(input_dict.keys()): - names.append(k) - values.append(input_dict[k]) - values = torch.stack(values, dim=0) - if _USE_HVD: # TODO: check this in hvd - hvd.allreduce_(values, op=hvd.Average if average else hvd.Adasum, name="reduce_dict") - else: - dist.all_reduce(values) - if average: - values /= world_size - reduced_dict = {k: v for k, v in zip(names, values)} - return reduced_dict - - def all_gather(data, group=None): - global _USE_HVD - if _USE_HVD: - return all_gather_hvd(data, group=group) return comm.all_gather(data, group=group) def synchronize(): - global _USE_HVD - if _USE_HVD: - hvd.broadcast_object(0) - return return comm.synchronize() -def all_gather_hvd(data, group=None): - global _USE_HVD - assert _USE_HVD, f"_USE_HVD: {_USE_HVD}" - world_size = get_world_size() - if world_size == 1: - return [data] - - tensor = _serialize_to_tensor(data, group) - - size_list, tensor = _pad_to_largest_tensor(tensor, group) - max_size = max(size_list) - - # receiving Tensor from all ranks - tensor_list = [torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list] - if _USE_HVD: - # NOTE: concatenated on the first dimension - tensor_list = hvd.allgather( - tensor[ - None, - ] - ) - else: - dist.all_gather(tensor_list, tensor, group=group) - - data_list = [] - for size, tensor in zip(size_list, tensor_list): - buffer = tensor.cpu().numpy().tobytes()[:size] - data_list.append(pickle.loads(buffer)) - - return data_list - - -def _serialize_to_tensor(data, group): - global _USE_HVD - if _USE_HVD: - backend = "nccl" - else: - backend = dist.get_backend(group) - assert backend in ["gloo", "nccl"] - device = torch.device("cpu" if backend == "gloo" else "cuda") - - buffer = pickle.dumps(data) - if len(buffer) > 1024 ** 3: - 
logger = logging.getLogger(__name__) - logger.warning( - "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( - get_rank(), len(buffer) / (1024 ** 3), device - ) - ) - storage = torch.ByteStorage.from_buffer(buffer) - tensor = torch.ByteTensor(storage).to(device=device) - return tensor - - -def _pad_to_largest_tensor(tensor, group): - """ - Returns: - list[int]: size of the tensor, on each rank - Tensor: padded tensor that has the max size - """ - global _USE_HVD - if _USE_HVD: - world_size = get_world_size() - else: - world_size = dist.get_world_size(group=group) - assert world_size >= 1, "comm.gather/all_gather must be called from ranks within the given group!" - local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) - size_list = [torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size)] - if _USE_HVD: - size_list = hvd.allgather(local_size) # a tensor with (world_size,) actually - else: - dist.all_gather(size_list, local_size, group=group) - size_list = [int(size.item()) for size in size_list] - - max_size = max(size_list) - - # we pad the tensor because torch all_gather does not support - # gathering tensors of different shapes - if local_size != max_size: - padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device) - tensor = torch.cat((tensor, padding), dim=0) - return size_list, tensor - - -def init_hvd(): - global _USE_HVD - if _USE_HVD: - return True - if not HVD_AVAILABLE: - raise RuntimeError("horovod is not available") - else: - hvd.init() - _USE_HVD = True - if not torch.cuda.is_available(): - raise RuntimeError("cuda is not available!") - # Horovod: pin GPU to local rank. 
- local_rank = get_local_rank() - assert local_rank < torch.cuda.device_count() - torch.cuda.set_device(local_rank) - return True - - def is_dist_avail_and_initialized(): - global _USE_HVD - if _USE_HVD: - return True if not dist.is_available(): return False if not dist.is_initialized(): @@ -187,30 +30,18 @@ def shared_random_seed(): def get_world_size(): - global _USE_HVD - if _USE_HVD: - return hvd.size() return comm.get_world_size() def get_rank(): - global _USE_HVD - if _USE_HVD: - return hvd.rank() return comm.get_rank() def get_local_rank(): - global _USE_HVD - if _USE_HVD: - return hvd.local_rank() return comm.get_local_rank() def get_local_size(): - global _USE_HVD - if _USE_HVD: - return hvd.local_size() return comm.get_local_size() diff --git a/lib/utils/mask_utils.py b/lib/utils/mask_utils.py index 38a0c224..1d61fdf0 100644 --- a/lib/utils/mask_utils.py +++ b/lib/utils/mask_utils.py @@ -36,12 +36,6 @@ def get_edge(mask, bw=1, out_channel=3): return edges -def read_mask_np(mask_path, dtype=np.uint8): - mask = Image.open(mask_path) - mask_seg = np.array(mask).astype(dtype) - return mask_seg - - def mask2bbox_xyxy(mask): """NOTE: the bottom right point is included""" ys, xs = np.nonzero(mask)[:2] diff --git a/lib/utils/setup_logger.py b/lib/utils/setup_logger.py index 739c48dc..5a8fb222 100644 --- a/lib/utils/setup_logger.py +++ b/lib/utils/setup_logger.py @@ -1,204 +1,181 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -# modified from detectron2.utils.logger -# support showing line number and debug mode color -import functools -import logging +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
+# modified by Gu Wang +import inspect import os -import os.path as osp import sys +import logging +from loguru import logger +import time from collections import Counter -from fvcore.common.file_io import PathManager -from tabulate import tabulate +import warnings from termcolor import colored -import datetime - - -def _get_time_str(): - # return datetime.now().strftime("%Y%m%d-%H%M%S") - return datetime.now().strftime("%Y%m%d_%H%M%S") - - -class _ColorfulFormatter(logging.Formatter): - def __init__(self, *args, **kwargs): - self._root_name = kwargs.pop("root_name") + "." - self._abbrev_name = kwargs.pop("abbrev_name", "") - if len(self._abbrev_name): - self._abbrev_name = self._abbrev_name + "." - super(_ColorfulFormatter, self).__init__(*args, **kwargs) - - def formatMessage(self, record): - record.name = record.name.replace(self._root_name, self._abbrev_name) - log = super(_ColorfulFormatter, self).formatMessage(record) - if record.levelno == logging.WARNING: - prefix = colored("WRN", "red", attrs=["blink"]) - elif record.levelno == logging.DEBUG: - prefix = colored("DBG", "yellow", attrs=["blink"]) - elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL: - prefix = colored("ERROR", "red", attrs=["blink", "underline"]) - else: - return log - return prefix + " " + log - - -@functools.lru_cache() # so that calling setup_my_logger multiple times won't add many handlers -def setup_my_logger(output=None, distributed_rank=0, *, color=True, name="mylib", abbrev_name=None): - """ - Args: - output (str): a file name or a directory to save log. If None, will not save log file. - If ends with ".txt" or ".log", assumed to be a file name. - Otherwise, logs will be saved to `output/log.txt`. - name (str): the root module name of this logger - abbrev_name (str): an abbreviation of the module, to avoid long names in logs. - Set to "" to not log the root module in logs. 
- By default, will abbreviate: - detectron2 --> d2 - mylib --> lib - """ - logger = logging.getLogger(name) - logger.setLevel(logging.DEBUG) - logger.propagate = False - - if abbrev_name is None: - if name == "mylib": - abbrev_name = "lib" - elif name == "detectron2": - abbrev_name = "d2" - else: - abbrev_name = name - - plain_formatter = logging.Formatter( - "[%(asctime)s] %(name)s %(levelname)s@%(lineno)d: %(message)s", datefmt="%m%d_%H%M%S" - ) - # stdout logging: master only - if distributed_rank == 0: - ch = logging.StreamHandler(stream=sys.stdout) - ch.setLevel(logging.DEBUG) - if color: - formatter = _ColorfulFormatter( - colored("[%(asctime)s %(name)s@%(lineno)d]: ", "green") + "%(message)s", - datefmt="%m%d_%H%M%S", - root_name=name, - abbrev_name=str(abbrev_name), - ) - else: - formatter = plain_formatter - ch.setFormatter(formatter) - logger.addHandler(ch) +from functools import partial - # file logging: all workers - if output is not None: - if output.endswith(".txt") or output.endswith(".log"): - filename = output - else: - filename = osp.join(output, "log.txt") - if distributed_rank > 0: - filename = filename + ".rank{}".format(distributed_rank) - PathManager.mkdirs(osp.dirname(filename)) - - fh = logging.StreamHandler(_cached_log_stream(filename)) - fh.setLevel(logging.DEBUG) - fh.setFormatter(plain_formatter) - logger.addHandler(fh) - return logger +class InterceptHandler(logging.Handler): + # https://github.com/Delgan/loguru#entirely-compatible-with-standard-logging + def emit(self, record): + # Get corresponding Loguru level if it exists + try: + level = logger.level(record.levelname).name + except ValueError: + level = record.levelno + # Find caller from where originated the logged message + frame, depth = logging.currentframe(), 2 + while frame.f_code.co_filename == logging.__file__: + frame = frame.f_back + depth += 1 -# cache the opened file object, so that different calls to `setup_logger` -# with the same file name can safely write to the 
same file. -@functools.lru_cache(maxsize=None) -def _cached_log_stream(filename): - return PathManager.open(filename, "a") + logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage()) -""" -Below are some other convenient logging methods. -They are mainly adopted from -https://github.com/abseil/abseil-py/blob/master/absl/logging/__init__.py -""" +def setup_intercept(): + logging.basicConfig(handlers=[InterceptHandler()], level=0) -def _find_caller(): +def get_caller_name(depth=0): """ + Args: + depth (int): Depth of caller context, use 0 for caller depth. Default value: 0. + Returns: str: module name of the caller - tuple: a hashable key to be used to identify different callers """ - frame = sys._getframe(2) - while frame: - code = frame.f_code - if ( - osp.join("utils", "setup_logger.") not in code.co_filename - and osp.join("utils", "logger.") not in code.co_filename - ): - mod_name = frame.f_globals["__name__"] - if mod_name == "__main__": - mod_name = "lib" - return mod_name, (code.co_filename, frame.f_lineno, code.co_name) - frame = frame.f_back + # the following logic is a little bit faster than inspect.stack() logic + frame = inspect.currentframe().f_back + # caller = inspect.getframeinfo(inspect.stack()[1][0]) + # import ipdb; ipdb.set_trace() + for _ in range(depth): + if frame.f_back is not None: + frame = frame.f_back + + return frame.f_globals["__name__"] + + +class StreamToLoguru: + """stream object that redirects writes to a logger instance.""" + + def __init__(self, level="INFO", caller_names=("apex", "pycocotools"), stream_logger=None): + """ + Args: + level(str): log level string of loguru. Default value: "INFO". + caller_names(tuple): caller names of redirected module. + Default value: (apex, pycocotools).
+ """ + self._logger = logger if stream_logger is None else stream_logger + self.level = level + self.linebuf = "" + self.caller_names = caller_names + + def write(self, buf): + full_name = get_caller_name(depth=1) + module_name = full_name.rsplit(".", maxsplit=-1)[0] + if module_name in self.caller_names: + for line in buf.rstrip().splitlines(): + # use caller level log + if module_name in ["__main__"]: + log_depth = -1 + else: + log_depth = 2 + if isinstance(self._logger, logging.Logger): + self._logger.log(logging.getLevelName(self.level), line.rstrip()) + else: + self._logger.opt(depth=log_depth).log(self.level, line.rstrip()) + else: + sys.__stdout__.write(buf) + def flush(self): + pass -_LOG_COUNTER = Counter() +def redirect_sys_output( + log_level="INFO", caller_names=("apex", "pycocotools", "__main__"), stdout_logger=None, stderr_logger=None +): + # logging.getLogger("STDOUT") + # stderr_logger = None # logging.getLogger("STDERR") + sys.stdout = StreamToLoguru(log_level, caller_names=caller_names, stream_logger=stdout_logger) + sys.stderr = StreamToLoguru(log_level, caller_names=caller_names, stream_logger=stderr_logger) -def log_first_n(lvl, msg, n=1, *, name=None, key="caller"): - """Log only for the first n times. +def setup_logger( + output=None, + distributed_rank=0, + log_level="DEBUG", + redirect_sys_out_callers=("apex", "pycocotools", "__main__"), +): + """setup logger for training and testing. Args: - lvl (int): the logging level - msg (str): - n (int): - name (str): name of the logger to use. Will use the caller's module by default. - key (str or tuple[str]): the string(s) can be one of "caller" or - "message", which defines how to identify duplicated logs. - For example, if called with `n=1, key="caller"`, this function - will only log the first call from the same caller, regardless of - the message content. - If called with `n=1, key="message"`, this function will log the - same content only once, even if they are called from different places. 
- If called with `n=1, key=("caller", "message")`, this function - will not log only if the same caller has logged the same message before. - """ - if isinstance(key, str): - key = (key,) - assert len(key) > 0 - - caller_module, caller_key = _find_caller() - hash_key = () - if "caller" in key: - hash_key = hash_key + caller_key - if "message" in key: - hash_key = hash_key + (msg,) - - _LOG_COUNTER[hash_key] += 1 - if _LOG_COUNTER[hash_key] <= n: - logging.getLogger(name or caller_module).log(lvl, msg) - - -def log_every_n(lvl, msg, n=1, *, name=None): - """Log once per n times. + output(str): log file path (`.txt`/`.log`) or a directory that receives `log.txt`; None disables file logging. + distributed_rank(int): process rank; only rank 0 logs to stderr, other ranks append `.rank{N}` to the log file name. + log_level(str): loguru level used for the stderr and file sinks. Default value: "DEBUG". + redirect_sys_out_callers(tuple): modules whose stdout/stderr writes are redirected to loguru. - Args: - lvl (int): the logging level - msg (str): - n (int): - name (str): name of the logger to use. Will use the caller's module by default. + Return: + None; the sinks are installed on the global loguru `logger`. """ - caller_module, key = _find_caller() - _LOG_COUNTER[key] += 1 - if n == 1 or _LOG_COUNTER[key] % n == 1: - logging.getLogger(name or caller_module).log(lvl, msg) - - -def create_small_table(small_dict): - """Create a small table using the keys of small_dict as headers. This is - only suitable for small dictionaries. - Args: - small_dict (dict): a result dictionary of only a few items.
+ def formatter(record, is_file=False): + level_name = record["level"].name + level_name_map = { + "INFO": "", # "INF", + "WARNING": "{}|".format(colored("WRN", "red", attrs=["blink"])), + "ERROR": "{}|".format(colored("ERR", "red", attrs=["blink", "underline"])), + "CRITICAL": "{}|".format(colored("ERR", "red", attrs=["blink", "underline"])), + "DEBUG": "{}|".format(colored("DBG", "yellow", attrs=["blink"])), + } + + level_abbr = level_name_map.get(level_name, f"{level_name}|") + + caller_name = record["name"] + # print(record["file"].name, record["file"].path) + # print(record) + # print(get_caller_name(3)) + if caller_name.startswith("detectron2."): + caller_abbr = caller_name.replace("detectron2.", "d2.") + else: + caller_abbr = caller_name + if is_file: + func_name = ":{function}" + else: + func_name = "" + loguru_format = ( + "{time:YYYYMMDD_HHmmss}|" + "%s" + "%s%s@{line}: {message}" + "\n{exception}" + ) % (level_abbr, caller_abbr, func_name) + return loguru_format + + logger.remove() # Remove the pre-configured handler + # only keep logger in rank0 process + if distributed_rank == 0: + logger.add( + sys.stderr, + format=formatter, + level=log_level, + enqueue=True, + ) - Returns: - str: the table as a string. 
- """ - keys, values = tuple(zip(*small_dict.items())) - table = tabulate([values], headers=keys, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center") - return table + # file logging: all workers + if output is not None: + if output.endswith(".txt") or output.endswith(".log"): + filename = output + else: + filename = os.path.join(output, "log.txt") + if distributed_rank > 0: + filename = filename + ".rank{}".format(distributed_rank) + os.makedirs(os.path.dirname(filename), exist_ok=True) + logger.add( + filename, + format=partial(formatter, is_file=True), + level=log_level, + enqueue=True, + ) + + setup_intercept() + + # redirect stdout/stderr to loguru + redirect_sys_output("INFO", caller_names=redirect_sys_out_callers) diff --git a/scripts/format_code.sh b/scripts/format_code.sh index c6d69667..bfea91c9 100755 --- a/scripts/format_code.sh +++ b/scripts/format_code.sh @@ -7,7 +7,7 @@ # [ "$2" = "$(echo -e "$1\\n$2" | sort -V | head -n1)" ] #} -BLACK_VERSION="21.7b0" +BLACK_VERSION="21.11b1" { black --version | grep -E $BLACK_VERSION > /dev/null } || { diff --git a/scripts/install_deps.sh b/scripts/install_deps.sh index 0f9bf5a1..78a1117b 100644 --- a/scripts/install_deps.sh +++ b/scripts/install_deps.sh @@ -13,7 +13,6 @@ pip install plyfile pip install pycocotools # or install the nvidia version which is cpp-accelerated - pip install cffi pip install ninja pip install setproctitle @@ -39,5 +38,9 @@ pip install open3d pip install fvcore pip install tensorboardX +pip install pytorch-lightning # 1.6.0.dev0 +pip install fairscale +pip install deepspeed + pip uninstall pillow CC="cc -mavx2" pip install -U --force-reinstall pillow-simd From 031a5cb2cb73418ccd8664420285458f3334ff79 Mon Sep 17 00:00:00 2001 From: Gu Wang Date: Thu, 25 Nov 2021 15:36:21 +0800 Subject: [PATCH 2/3] use lite for training (also for ddp) --- configs/gdrn/lm/a6_cPnP_lm13_lite.py | 70 --------------------- configs/gdrn/lm/a6_cPnP_lm13_lite_2gpus.py | 70 --------------------- 
core/gdrn_modeling/data_loader.py | 6 +- core/gdrn_modeling/engine.py | 8 +++ core/gdrn_modeling/engine_utils.py | 3 +- core/gdrn_modeling/gdrn_custom_evaluator.py | 4 +- core/gdrn_modeling/gdrn_evaluator.py | 2 + core/gdrn_modeling/main_gdrn.py | 2 +- core/gdrn_modeling/test_gdrn.sh | 4 ++ core/utils/default_args_setup.py | 3 +- core/utils/my_checkpoint.py | 13 ++++ scripts/install_deps.sh | 34 +--------- 12 files changed, 39 insertions(+), 180 deletions(-) delete mode 100644 configs/gdrn/lm/a6_cPnP_lm13_lite.py delete mode 100644 configs/gdrn/lm/a6_cPnP_lm13_lite_2gpus.py diff --git a/configs/gdrn/lm/a6_cPnP_lm13_lite.py b/configs/gdrn/lm/a6_cPnP_lm13_lite.py deleted file mode 100644 index 8e1bbf04..00000000 --- a/configs/gdrn/lm/a6_cPnP_lm13_lite.py +++ /dev/null @@ -1,70 +0,0 @@ -_base_ = ["../../_base_/gdrn_base.py"] - -OUTPUT_DIR = "output/gdrn/lm/a6_cPnP_lm13_lite" -INPUT = dict( - DZI_PAD_SCALE=1.5, - COLOR_AUG_PROB=0.0, - COLOR_AUG_TYPE="code", - COLOR_AUG_CODE=( - "Sequential([" - "Sometimes(0.4, CoarseDropout( p=0.1, size_percent=0.05) )," - # "Sometimes(0.5, Affine(scale=(1.0, 1.2)))," - "Sometimes(0.5, GaussianBlur(np.random.rand()))," - "Sometimes(0.5, Add((-20, 20), per_channel=0.3))," - "Sometimes(0.4, Invert(0.20, per_channel=True))," - "Sometimes(0.5, Multiply((0.7, 1.4), per_channel=0.8))," - "Sometimes(0.5, Multiply((0.7, 1.4)))," - "Sometimes(0.5, ContrastNormalization((0.5, 2.0), per_channel=0.3))" - "], random_order=False)" - ), -) - -SOLVER = dict( - IMS_PER_BATCH=24, - LR_SCHEDULER_NAME="flat_and_anneal", - ANNEAL_METHOD="cosine", # "cosine" - ANNEAL_POINT=0.72, - # REL_STEPS=(0.3125, 0.625, 0.9375), - OPTIMIZER_CFG=dict(_delete_=True, type="Ranger", lr=1e-4, weight_decay=0), - WEIGHT_DECAY=0.0, - WARMUP_FACTOR=0.001, - WARMUP_ITERS=1000, -) - -DATASETS = dict( - TRAIN=("lm_13_train", "lm_imgn_13_train_1k_per_obj"), - TEST=("lm_13_test",), - DET_FILES_TEST=("datasets/BOP_DATASETS/lm/test/test_bboxes/bbox_faster_all.json",), -) - -MODEL = 
dict( - LOAD_DETS_TEST=True, - PIXEL_MEAN=[0.0, 0.0, 0.0], - PIXEL_STD=[255.0, 255.0, 255.0], - CDPN=dict( - ROT_HEAD=dict( - FREEZE=False, - ROT_CLASS_AWARE=False, - MASK_CLASS_AWARE=False, - XYZ_LW=1.0, - REGION_CLASS_AWARE=False, - NUM_REGIONS=64, - ), - PNP_NET=dict( - R_ONLY=False, - REGION_ATTENTION=True, - WITH_2D_COORD=True, - ROT_TYPE="allo_rot6d", - TRANS_TYPE="centroid_z", - PM_NORM_BY_EXTENT=True, - PM_R_ONLY=True, - CENTROID_LOSS_TYPE="L1", - CENTROID_LW=1.0, - Z_LOSS_TYPE="L1", - Z_LW=1.0, - ), - TRANS_HEAD=dict(FREEZE=True), - ), -) - -TEST = dict(EVAL_PERIOD=0, VIS=False, TEST_BBOX_TYPE="est") # gt | est diff --git a/configs/gdrn/lm/a6_cPnP_lm13_lite_2gpus.py b/configs/gdrn/lm/a6_cPnP_lm13_lite_2gpus.py deleted file mode 100644 index 44784ceb..00000000 --- a/configs/gdrn/lm/a6_cPnP_lm13_lite_2gpus.py +++ /dev/null @@ -1,70 +0,0 @@ -_base_ = ["../../_base_/gdrn_base.py"] - -OUTPUT_DIR = "output/gdrn/lm/a6_cPnP_lm13_lite_2gpus" -INPUT = dict( - DZI_PAD_SCALE=1.5, - COLOR_AUG_PROB=0.0, - COLOR_AUG_TYPE="code", - COLOR_AUG_CODE=( - "Sequential([" - "Sometimes(0.4, CoarseDropout( p=0.1, size_percent=0.05) )," - # "Sometimes(0.5, Affine(scale=(1.0, 1.2)))," - "Sometimes(0.5, GaussianBlur(np.random.rand()))," - "Sometimes(0.5, Add((-20, 20), per_channel=0.3))," - "Sometimes(0.4, Invert(0.20, per_channel=True))," - "Sometimes(0.5, Multiply((0.7, 1.4), per_channel=0.8))," - "Sometimes(0.5, Multiply((0.7, 1.4)))," - "Sometimes(0.5, ContrastNormalization((0.5, 2.0), per_channel=0.3))" - "], random_order=False)" - ), -) - -SOLVER = dict( - IMS_PER_BATCH=48, - LR_SCHEDULER_NAME="flat_and_anneal", - ANNEAL_METHOD="cosine", # "cosine" - ANNEAL_POINT=0.72, - # REL_STEPS=(0.3125, 0.625, 0.9375), - OPTIMIZER_CFG=dict(_delete_=True, type="Ranger", lr=1e-4, weight_decay=0), - WEIGHT_DECAY=0.0, - WARMUP_FACTOR=0.001, - WARMUP_ITERS=1000, -) - -DATASETS = dict( - TRAIN=("lm_13_train", "lm_imgn_13_train_1k_per_obj"), - TEST=("lm_13_test",), - 
DET_FILES_TEST=("datasets/BOP_DATASETS/lm/test/test_bboxes/bbox_faster_all.json",), -) - -MODEL = dict( - LOAD_DETS_TEST=True, - PIXEL_MEAN=[0.0, 0.0, 0.0], - PIXEL_STD=[255.0, 255.0, 255.0], - CDPN=dict( - ROT_HEAD=dict( - FREEZE=False, - ROT_CLASS_AWARE=False, - MASK_CLASS_AWARE=False, - XYZ_LW=1.0, - REGION_CLASS_AWARE=False, - NUM_REGIONS=64, - ), - PNP_NET=dict( - R_ONLY=False, - REGION_ATTENTION=True, - WITH_2D_COORD=True, - ROT_TYPE="allo_rot6d", - TRANS_TYPE="centroid_z", - PM_NORM_BY_EXTENT=True, - PM_R_ONLY=True, - CENTROID_LOSS_TYPE="L1", - CENTROID_LW=1.0, - Z_LOSS_TYPE="L1", - Z_LW=1.0, - ), - TRANS_HEAD=dict(FREEZE=True), - ), -) - -TEST = dict(EVAL_PERIOD=0, VIS=False, TEST_BBOX_TYPE="est") # gt | est diff --git a/core/gdrn_modeling/data_loader.py b/core/gdrn_modeling/data_loader.py index c4ed8f9e..6a535587 100644 --- a/core/gdrn_modeling/data_loader.py +++ b/core/gdrn_modeling/data_loader.py @@ -442,12 +442,12 @@ def read_data(self, dataset_dict): for _key in roi_keys: if _key in ["roi_img", "roi_coord_2d"]: - dataset_dict[_key] = torch.as_tensor(roi_infos[_key]).contiguous() + dataset_dict[_key] = torch.as_tensor(np.array(roi_infos[_key])).contiguous() elif _key in ["model_info", "scene_im_id", "file_name"]: # can not convert to tensor dataset_dict[_key] = roi_infos[_key] else: - dataset_dict[_key] = torch.tensor(roi_infos[_key]) + dataset_dict[_key] = torch.as_tensor(np.array(roi_infos[_key])) return dataset_dict ####################################################################################### @@ -458,7 +458,7 @@ def read_data(self, dataset_dict): # extent roi_extent = self._get_extents(dataset_name)[roi_cls] - dataset_dict["roi_extent"] = torch.tensor(roi_extent, dtype=torch.float32) + dataset_dict["roi_extent"] = torch.as_tensor(np.array(roi_extent), dtype=torch.float32) # load xyz ======================================================= xyz_info = mmcv.load(inst_infos["xyz_path"]) diff --git a/core/gdrn_modeling/engine.py 
b/core/gdrn_modeling/engine.py index e18b15e0..24dbbecf 100644 --- a/core/gdrn_modeling/engine.py +++ b/core/gdrn_modeling/engine.py @@ -2,6 +2,7 @@ import os import os.path as osp import torch + import mmcv import time import cv2 @@ -193,6 +194,7 @@ def do_train(self, cfg, args, model, optimizer, resume=False): ) if hasattr(self._precision_plugin, "scaler"): extra_ckpt_dict["gradscaler"] = self._precision_plugin.scaler + checkpointer = MyCheckpointer( model, cfg.OUTPUT_DIR, @@ -322,4 +324,10 @@ def do_train(self, cfg, args, model, optimizer, resume=False): gt_mask_vis = batch["roi_mask"][vis_i].detach().cpu().numpy() tbx_writer.add_image("gt_mask", gt_mask_vis, iteration) + + if (iteration + 1) % periodic_checkpointer.period == 0 or ( + periodic_checkpointer.max_iter is not None and (iteration + 1) >= periodic_checkpointer.max_iter + ): + if hasattr(optimizer, "consolidate_state_dict"): # for ddp_sharded + optimizer.consolidate_state_dict() periodic_checkpointer.step(iteration, epoch=epoch) diff --git a/core/gdrn_modeling/engine_utils.py b/core/gdrn_modeling/engine_utils.py index 7e81a9cd..c1d9790d 100644 --- a/core/gdrn_modeling/engine_utils.py +++ b/core/gdrn_modeling/engine_utils.py @@ -62,7 +62,8 @@ def batch_data(cfg, data, device="cuda", phase="train"): def batch_data_test(cfg, data, device="cuda"): batch = {} - + if not isinstance(data, list): # bs = 1 + data = [data] # yapf: disable roi_keys = ["im_H", "im_W", "roi_img", "inst_id", "roi_coord_2d", "roi_cls", "score", "roi_extent", diff --git a/core/gdrn_modeling/gdrn_custom_evaluator.py b/core/gdrn_modeling/gdrn_custom_evaluator.py index bad01347..94d4c8ca 100644 --- a/core/gdrn_modeling/gdrn_custom_evaluator.py +++ b/core/gdrn_modeling/gdrn_custom_evaluator.py @@ -780,7 +780,9 @@ def _eval_predictions(self): f.write("{}\n".format(res_log_tab_str)) if self._distributed: - self._logger.warning("\n The current evaluation on multi-gpu is not correct, run with single-gpu instead.") + self._logger.warning( + 
"\n The current evaluation on multi-gpu might be incorrect, run with single-gpu instead." + ) return {} diff --git a/core/gdrn_modeling/gdrn_evaluator.py b/core/gdrn_modeling/gdrn_evaluator.py index af157ce9..e7045fa5 100644 --- a/core/gdrn_modeling/gdrn_evaluator.py +++ b/core/gdrn_modeling/gdrn_evaluator.py @@ -590,6 +590,8 @@ def gdrn_inference_on_dataset(cfg, model, data_loader, evaluator, amp_test=False start_compute_time = time.perf_counter() ############################# # process input + if not isinstance(inputs, list): # bs=1 + inputs = [inputs] batch = batch_data(cfg, inputs, phase="test") if evaluator.train_objs is not None: roi_labels = batch["roi_cls"].cpu().numpy().tolist() diff --git a/core/gdrn_modeling/main_gdrn.py b/core/gdrn_modeling/main_gdrn.py index bf27a6f1..e37cf06b 100644 --- a/core/gdrn_modeling/main_gdrn.py +++ b/core/gdrn_modeling/main_gdrn.py @@ -6,7 +6,6 @@ from setproctitle import setproctitle import torch - from mmcv import Config import cv2 from pytorch_lightning import seed_everything @@ -123,6 +122,7 @@ def run(self, args, cfg): return self.do_test(cfg, model) self.do_train(cfg, args, model, optimizer, resume=args.resume) + torch.multiprocessing.set_sharing_strategy("file_system") return self.do_test(cfg, model) diff --git a/core/gdrn_modeling/test_gdrn.sh b/core/gdrn_modeling/test_gdrn.sh index 1411255f..fe75fc13 100755 --- a/core/gdrn_modeling/test_gdrn.sh +++ b/core/gdrn_modeling/test_gdrn.sh @@ -11,6 +11,10 @@ IFS=',' read -ra GPUS <<< "$CUDA_VISIBLE_DEVICES" NGPU=${#GPUS[@]} # echo "${GPUS[0]}" echo "use gpu ids: $CUDA_VISIBLE_DEVICES num gpus: $NGPU" CKPT=$3 +if [ ! -f "$CKPT" ]; then + echo "$CKPT does not exist." 
+ exit 1 +fi NCCL_DEBUG=INFO OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 diff --git a/core/utils/default_args_setup.py b/core/utils/default_args_setup.py index 91c9cbbc..1c627307 100644 --- a/core/utils/default_args_setup.py +++ b/core/utils/default_args_setup.py @@ -84,7 +84,8 @@ def my_default_setup(cfg, args): rank = comm.get_rank() setup_logger(output_dir, distributed_rank=rank) - logging.getLogger("PIL").setLevel(logging.INFO) + for _mod in ["PIL", "chardet"]: # disable DEBUG logs + logging.getLogger(_mod).setLevel(logging.INFO) logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size())) logger.info("Environment info:\n" + collect_env_info()) diff --git a/core/utils/my_checkpoint.py b/core/utils/my_checkpoint.py index 73baecc7..bdb3be2f 100644 --- a/core/utils/my_checkpoint.py +++ b/core/utils/my_checkpoint.py @@ -2,6 +2,8 @@ from fvcore.common.file_io import PathManager from detectron2.checkpoint import DetectionCheckpointer from mmcv.runner import _load_checkpoint +from torch.nn.parallel import DataParallel, DistributedDataParallel +from pytorch_lightning.lite.wrappers import _LiteModule class MyCheckpointer(DetectionCheckpointer): @@ -10,6 +12,17 @@ class MyCheckpointer(DetectionCheckpointer): :class:`DetectronCheckpointer`, but is able to convert models in AdelaiDet, such as LPF backbone.""" + def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables): + # HACK: deal with lite model + if isinstance(model, (DistributedDataParallel, DataParallel, _LiteModule)): + model = model.module + super().__init__( + model, + save_dir, + save_to_disk=save_to_disk, + **checkpointables, + ) + def _load_file(self, filename): if filename.endswith(".pkl"): with PathManager.open(filename, "rb") as f: diff --git a/scripts/install_deps.sh b/scripts/install_deps.sh index 78a1117b..86c02393 100644 --- a/scripts/install_deps.sh +++ b/scripts/install_deps.sh @@ -8,39 +8,7 @@ sudo apt-get install libglfw3-dev libglfw3 sudo apt-get 
install libassimp-dev # conda install ipython -pip install cython -pip install plyfile - -pip install pycocotools # or install the nvidia version which is cpp-accelerated - -pip install cffi -pip install ninja -pip install setproctitle -pip install fastfunc -pip install meshplex -pip install OpenEXR -pip install "vispy>=0.6.4" -pip install tabulate -pip install pytest-runner -pip install pytest -pip install ipdb -pip install tqdm -pip install numba -pip install mmcv -pip install imagecorruptions -pip install pyassimp==4.1.3 # 4.1.4 will cause egl_renderer SegmentFault -pip install pypng -pip install albumentations -pip install transforms3d -pip install pyquaternion -pip install torchvision -pip install open3d -pip install fvcore -pip install tensorboardX - -pip install pytorch-lightning # 1.6.0.dev0 -pip install fairscale -pip install deepspeed +pip install -r requirements.txt pip uninstall pillow CC="cc -mavx2" pip install -U --force-reinstall pillow-simd From 9ebab1e8b732abbd32f480af3c0842d302d038f0 Mon Sep 17 00:00:00 2001 From: Gu Wang Date: Thu, 25 Nov 2021 15:41:23 +0800 Subject: [PATCH 3/3] minor --- core/gdrn_modeling/gdrn_custom_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/gdrn_modeling/gdrn_custom_evaluator.py b/core/gdrn_modeling/gdrn_custom_evaluator.py index 94c50bfb..eda0632f 100644 --- a/core/gdrn_modeling/gdrn_custom_evaluator.py +++ b/core/gdrn_modeling/gdrn_custom_evaluator.py @@ -843,5 +843,5 @@ def _eval_predictions_precision(self): with open(dump_tab_name, "w") as f: f.write("{}\n".format(res_log_tab_str)) if self._distributed: - self._logger.warning("\n The current evaluation on multi-gpu is not correct, run with single-gpu instead.") + self._logger.warning("\n The current evaluation on multi-gpu might be incorrect, run with single-gpu instead.") return {}