diff --git a/README.md b/README.md
index 9f64852ca..9cd3e3272 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
+
+

+
+

@@ -246,4 +250,4 @@ It is hoped that every AI practitioner in the world will stick to the concept of

 Without Dr. Jian Sun's guidance, YOLOX would not have come into being and been open-sourced to the community.
 Dr. Jian Sun's passing is a great loss to the CV community; we add this section here in remembrance of and in mourning for our "captain", Dr. Sun.
-It is hoped that every AI practitioner in the world will stick to the concept of "continuous innovation to expand cognitive boundaries, and extraordinary technology to achieve product value" and keep moving forward.
\ No newline at end of file
+It is hoped that every AI practitioner in the world will stick to the concept of "continuous innovation to expand cognitive boundaries, and extraordinary technology to achieve product value" and keep moving forward.
diff --git a/datasets/pedestrian_coco/annotations/train_annotations.json b/datasets/pedestrian_coco/annotations/train_annotations.json
new file mode 100644
index 000000000..e69de29bb
diff --git a/datasets/pedestrian_coco/annotations/valid_annotations.json b/datasets/pedestrian_coco/annotations/valid_annotations.json
new file mode 100644
index 000000000..e69de29bb
diff --git a/datasets/pedestrian_coco/train/README.md b/datasets/pedestrian_coco/train/README.md
new file mode 100644
index 000000000..7b9800885
--- /dev/null
+++ b/datasets/pedestrian_coco/train/README.md
@@ -0,0 +1 @@
+Put the training images in this directory.
\ No newline at end of file
diff --git a/datasets/pedestrian_coco/valid/README.md b/datasets/pedestrian_coco/valid/README.md
new file mode 100644
index 000000000..657f11b45
--- /dev/null
+++ b/datasets/pedestrian_coco/valid/README.md
@@ -0,0 +1 @@
+Put the validation images in this directory.
\ No newline at end of file
diff --git a/datasets/pedestrian_voc/Annotations/README.md b/datasets/pedestrian_voc/Annotations/README.md
new file mode 100644
index 000000000..2b9cfa086
--- /dev/null
+++ b/datasets/pedestrian_voc/Annotations/README.md
@@ -0,0 +1 @@
+Put the training and validation annotations in this directory.
\ No newline at end of file
diff --git a/datasets/pedestrian_voc/ImageSets/Main/train.txt b/datasets/pedestrian_voc/ImageSets/Main/train.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/datasets/pedestrian_voc/ImageSets/Main/valid.txt b/datasets/pedestrian_voc/ImageSets/Main/valid.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/datasets/pedestrian_voc/JPEGImages/README.md b/datasets/pedestrian_voc/JPEGImages/README.md
new file mode 100644
index 000000000..62ac03c99
--- /dev/null
+++ b/datasets/pedestrian_voc/JPEGImages/README.md
@@ -0,0 +1 @@
+Put the training and validation images in this directory.
\ No newline at end of file
diff --git a/exps/example/custom/nano.py b/exps/example/custom/coco_format/yolox_nano.py
similarity index 100%
rename from exps/example/custom/nano.py
rename to exps/example/custom/coco_format/yolox_nano.py
diff --git a/exps/example/custom/yolox_s.py b/exps/example/custom/coco_format/yolox_s.py
similarity index 100%
rename from exps/example/custom/yolox_s.py
rename to exps/example/custom/coco_format/yolox_s.py
diff --git a/exps/example/custom/voc_format/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc b/exps/example/custom/voc_format/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc
new file mode 100644
index 000000000..0b3dc1929
Binary files /dev/null and b/exps/example/custom/voc_format/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc differ
diff --git a/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py b/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py
new file mode 100644
index 000000000..0fb62da78
--- /dev/null
+++ b/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+
+from yolox.data import get_yolox_datadir
+from yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.num_classes = 20
+ self.depth = 0.33
+ self.width = 0.25
+ self.input_size = (416, 416)
+ self.mosaic_scale = (0.5, 1.5)
+ self.random_size = (10, 20)
+ self.test_size = (416, 416)
+ self.warmup_epochs = 1
+ # ---------- transform config ------------ #
+ #self.mosaic_prob = 1.0
+ self.enable_mixup = False
+ #self.mixup_prob = 1.0
+ #self.hsv_prob = 1.0
+ #self.flip_prob = 0.5
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+ def get_model(self, sublinear=False):
+
+ def init_yolo(M):
+ for m in M.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eps = 1e-3
+ m.momentum = 0.03
+ if "model" not in self.__dict__:
+ from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
+ in_channels = [256, 512, 1024]
+            # The NANO model uses depthwise=True, which is the main difference.
+ backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True)
+ head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True)
+ self.model = YOLOX(backbone, head)
+
+ self.model.apply(init_yolo)
+ self.model.head.initialize_biases(1e-2)
+ return self.model
+
+ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
+ from yolox.data import (
+ VOCDetection,
+ TrainTransform,
+ YoloBatchSampler,
+ DataLoader,
+ InfiniteSampler,
+ MosaicDetection,
+ worker_init_reset_seed,
+ )
+ from yolox.utils import (
+ wait_for_the_master,
+ get_local_rank,
+ )
+ local_rank = get_local_rank()
+
+ with wait_for_the_master(local_rank):
+ dataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"),
+ image_sets=[('train')],
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=50,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ cache=cache_img,
+ )
+
+ dataset = MosaicDetection(
+ dataset,
+ mosaic=not no_aug,
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=120,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ degrees=self.degrees,
+ translate=self.translate,
+ mosaic_scale=self.mosaic_scale,
+ mixup_scale=self.mixup_scale,
+ shear=self.shear,
+ enable_mixup=self.enable_mixup,
+ mosaic_prob=self.mosaic_prob,
+ mixup_prob=self.mixup_prob,
+ )
+
+ self.dataset = dataset
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+
+ sampler = InfiniteSampler(
+ len(self.dataset), seed=self.seed if self.seed else 0
+ )
+
+ batch_sampler = YoloBatchSampler(
+ sampler=sampler,
+ batch_size=batch_size,
+ drop_last=False,
+ mosaic=not no_aug,
+ )
+
+ dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
+ dataloader_kwargs["batch_sampler"] = batch_sampler
+
+ # Make sure each process has different random seed, especially for 'fork' method
+ dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed
+
+ train_loader = DataLoader(self.dataset, **dataloader_kwargs)
+
+ return train_loader
+
+ def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.data import VOCDetection, ValTransform
+
+ valdataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"),
+ image_sets=[('valid')],
+ img_size=self.test_size,
+ preproc=ValTransform(legacy=legacy),
+ )
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+ sampler = torch.utils.data.distributed.DistributedSampler(
+ valdataset, shuffle=False
+ )
+ else:
+ sampler = torch.utils.data.SequentialSampler(valdataset)
+
+ dataloader_kwargs = {
+ "num_workers": self.data_num_workers,
+ "pin_memory": True,
+ "sampler": sampler,
+ }
+ dataloader_kwargs["batch_size"] = batch_size
+ val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)
+
+ return val_loader
+
+ def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.evaluators import VOCEvaluator
+
+ val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
+ evaluator = VOCEvaluator(
+ dataloader=val_loader,
+ img_size=self.test_size,
+ confthre=self.test_conf,
+ nmsthre=self.nmsthre,
+ num_classes=self.num_classes,
+ )
+ return evaluator
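Note: as a quick sanity check, the experiment file above can be loaded through YOLOX's `get_exp` helper. This is only a hedged sketch (the path is the one introduced by this diff, and it assumes YOLOX is installed):

```python
from yolox.exp import get_exp

# Load the experiment description file added above and build its model.
exp = get_exp("exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py", None)
model = exp.get_model()   # depthwise YOLOX-Nano, as defined in Exp.get_model()
print(exp.exp_name, exp.num_classes, exp.input_size)
```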
diff --git a/exps/example/custom/voc_format/yolox_voc_nano_adam/yolox_voc_nano_adam.py b/exps/example/custom/voc_format/yolox_voc_nano_adam/yolox_voc_nano_adam.py
new file mode 100644
index 000000000..11b82f646
--- /dev/null
+++ b/exps/example/custom/voc_format/yolox_voc_nano_adam/yolox_voc_nano_adam.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+
+from yolox.data import get_yolox_datadir
+from yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.num_classes = 20
+ self.depth = 0.33
+ self.width = 0.25
+ self.input_size = (416, 416)
+ self.mosaic_scale = (0.5, 1.5)
+ self.random_size = (10, 20)
+ self.test_size = (416, 416)
+ self.eps = 1e-8
+ self.warmup_epochs = 1
+ # ---------- transform config ------------ #
+ #self.mosaic_prob = 1.0
+ self.enable_mixup = False
+ #self.mixup_prob = 1.0
+ #self.hsv_prob = 1.0
+ #self.flip_prob = 0.5
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+ def get_model(self, sublinear=False):
+
+ def init_yolo(M):
+ for m in M.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eps = 1e-3
+ m.momentum = 0.03
+ if "model" not in self.__dict__:
+ from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
+ in_channels = [256, 512, 1024]
+            # The NANO model uses depthwise=True, which is the main difference.
+ backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True)
+ head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True)
+ self.model = YOLOX(backbone, head)
+
+ self.model.apply(init_yolo)
+ self.model.head.initialize_biases(1e-2)
+ return self.model
+
+ def get_optimizer(self, batch_size):
+ if "optimizer" not in self.__dict__:
+ if self.warmup_epochs > 0:
+ lr = self.warmup_lr
+ else:
+ lr = self.basic_lr_per_img * batch_size
+
+ pg0, pg1, pg2 = [], [], [] # optimizer parameter groups
+
+ for k, v in self.model.named_modules():
+ if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter):
+ pg2.append(v.bias) # biases
+ if isinstance(v, nn.BatchNorm2d) or "bn" in k:
+ pg0.append(v.weight) # no decay
+ elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter):
+ pg1.append(v.weight) # apply decay
+
+ optimizer = torch.optim.Adam(
+ pg0, lr=lr, eps=self.eps, amsgrad=False
+ )
+ optimizer.add_param_group(
+ {"params": pg1, "weight_decay": self.weight_decay}
+ ) # add pg1 with weight_decay
+ optimizer.add_param_group({"params": pg2})
+ self.optimizer = optimizer
+ return self.optimizer
+
+ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
+ from yolox.data import (
+ VOCDetection,
+ TrainTransform,
+ YoloBatchSampler,
+ DataLoader,
+ InfiniteSampler,
+ MosaicDetection,
+ worker_init_reset_seed,
+ )
+ from yolox.utils import (
+ wait_for_the_master,
+ get_local_rank,
+ )
+ local_rank = get_local_rank()
+
+ with wait_for_the_master(local_rank):
+ dataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
+ image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=50,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ cache=cache_img,
+ )
+
+ dataset = MosaicDetection(
+ dataset,
+ mosaic=not no_aug,
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=120,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ degrees=self.degrees,
+ translate=self.translate,
+ mosaic_scale=self.mosaic_scale,
+ mixup_scale=self.mixup_scale,
+ shear=self.shear,
+ enable_mixup=self.enable_mixup,
+ mosaic_prob=self.mosaic_prob,
+ mixup_prob=self.mixup_prob,
+ )
+
+ self.dataset = dataset
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+
+ sampler = InfiniteSampler(
+ len(self.dataset), seed=self.seed if self.seed else 0
+ )
+
+ batch_sampler = YoloBatchSampler(
+ sampler=sampler,
+ batch_size=batch_size,
+ drop_last=False,
+ mosaic=not no_aug,
+ )
+
+ dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
+ dataloader_kwargs["batch_sampler"] = batch_sampler
+
+ # Make sure each process has different random seed, especially for 'fork' method
+ dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed
+
+ train_loader = DataLoader(self.dataset, **dataloader_kwargs)
+
+ return train_loader
+
+ def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.data import VOCDetection, ValTransform
+
+ valdataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
+ image_sets=[('2007', 'test')],
+ img_size=self.test_size,
+ preproc=ValTransform(legacy=legacy),
+ )
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+ sampler = torch.utils.data.distributed.DistributedSampler(
+ valdataset, shuffle=False
+ )
+ else:
+ sampler = torch.utils.data.SequentialSampler(valdataset)
+
+ dataloader_kwargs = {
+ "num_workers": self.data_num_workers,
+ "pin_memory": True,
+ "sampler": sampler,
+ }
+ dataloader_kwargs["batch_size"] = batch_size
+ val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)
+
+ return val_loader
+
+ def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.evaluators import VOCEvaluator
+
+ val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
+ evaluator = VOCEvaluator(
+ dataloader=val_loader,
+ img_size=self.test_size,
+ confthre=self.test_conf,
+ nmsthre=self.nmsthre,
+ num_classes=self.num_classes,
+ )
+ return evaluator
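The only functional difference from the SGD-based exps is the `get_optimizer` override above: it keeps YOLOX's usual three parameter groups (BN weights without weight decay, other weights with decay, biases) but hands them to Adam with `self.eps`. A minimal sketch of exercising it, assuming the exp path added by this diff:

```python
from yolox.exp import get_exp

exp = get_exp("exps/example/custom/voc_format/yolox_voc_nano_adam/yolox_voc_nano_adam.py", None)
model = exp.get_model()                       # get_optimizer() walks self.model, so build it first
optimizer = exp.get_optimizer(batch_size=8)   # uses warmup_lr while warmup_epochs > 0

# Expect an Adam optimizer with three parameter groups.
print(type(optimizer).__name__, [len(g["params"]) for g in optimizer.param_groups])
```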
diff --git a/exps/example/custom/voc_format/yolox_voc_s/__pycache__/yolox_voc_s.cpython-38.pyc b/exps/example/custom/voc_format/yolox_voc_s/__pycache__/yolox_voc_s.cpython-38.pyc
new file mode 100644
index 000000000..cea337ed0
Binary files /dev/null and b/exps/example/custom/voc_format/yolox_voc_s/__pycache__/yolox_voc_s.cpython-38.pyc differ
diff --git a/exps/example/yolox_voc/yolox_voc_s.py b/exps/example/custom/voc_format/yolox_voc_s/yolox_voc_s.py
similarity index 100%
rename from exps/example/yolox_voc/yolox_voc_s.py
rename to exps/example/custom/voc_format/yolox_voc_s/yolox_voc_s.py
diff --git a/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py b/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py
new file mode 100644
index 000000000..d8ffbdac9
--- /dev/null
+++ b/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+import torch
+import torch.distributed as dist
+
+from yolox.data import get_yolox_datadir
+from yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 0.33
+ self.width = 0.375
+ self.scale = (0.5, 1.5)
+ self.random_size = (10, 20)
+ self.test_size = (416, 416)
+ self.warmup_epochs = 1
+ # ---------- transform config ------------ #
+ #self.mosaic_prob = 1.0
+ self.enable_mixup = False
+ #self.mixup_prob = 1.0
+ #self.hsv_prob = 1.0
+ #self.flip_prob = 0.5
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
+ from yolox.data import (
+ VOCDetection,
+ TrainTransform,
+ YoloBatchSampler,
+ DataLoader,
+ InfiniteSampler,
+ MosaicDetection,
+ worker_init_reset_seed,
+ )
+ from yolox.utils import (
+ wait_for_the_master,
+ get_local_rank,
+ )
+ local_rank = get_local_rank()
+
+ with wait_for_the_master(local_rank):
+ dataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
+ image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=50,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ cache=cache_img,
+ )
+
+ dataset = MosaicDetection(
+ dataset,
+ mosaic=not no_aug,
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=120,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ degrees=self.degrees,
+ translate=self.translate,
+ mosaic_scale=self.mosaic_scale,
+ mixup_scale=self.mixup_scale,
+ shear=self.shear,
+ enable_mixup=self.enable_mixup,
+ mosaic_prob=self.mosaic_prob,
+ mixup_prob=self.mixup_prob,
+ )
+
+ self.dataset = dataset
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+
+ sampler = InfiniteSampler(
+ len(self.dataset), seed=self.seed if self.seed else 0
+ )
+
+ batch_sampler = YoloBatchSampler(
+ sampler=sampler,
+ batch_size=batch_size,
+ drop_last=False,
+ mosaic=not no_aug,
+ )
+
+ dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
+ dataloader_kwargs["batch_sampler"] = batch_sampler
+
+ # Make sure each process has different random seed, especially for 'fork' method
+ dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed
+
+ train_loader = DataLoader(self.dataset, **dataloader_kwargs)
+
+ return train_loader
+
+ def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.data import VOCDetection, ValTransform
+
+ valdataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
+ image_sets=[('2007', 'test')],
+ img_size=self.test_size,
+ preproc=ValTransform(legacy=legacy),
+ )
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+ sampler = torch.utils.data.distributed.DistributedSampler(
+ valdataset, shuffle=False
+ )
+ else:
+ sampler = torch.utils.data.SequentialSampler(valdataset)
+
+ dataloader_kwargs = {
+ "num_workers": self.data_num_workers,
+ "pin_memory": True,
+ "sampler": sampler,
+ }
+ dataloader_kwargs["batch_size"] = batch_size
+ val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)
+
+ return val_loader
+
+ def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.evaluators import VOCEvaluator
+
+ val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
+ evaluator = VOCEvaluator(
+ dataloader=val_loader,
+ img_size=self.test_size,
+ confthre=self.test_conf,
+ nmsthre=self.nmsthre,
+ num_classes=self.num_classes,
+ )
+ return evaluator
diff --git a/exps/example/yolox_pedestrian/coco_format/__pycache__/nano.cpython-38.pyc b/exps/example/yolox_pedestrian/coco_format/__pycache__/nano.cpython-38.pyc
new file mode 100644
index 000000000..c5a2714b6
Binary files /dev/null and b/exps/example/yolox_pedestrian/coco_format/__pycache__/nano.cpython-38.pyc differ
diff --git a/exps/example/yolox_pedestrian/coco_format/yolox_nano.py b/exps/example/yolox_pedestrian/coco_format/yolox_nano.py
new file mode 100644
index 000000000..90cc639ad
--- /dev/null
+++ b/exps/example/yolox_pedestrian/coco_format/yolox_nano.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+import torch.nn as nn
+
+from yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 0.33
+ self.width = 0.25
+ self.input_size = (416, 416)
+ self.mosaic_scale = (0.5, 1.5)
+ self.random_size = (10, 20)
+ self.test_size = (416, 416)
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+ self.enable_mixup = False
+
+        # Define your own dataset path
+ self.data_dir = "datasets/pedestrian_coco"
+ self.train_ann = "train_annotations.json"
+ self.val_ann = "valid_annotations.json"
+
+ self.num_classes = 1
+
+ def get_model(self, sublinear=False):
+
+ def init_yolo(M):
+ for m in M.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eps = 1e-3
+ m.momentum = 0.03
+ if "model" not in self.__dict__:
+ from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
+ in_channels = [256, 512, 1024]
+            # The NANO model uses depthwise=True, which is the main difference.
+ backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True)
+ head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True)
+ self.model = YOLOX(backbone, head)
+
+ self.model.apply(init_yolo)
+ self.model.head.initialize_biases(1e-2)
+ return self.model
diff --git a/exps/example/yolox_pedestrian/coco_format/yolox_s.py b/exps/example/yolox_pedestrian/coco_format/yolox_s.py
new file mode 100644
index 000000000..97291a30d
--- /dev/null
+++ b/exps/example/yolox_pedestrian/coco_format/yolox_s.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+import os
+
+from yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 0.33
+ self.width = 0.50
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+        # Define your own dataset path
+ self.data_dir = "datasets/pedestrian_coco"
+ self.train_ann = "train_annotations.json"
+ self.val_ann = "valid_annotations.json"
+
+ self.num_classes = 1
+
+ self.max_epoch = 10
+ self.data_num_workers = 4
+ self.eval_interval = 1
diff --git a/exps/example/yolox_pedestrian/voc_format/__pycache__/yolox_voc_nano.cpython-38.pyc b/exps/example/yolox_pedestrian/voc_format/__pycache__/yolox_voc_nano.cpython-38.pyc
new file mode 100644
index 000000000..4472ab2e8
Binary files /dev/null and b/exps/example/yolox_pedestrian/voc_format/__pycache__/yolox_voc_nano.cpython-38.pyc differ
diff --git a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py
new file mode 100644
index 000000000..c38679cca
--- /dev/null
+++ b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+
+from yolox.data import get_yolox_datadir
+from yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.num_classes = 1
+ self.depth = 0.33
+ self.width = 0.25
+ #self.input_size = (416, 416)
+ self.mosaic_scale = (0.5, 1.5)
+ self.random_size = (10, 20)
+ #self.test_size = (416, 416)
+ self.warmup_epochs = 1
+ # ---------- transform config ------------ #
+ #self.mosaic_prob = 1.0
+ self.enable_mixup = False
+ #self.mixup_prob = 1.0
+ #self.hsv_prob = 1.0
+ #self.flip_prob = 0.5
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+ def get_model(self, sublinear=False):
+
+ def init_yolo(M):
+ for m in M.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eps = 1e-3
+ m.momentum = 0.03
+ if "model" not in self.__dict__:
+ from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
+ in_channels = [256, 512, 1024]
+            # The NANO model uses depthwise=True, which is the main difference.
+ backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True)
+ head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True)
+ self.model = YOLOX(backbone, head)
+
+ self.model.apply(init_yolo)
+ self.model.head.initialize_biases(1e-2)
+ return self.model
+
+ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
+ from yolox.data import (
+ VOCDetection,
+ TrainTransform,
+ YoloBatchSampler,
+ DataLoader,
+ InfiniteSampler,
+ MosaicDetection,
+ worker_init_reset_seed,
+ )
+ from yolox.utils import (
+ wait_for_the_master,
+ get_local_rank,
+ )
+ local_rank = get_local_rank()
+
+ with wait_for_the_master(local_rank):
+ dataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"),
+ image_sets=[('train')],
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=50,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ cache=cache_img,
+ )
+
+ dataset = MosaicDetection(
+ dataset,
+ mosaic=not no_aug,
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=120,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ degrees=self.degrees,
+ translate=self.translate,
+ mosaic_scale=self.mosaic_scale,
+ mixup_scale=self.mixup_scale,
+ shear=self.shear,
+ enable_mixup=self.enable_mixup,
+ mosaic_prob=self.mosaic_prob,
+ mixup_prob=self.mixup_prob,
+ )
+
+ self.dataset = dataset
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+
+ sampler = InfiniteSampler(
+ len(self.dataset), seed=self.seed if self.seed else 0
+ )
+
+ batch_sampler = YoloBatchSampler(
+ sampler=sampler,
+ batch_size=batch_size,
+ drop_last=False,
+ mosaic=not no_aug,
+ )
+
+ dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
+ dataloader_kwargs["batch_sampler"] = batch_sampler
+
+ # Make sure each process has different random seed, especially for 'fork' method
+ dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed
+
+ train_loader = DataLoader(self.dataset, **dataloader_kwargs)
+
+ return train_loader
+
+ def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.data import VOCDetection, ValTransform
+
+ valdataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"),
+ image_sets=[('valid')],
+ img_size=self.test_size,
+ preproc=ValTransform(legacy=legacy),
+ )
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+ sampler = torch.utils.data.distributed.DistributedSampler(
+ valdataset, shuffle=False
+ )
+ else:
+ sampler = torch.utils.data.SequentialSampler(valdataset)
+
+ dataloader_kwargs = {
+ "num_workers": self.data_num_workers,
+ "pin_memory": True,
+ "sampler": sampler,
+ }
+ dataloader_kwargs["batch_size"] = batch_size
+ val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)
+
+ return val_loader
+
+ def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.evaluators import VOCEvaluator
+
+ val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
+ evaluator = VOCEvaluator(
+ dataloader=val_loader,
+ img_size=self.test_size,
+ confthre=self.test_conf,
+ nmsthre=self.nmsthre,
+ num_classes=self.num_classes,
+ )
+ return evaluator
diff --git a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py
new file mode 100644
index 000000000..fb5cb1e60
--- /dev/null
+++ b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+
+from yolox.data import get_yolox_datadir
+from yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.num_classes = 1
+ self.depth = 0.33
+ self.width = 0.25
+ self.input_size = (416, 416)
+ self.mosaic_scale = (0.5, 1.5)
+ self.random_size = (10, 20)
+ self.test_size = (416, 416)
+ self.eps = 1e-8
+ self.warmup_epochs = 1
+ # ---------- transform config ------------ #
+ #self.mosaic_prob = 1.0
+ self.enable_mixup = False
+ #self.mixup_prob = 1.0
+ #self.hsv_prob = 1.0
+ #self.flip_prob = 0.5
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+ def get_model(self, sublinear=False):
+
+ def init_yolo(M):
+ for m in M.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eps = 1e-3
+ m.momentum = 0.03
+ if "model" not in self.__dict__:
+ from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
+ in_channels = [256, 512, 1024]
+            # The NANO model uses depthwise=True, which is the main difference.
+ backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True)
+ head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True)
+ self.model = YOLOX(backbone, head)
+
+ self.model.apply(init_yolo)
+ self.model.head.initialize_biases(1e-2)
+ return self.model
+
+ def get_optimizer(self, batch_size):
+ if "optimizer" not in self.__dict__:
+ if self.warmup_epochs > 0:
+ lr = self.warmup_lr
+ else:
+ lr = self.basic_lr_per_img * batch_size
+
+ pg0, pg1, pg2 = [], [], [] # optimizer parameter groups
+
+ for k, v in self.model.named_modules():
+ if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter):
+ pg2.append(v.bias) # biases
+ if isinstance(v, nn.BatchNorm2d) or "bn" in k:
+ pg0.append(v.weight) # no decay
+ elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter):
+ pg1.append(v.weight) # apply decay
+
+ optimizer = torch.optim.Adam(
+ pg0, lr=lr, eps=self.eps, amsgrad=False
+ )
+ optimizer.add_param_group(
+ {"params": pg1, "weight_decay": self.weight_decay}
+ ) # add pg1 with weight_decay
+ optimizer.add_param_group({"params": pg2})
+ self.optimizer = optimizer
+ return self.optimizer
+
+ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
+ from yolox.data import (
+ VOCDetection,
+ TrainTransform,
+ YoloBatchSampler,
+ DataLoader,
+ InfiniteSampler,
+ MosaicDetection,
+ worker_init_reset_seed,
+ )
+ from yolox.utils import (
+ wait_for_the_master,
+ get_local_rank,
+ )
+ local_rank = get_local_rank()
+
+ with wait_for_the_master(local_rank):
+ dataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"),
+ image_sets=[('train')],
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=50,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ cache=cache_img,
+ )
+
+ dataset = MosaicDetection(
+ dataset,
+ mosaic=not no_aug,
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=120,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ degrees=self.degrees,
+ translate=self.translate,
+ mosaic_scale=self.mosaic_scale,
+ mixup_scale=self.mixup_scale,
+ shear=self.shear,
+ enable_mixup=self.enable_mixup,
+ mosaic_prob=self.mosaic_prob,
+ mixup_prob=self.mixup_prob,
+ )
+
+ self.dataset = dataset
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+
+ sampler = InfiniteSampler(
+ len(self.dataset), seed=self.seed if self.seed else 0
+ )
+
+ batch_sampler = YoloBatchSampler(
+ sampler=sampler,
+ batch_size=batch_size,
+ drop_last=False,
+ mosaic=not no_aug,
+ )
+
+ dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
+ dataloader_kwargs["batch_sampler"] = batch_sampler
+
+ # Make sure each process has different random seed, especially for 'fork' method
+ dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed
+
+ train_loader = DataLoader(self.dataset, **dataloader_kwargs)
+
+ return train_loader
+
+ def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.data import VOCDetection, ValTransform
+
+ valdataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"),
+ image_sets=[('valid')],
+ img_size=self.test_size,
+ preproc=ValTransform(legacy=legacy),
+ )
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+ sampler = torch.utils.data.distributed.DistributedSampler(
+ valdataset, shuffle=False
+ )
+ else:
+ sampler = torch.utils.data.SequentialSampler(valdataset)
+
+ dataloader_kwargs = {
+ "num_workers": self.data_num_workers,
+ "pin_memory": True,
+ "sampler": sampler,
+ }
+ dataloader_kwargs["batch_size"] = batch_size
+ val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)
+
+ return val_loader
+
+ def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.evaluators import VOCEvaluator
+
+ val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
+ evaluator = VOCEvaluator(
+ dataloader=val_loader,
+ img_size=self.test_size,
+ confthre=self.test_conf,
+ nmsthre=self.nmsthre,
+ num_classes=self.num_classes,
+ )
+ return evaluator
diff --git a/exps/example/yolox_pedestrian/voc_format/yolox_voc_s.py b/exps/example/yolox_pedestrian/voc_format/yolox_voc_s.py
new file mode 100644
index 000000000..4801f559d
--- /dev/null
+++ b/exps/example/yolox_pedestrian/voc_format/yolox_voc_s.py
@@ -0,0 +1,138 @@
+# encoding: utf-8
+import os
+
+import torch
+import torch.distributed as dist
+
+from yolox.data import get_yolox_datadir
+from yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.num_classes = 1
+ self.depth = 0.33
+ self.width = 0.50
+ self.warmup_epochs = 1
+
+ # ---------- transform config ------------ #
+ self.mosaic_prob = 1.0
+ self.mixup_prob = 1.0
+ self.hsv_prob = 1.0
+ self.flip_prob = 0.5
+
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
+ from yolox.data import (
+ VOCDetection,
+ TrainTransform,
+ YoloBatchSampler,
+ DataLoader,
+ InfiniteSampler,
+ MosaicDetection,
+ worker_init_reset_seed,
+ )
+ from yolox.utils import (
+ wait_for_the_master,
+ get_local_rank,
+ )
+ local_rank = get_local_rank()
+
+ with wait_for_the_master(local_rank):
+ dataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"),
+ image_sets=[('train')],
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=50,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ cache=cache_img,
+ )
+
+ dataset = MosaicDetection(
+ dataset,
+ mosaic=not no_aug,
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=120,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ degrees=self.degrees,
+ translate=self.translate,
+ mosaic_scale=self.mosaic_scale,
+ mixup_scale=self.mixup_scale,
+ shear=self.shear,
+ enable_mixup=self.enable_mixup,
+ mosaic_prob=self.mosaic_prob,
+ mixup_prob=self.mixup_prob,
+ )
+
+ self.dataset = dataset
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+
+ sampler = InfiniteSampler(
+ len(self.dataset), seed=self.seed if self.seed else 0
+ )
+
+ batch_sampler = YoloBatchSampler(
+ sampler=sampler,
+ batch_size=batch_size,
+ drop_last=False,
+ mosaic=not no_aug,
+ )
+
+ dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
+ dataloader_kwargs["batch_sampler"] = batch_sampler
+
+ # Make sure each process has different random seed, especially for 'fork' method
+ dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed
+
+ train_loader = DataLoader(self.dataset, **dataloader_kwargs)
+
+ return train_loader
+
+ def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.data import VOCDetection, ValTransform
+
+ valdataset = VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"),
+ image_sets=[('valid')],
+ img_size=self.test_size,
+ preproc=ValTransform(legacy=legacy),
+ )
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+ sampler = torch.utils.data.distributed.DistributedSampler(
+ valdataset, shuffle=False
+ )
+ else:
+ sampler = torch.utils.data.SequentialSampler(valdataset)
+
+ dataloader_kwargs = {
+ "num_workers": self.data_num_workers,
+ "pin_memory": True,
+ "sampler": sampler,
+ }
+ dataloader_kwargs["batch_size"] = batch_size
+ val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)
+
+ return val_loader
+
+ def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.evaluators import VOCEvaluator
+
+ val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
+ evaluator = VOCEvaluator(
+ dataloader=val_loader,
+ img_size=self.test_size,
+ confthre=self.test_conf,
+ nmsthre=self.nmsthre,
+ num_classes=self.num_classes,
+ )
+ return evaluator
diff --git a/requirements.txt b/requirements.txt
index 7227f09b4..46efe646f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,6 @@ tabulate
# verified versions
# pycocotools corresponds to https://github.com/ppwwyyxx/cocoapi
pycocotools>=2.0.2
-onnx==1.8.1
-onnxruntime==1.8.0
+onnx>=1.8.1
+onnxruntime>=1.8.0
onnx-simplifier==0.3.5
diff --git a/tools/demo.py b/tools/demo.py
index b16598d5f..dc065ca20 100644
--- a/tools/demo.py
+++ b/tools/demo.py
@@ -12,7 +12,7 @@
import torch
from yolox.data.data_augment import ValTransform
-from yolox.data.datasets import COCO_CLASSES
+from yolox.data.datasets import COCO_CLASSES, VOC_CLASSES
from yolox.exp import get_exp
from yolox.utils import fuse_model, get_model_info, postprocess, vis
@@ -169,6 +169,29 @@ def visual(self, output, img_info, cls_conf=0.35):
ratio = img_info["ratio"]
img = img_info["raw_img"]
if output is None:
+ font = cv2.FONT_HERSHEY_SIMPLEX
+ class_count = {}
+ class_AP = {}
+ for i in self.cls_names:
+ class_count[i] = 0
+ class_AP[i] = 0.0
+ x0 = 15
+ y0 = 0
+ row = 0
+ for k in class_count:
+ if((y0+row+50)>=img.shape[0]):
+ x0 = x0+200
+ y0 = 25
+ row = 0
+ else:
+ row = row+25
+ cv2.putText(img, str(k)+": "+str(class_count[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2)
+ if class_count[k] !=0:
+ class_AP[k]=class_AP[k]/class_count[k]
+ else:
+ class_AP[k]=0.0
+ row = row+25
+ cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2)
return img
output = output.cpu()
diff --git a/tools/demo_sliding_window.py b/tools/demo_sliding_window.py
new file mode 100644
index 000000000..7a0e20f00
--- /dev/null
+++ b/tools/demo_sliding_window.py
@@ -0,0 +1,416 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import argparse
+import os
+import time
+from loguru import logger
+
+import cv2
+
+import torch
+
+from yolox.data.data_augment import preproc, sliding_window
+from yolox.data.datasets import COCO_CLASSES, VOC_CLASSES
+from yolox.exp import get_exp
+from yolox.utils import fuse_model, get_model_info, postprocess, vis
+
+IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"]
+
+
+def make_parser():
+ parser = argparse.ArgumentParser("YOLOX Demo!")
+ parser.add_argument(
+ "demo", default="image", help="demo type, eg. image, video and webcam"
+ )
+ parser.add_argument("-expn", "--experiment-name", type=str, default=None)
+ parser.add_argument("-n", "--name", type=str, default=None, help="model name")
+
+ parser.add_argument(
+ "--path", default="./assets/dog.jpg", help="path to images or video"
+ )
+ parser.add_argument("--camid", type=int, default=0, help="webcam demo camera id")
+ parser.add_argument(
+ "--save_result",
+ action="store_true",
+ help="whether to save the inference result of image/video",
+ )
+
+ # exp file
+ parser.add_argument(
+ "-f",
+ "--exp_file",
+ default=None,
+ type=str,
+ help="please input your experiment description file",
+ )
+ parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
+ parser.add_argument(
+ "--device",
+ default="cpu",
+ type=str,
+ help="device to run our model, can either be cpu or gpu",
+ )
+ parser.add_argument("--conf", default=0.3, type=float, help="test conf")
+ parser.add_argument("--nms", default=0.3, type=float, help="test nms threshold")
+ parser.add_argument("--tsize", default=None, type=int, help="test img size")
+ parser.add_argument(
+ "--fp16",
+ dest="fp16",
+ default=False,
+ action="store_true",
+ help="Adopting mix precision evaluating.",
+ )
+ """
+ parser.add_argument(
+ "--legacy",
+ dest="legacy",
+ default=False,
+ action="store_true",
+ help="To be compatible with older versions",
+ )
+ """
+ parser.add_argument(
+ "--fuse",
+ dest="fuse",
+ default=False,
+ action="store_true",
+ help="Fuse conv and bn for testing.",
+ )
+ parser.add_argument(
+ "--trt",
+ dest="trt",
+ default=False,
+ action="store_true",
+ help="Using TensorRT model for testing.",
+ )
+ return parser
+
+
+def get_image_list(path):
+ image_names = []
+ for maindir, subdir, file_name_list in os.walk(path):
+ for filename in file_name_list:
+ apath = os.path.join(maindir, filename)
+ ext = os.path.splitext(apath)[1]
+ if ext in IMAGE_EXT:
+ image_names.append(apath)
+ return image_names
+
+
+class Predictor(object):
+ def __init__(
+ self,
+ model,
+ exp,
+ cls_names=COCO_CLASSES,
+ trt_file=None,
+ decoder=None,
+ device="cpu",
+ #fp16=False,
+ #legacy=False,
+ ):
+ self.model = model
+ self.cls_names = cls_names
+ self.decoder = decoder
+ self.num_classes = exp.num_classes
+ self.confthre = exp.test_conf
+ self.nmsthre = exp.nmsthre
+ self.test_size = exp.test_size
+ self.device = device
+ #self.fp16 = fp16
+ #self.preproc = ValTransform(legacy=legacy)
+ if trt_file is not None:
+ from torch2trt import TRTModule
+
+ model_trt = TRTModule()
+ model_trt.load_state_dict(torch.load(trt_file))
+
+ x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda()
+ self.model(x)
+ self.model = model_trt
+ self.rgb_means = (0.485, 0.456, 0.406)
+ self.std = (0.229, 0.224, 0.225)
+
+
+ def inference(self, img):
+ img_info = {"id": 0}
+ if isinstance(img, str):
+ img_info["file_name"] = os.path.basename(img)
+ img = cv2.imread(img)
+ else:
+ img_info["file_name"] = None
+
+ height, width = img.shape[:2]
+ img_info["height"] = height
+ img_info["width"] = width
+ img_info["raw_img"] = img
+
+ #img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT,value=(0,0,0))
+ #ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1])
+ #img_info["ratio"] = ratio
+ """
+ if (img.shape[0]>exp.test_size[0]):
+ h_r = (img.shape[0]//exp.test_size[0]+1)*exp.test_size[0]-img.shape[0]
+    elif(img.shape[0]<exp.test_size[0]):
+        ...
+    if (img.shape[1]>exp.test_size[1]):
+        w_r = (img.shape[1]//exp.test_size[1]+1)*exp.test_size[1]-img.shape[1]
+    elif(img.shape[1]<exp.test_size[1]):
+        ...
+    """
+            ...
+            if((y0+row+50)>=img.shape[0]):
+ x0 = x0+200
+ y0 = 25
+ row = 0
+ else:
+ row = row+25
+ cv2.putText(img, str(k)+": "+str(class_count[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2)
+ if class_count[k] !=0:
+ class_AP[k]=class_AP[k]/class_count[k]
+ else:
+ class_AP[k]=0.0
+ row = row+25
+ cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2)
+ return img
+ output = output.cpu()
+
+ bboxes = output[:, 0:4]
+
+ # preprocessing: resize
+ #bboxes /= ratio
+
+ cls = output[:, 6]
+ scores = output[:, 4] * output[:, 5]
+
+ vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names)
+ return vis_res
+
+
+def image_demo(predictor, vis_folder, path, current_time, save_result):
+ if os.path.isdir(path):
+ files = get_image_list(path)
+ else:
+ files = [path]
+ files.sort()
+ for image_name in files:
+ outputs, img_info = predictor.inference(image_name)
+ result_image = predictor.visual(outputs[0], img_info, predictor.confthre)
+ if save_result:
+ save_folder = os.path.join(
+ vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
+ )
+ os.makedirs(save_folder, exist_ok=True)
+ save_file_name = os.path.join(save_folder, os.path.basename(image_name))
+ logger.info("Saving detection result in {}".format(save_file_name))
+ cv2.imwrite(save_file_name, result_image)
+ ch = cv2.waitKey(0)
+ if ch == 27 or ch == ord("q") or ch == ord("Q"):
+ break
+
+
+def imageflow_demo(predictor, vis_folder, current_time, args):
+ cap = cv2.VideoCapture(args.path if args.demo == "video" else args.camid)
+ width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float
+ height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float
+ fps = cap.get(cv2.CAP_PROP_FPS)
+ if args.save_result:
+ save_folder = os.path.join(
+ vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
+ )
+ os.makedirs(save_folder, exist_ok=True)
+ if args.demo == "video":
+ save_path = os.path.join(save_folder, os.path.basename(args.path))
+ else:
+ save_path = os.path.join(save_folder, "camera.mp4")
+ logger.info(f"video save_path is {save_path}")
+ vid_writer = cv2.VideoWriter(
+ save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height))
+ )
+ while True:
+ ret_val, frame = cap.read()
+ if ret_val:
+ outputs, img_info = predictor.inference(frame)
+ result_frame = predictor.visual(outputs[0], img_info, predictor.confthre)
+ if args.save_result:
+ vid_writer.write(result_frame)
+ else:
+ cv2.namedWindow("yolox", cv2.WINDOW_NORMAL)
+ cv2.imshow("yolox", result_frame)
+ ch = cv2.waitKey(1)
+ if ch == 27 or ch == ord("q") or ch == ord("Q"):
+ break
+ else:
+ break
+
+
+def main(exp, args):
+ if not args.experiment_name:
+ args.experiment_name = exp.exp_name
+
+ file_name = os.path.join(exp.output_dir, args.experiment_name)
+ os.makedirs(file_name, exist_ok=True)
+
+ vis_folder = None
+ if args.save_result:
+ vis_folder = os.path.join(file_name, "vis_res")
+ os.makedirs(vis_folder, exist_ok=True)
+
+ if args.trt:
+ args.device = "gpu"
+
+ logger.info("Args: {}".format(args))
+
+ if args.conf is not None:
+ exp.test_conf = args.conf
+ if args.nms is not None:
+ exp.nmsthre = args.nms
+ if args.tsize is not None:
+ exp.test_size = (args.tsize, args.tsize)
+
+ model = exp.get_model()
+ logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))
+
+ if args.device == "gpu":
+ model.cuda()
+ if args.fp16:
+ model.half() # to FP16
+ model.eval()
+
+ if not args.trt:
+ if args.ckpt is None:
+ ckpt_file = os.path.join(file_name, "best_ckpt.pth")
+ else:
+ ckpt_file = args.ckpt
+ logger.info("loading checkpoint")
+ ckpt = torch.load(ckpt_file, map_location="cpu")
+ # load the model state dict
+ model.load_state_dict(ckpt["model"])
+ logger.info("loaded checkpoint done.")
+
+ if args.fuse:
+ logger.info("\tFusing model...")
+ model = fuse_model(model)
+
+ if args.trt:
+        assert not args.fuse, "TensorRT model does not support model fusing!"
+ trt_file = os.path.join(file_name, "model_trt.pth")
+ assert os.path.exists(
+ trt_file
+ ), "TensorRT model is not found!\n Run python3 tools/trt.py first!"
+ model.head.decode_in_inference = False
+ decoder = model.head.decode_outputs
+ logger.info("Using TensorRT to inference")
+ else:
+ trt_file = None
+ decoder = None
+
+ predictor = Predictor(
+ model, exp, COCO_CLASSES, trt_file, decoder,
+ args.device#, args.fp16, args.legacy,
+ )
+ current_time = time.localtime()
+ if args.demo == "image":
+ image_demo(predictor, vis_folder, args.path, current_time, args.save_result)
+ elif args.demo == "video" or args.demo == "webcam":
+ imageflow_demo(predictor, vis_folder, current_time, args)
+
+
+if __name__ == "__main__":
+ args = make_parser().parse_args()
+ exp = get_exp(args.exp_file, args.name)
+
+ main(exp, args)
diff --git a/yolox/data/data_augment.py b/yolox/data/data_augment.py
index 21cd7b56d..f4da7580b 100644
--- a/yolox/data/data_augment.py
+++ b/yolox/data/data_augment.py
@@ -157,6 +157,13 @@ def preproc(img, input_size, swap=(2, 0, 1)):
padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
return padded_img, r
+def sliding_window(image, YstepSize, XstepSize, windowSize):
+ # slide a window across the image
+ for y in range(0, image.shape[0], YstepSize):
+ for x in range(0, image.shape[1], XstepSize):
+ # yield the current window
+ yield (x, y, image[y:y + windowSize[1], x:x + windowSize[0]])
+
class TrainTransform:
def __init__(self, max_labels=50, flip_prob=0.5, hsv_prob=1.0):
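For reference, the `sliding_window` helper added above is a plain generator that tiles an image into fixed-size crops (crops touching the right/bottom border come back smaller). A standalone sketch with made-up sizes:

```python
import numpy as np

from yolox.data.data_augment import sliding_window

img = np.zeros((416, 640, 3), dtype=np.uint8)   # H=416, W=640 dummy image
# windowSize is (width, height); each crop is image[y:y+h, x:x+w]
for x, y, window in sliding_window(img, YstepSize=208, XstepSize=320, windowSize=(320, 208)):
    print(x, y, window.shape)
```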
diff --git a/yolox/data/datasets/__init__.py b/yolox/data/datasets/__init__.py
index dee2c9f48..6ea2be14f 100644
--- a/yolox/data/datasets/__init__.py
+++ b/yolox/data/datasets/__init__.py
@@ -6,4 +6,5 @@
from .coco_classes import COCO_CLASSES
from .datasets_wrapper import ConcatDataset, Dataset, MixConcatDataset
from .mosaicdetection import MosaicDetection
+from .voc_classes import VOC_CLASSES
from .voc import VOCDetection
diff --git a/yolox/data/datasets/coco.py b/yolox/data/datasets/coco.py
index 4fbdf8836..5ead905e4 100644
--- a/yolox/data/datasets/coco.py
+++ b/yolox/data/datasets/coco.py
@@ -40,8 +40,8 @@ class COCODataset(Dataset):
def __init__(
self,
data_dir=None,
- json_file="instances_train2017.json",
- name="train2017",
+ json_file="train_annotations.json",
+ name="train",
img_size=(416, 416),
preproc=None,
cache=False,
@@ -57,7 +57,7 @@ def __init__(
"""
super().__init__(img_size)
if data_dir is None:
- data_dir = os.path.join(get_yolox_datadir(), "COCO")
+ data_dir = os.path.join(get_yolox_datadir(), "pedestrian_coco")
self.data_dir = data_dir
self.json_file = json_file
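With the defaults changed above, a `COCODataset` built without arguments now expects the pedestrian layout that the empty scaffolding files in this diff establish. A hedged sketch of what the defaults resolve to (it assumes the annotation JSONs have actually been filled with COCO-format data):

```python
from yolox.data import COCODataset

# Defaults after this change:
#   data_dir  -> <YOLOX_DATADIR>/pedestrian_coco
#   json_file -> annotations/train_annotations.json
#   name      -> "train"  (images under pedestrian_coco/train/)
train_set = COCODataset(img_size=(416, 416))
print(len(train_set))
```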
diff --git a/yolox/data/datasets/coco_classes.py b/yolox/data/datasets/coco_classes.py
index 17f5cbe6e..760945eaf 100644
--- a/yolox/data/datasets/coco_classes.py
+++ b/yolox/data/datasets/coco_classes.py
@@ -2,6 +2,10 @@
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
+COCO_CLASSES = (
+ "pedestrian",
+)
+"""
COCO_CLASSES = (
"person",
"bicycle",
@@ -84,3 +88,4 @@
"hair drier",
"toothbrush",
)
+"""
diff --git a/yolox/data/datasets/voc.py b/yolox/data/datasets/voc.py
index 56675a297..09e9833de 100644
--- a/yolox/data/datasets/voc.py
+++ b/yolox/data/datasets/voc.py
@@ -120,6 +120,13 @@ def __init__(
self._imgpath = os.path.join("%s", "JPEGImages", "%s.jpg")
self._classes = VOC_CLASSES
self.ids = list()
+ for name in image_sets:
+ rootpath = self.root
+ for line in open(
+ os.path.join(rootpath, "ImageSets", "Main", name + ".txt")
+ ):
+ self.ids.append((rootpath, line.strip()))
+ """
for (year, name) in image_sets:
self._year = year
rootpath = os.path.join(self.root, "VOC" + year)
@@ -127,7 +134,7 @@ def __init__(
os.path.join(rootpath, "ImageSets", "Main", name + ".txt")
):
self.ids.append((rootpath, line.strip()))
-
+ """
self.annotations = self._load_coco_annotations()
self.imgs = None
if cache:
@@ -279,7 +286,9 @@ def evaluate_detections(self, all_boxes, output_dir=None):
def _get_voc_results_file_template(self):
filename = "comp4_det_test" + "_{:s}.txt"
- filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main")
+ #filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main")
+ filedir = os.path.join(self.root, "results")
if not os.path.exists(filedir):
os.makedirs(filedir)
path = os.path.join(filedir, filename)
@@ -311,12 +320,14 @@ def _write_voc_results_file(self, all_boxes):
)
def _do_python_eval(self, output_dir="output", iou=0.5):
- rootpath = os.path.join(self.root, "VOC" + self._year)
+ #rootpath = os.path.join(self.root, "VOC" + self._year)
+ rootpath = self.root
-        name = self.image_set[0][1]
+        # image_sets entries are now plain split names, not (year, name) tuples
+        name = self.image_set[0]
annopath = os.path.join(rootpath, "Annotations", "{:s}.xml")
imagesetfile = os.path.join(rootpath, "ImageSets", "Main", name + ".txt")
cachedir = os.path.join(
- self.root, "annotations_cache", "VOC" + self._year, name
+ #self.root, "annotations_cache", "VOC" + self._year, name
+ self.root, "annotations_cache"
)
if not os.path.exists(cachedir):
os.makedirs(cachedir)
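After this change, `image_sets` is a flat list of split names, each resolved to `<data_dir>/ImageSets/Main/<name>.txt`, instead of the upstream `(year, name)` tuples. A minimal sketch matching the exp files earlier in this diff (assuming the pedestrian_voc scaffolding has been populated):

```python
import os

from yolox.data import TrainTransform, VOCDetection, get_yolox_datadir

dataset = VOCDetection(
    data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"),
    image_sets=["train"],          # reads pedestrian_voc/ImageSets/Main/train.txt
    img_size=(416, 416),
    preproc=TrainTransform(max_labels=50),
)
print(len(dataset))
```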
diff --git a/yolox/data/datasets/voc_classes.py b/yolox/data/datasets/voc_classes.py
index 89354b3fd..438c5b78b 100644
--- a/yolox/data/datasets/voc_classes.py
+++ b/yolox/data/datasets/voc_classes.py
@@ -3,6 +3,11 @@
# Copyright (c) Megvii, Inc. and its affiliates.
# VOC_CLASSES = ( '__background__', # always index 0
+
+VOC_CLASSES = (
+ "pedestrian",
+)
+"""
VOC_CLASSES = (
"aeroplane",
"bicycle",
@@ -25,3 +30,4 @@
"train",
"tvmonitor",
)
+"""
diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py
index 6e52e6eac..c96e195d0 100644
--- a/yolox/exp/yolox_base.py
+++ b/yolox/exp/yolox_base.py
@@ -275,7 +275,8 @@ def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=Fals
valdataset = COCODataset(
data_dir=self.data_dir,
json_file=self.val_ann if not testdev else self.test_ann,
- name="val2017" if not testdev else "test2017",
+ #name="val2017" if not testdev else "test2017",
+ name="valid" if not testdev else "test",
img_size=self.test_size,
preproc=ValTransform(legacy=legacy),
)
diff --git a/yolox/utils/visualize.py b/yolox/utils/visualize.py
index e714a3ee7..16aa9dee5 100644
--- a/yolox/utils/visualize.py
+++ b/yolox/utils/visualize.py
@@ -9,6 +9,11 @@
def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
+ class_count = {}
+ class_AP = {}
+ for j in class_names:
+ class_count[j] = 0
+ class_AP[j] = 0
for i in range(len(boxes)):
box = boxes[i]
@@ -22,7 +27,7 @@ def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
y1 = int(box[3])
color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist()
- text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100)
+ text = '{:.1f}%'.format(score * 100)#'{}:{:.1f}%'.format(class_names[cls_id], score * 100)
txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255)
font = cv2.FONT_HERSHEY_SIMPLEX
@@ -37,8 +42,28 @@ def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
txt_bk_color,
-1
)
+ class_count[class_names[cls_id]] = class_count[class_names[cls_id]]+1
+ class_AP[class_names[cls_id]] = class_AP[class_names[cls_id]]+float('{:.1f}'.format(score * 100))
cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1)
-
+
+    # Overlay a per-class summary: detection count and the mean confidence of the drawn
+    # boxes (rendered as "AP" on the image, which is not a true average precision).
+    font = cv2.FONT_HERSHEY_SIMPLEX  # `font` above is only set when at least one box passes `conf`
+    x0 = 15
+    y0 = 0
+    row = 0
+ for k in class_count:
+ if((y0+row+50)>=img.shape[0]):
+ x0 = x0+200
+ y0 = 25
+ row = 0
+ else:
+ row = row+25
+ cv2.putText(img, str(k)+": "+str(class_count[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2)
+ if class_count[k] !=0:
+ class_AP[k]=class_AP[k]/class_count[k]
+ else:
+ class_AP[k]=0.0
+ row = row+25
+ cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2)
+
return img
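A small, self-contained way to see the modified `vis` in action: each box now shows only its confidence, and a per-class count plus mean confidence (the value labelled "AP" on the image) is drawn in the corner. The box values and blank image below are made up for illustration:

```python
import numpy as np

from yolox.data.datasets import VOC_CLASSES
from yolox.utils import vis

img = np.zeros((480, 640, 3), dtype=np.uint8)
boxes = np.array([[100.0, 120.0, 220.0, 400.0]])   # x0, y0, x1, y1
scores = np.array([0.87])
cls_ids = np.array([0])                            # index into VOC_CLASSES, i.e. "pedestrian"
out = vis(img, boxes, scores, cls_ids, conf=0.5, class_names=VOC_CLASSES)
print(out.shape)
```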