Light-weight and Efficient Networks for Mobile Vision Applications
## :rocket: News
* Training and evaluation code along with pre-trained models will be released soon. Stay tuned!
<hr />
> **Abstract:** *Designing lightweight general purpose networks for edge devices is a challenging task due to the compute constraints. In this domain, CNN-based light-weight architectures are considered the de-facto choice due to their efficiency in terms of parameters and complexity. However, they are based on spatially local operations and exhibit a limited receptive field. While vision transformers alleviate these issues and can learn global representations, they are typically compute intensive and difficult to optimize. Here, we investigate how to effectively encode both local and global information, while being efficient in terms of both parameters and MAdds on vision tasks. To this end, we propose EdgeNeXt, a hybrid CNN-Transformer architecture that strives to jointly optimize parameters and MAdds for efficient inference on edge devices. Within our EdgeNeXt, we introduce split depthwise transpose attention (SDTA) encoder that splits input tensors into multiple channel groups and utilizes depthwise convolution along with self-attention across channel dimensions to implicitly increase the receptive field and encode multi-scale features. Our extensive experiments on classification, detection and segmentation settings, reveal the merits of the proposed approach, outperforming state-of-the-art methods with comparatively lower compute requirements. Our EdgeNeXt model with 1.3M parameters achieves 71.2% top-1 accuracy on ImageNet-1K, outperforming MobileViT with an absolute gain of 2.2% with similar parameters and 28% reduction in MAdds. Further, our EdgeNeXt model with 5.6M parameters achieves 79.4% top-1 accuracy on ImageNet-1K.*
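
The abstract describes the core SDTA idea: split the input channels into groups, encode each group with depthwise convolutions, and then apply self-attention across the channel dimension rather than the spatial one. Below is a minimal, unofficial PyTorch sketch of that idea, not the released EdgeNeXt implementation; the module name `SDTASketch`, the split/head counts, and all layer choices are illustrative assumptions.

```python
# Unofficial sketch of the split depthwise transpose attention (SDTA) idea from the
# abstract. Channel groups are encoded with depthwise convolutions, then self-attention
# is computed across the channel dimension (transposed attention). All names and
# hyperparameters here are illustrative, not the paper's exact configuration.
import torch
import torch.nn as nn


class SDTASketch(nn.Module):
    def __init__(self, dim, num_splits=4, num_heads=4):
        super().__init__()
        # One 3x3 depthwise conv per channel group (all but the first split).
        group_dim = dim // num_splits
        self.num_splits = num_splits
        self.num_heads = num_heads
        self.dwconvs = nn.ModuleList(
            nn.Conv2d(group_dim, group_dim, 3, padding=1, groups=group_dim)
            for _ in range(num_splits - 1)
        )
        self.norm = nn.LayerNorm(dim)
        self.qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):                          # x: (B, C, H, W)
        B, C, H, W = x.shape

        # Multi-scale depthwise encoding over channel splits: each branch fuses the
        # previous branch's output before its depthwise conv, widening the receptive field.
        splits = torch.chunk(x, self.num_splits, dim=1)
        out, prev = [splits[0]], splits[0]
        for conv, s in zip(self.dwconvs, splits[1:]):
            prev = conv(s + prev)
            out.append(prev)
        x = torch.cat(out, dim=1)

        # Transposed (channel-wise) self-attention: the attention map is
        # (C/heads x C/heads), independent of the number of spatial tokens.
        tokens = x.flatten(2).transpose(1, 2)      # (B, N, C) with N = H*W
        q, k, v = self.qkv(self.norm(tokens)).chunk(3, dim=-1)

        def to_heads(t):                           # (B, N, C) -> (B, heads, C_head, N)
            return t.reshape(B, -1, self.num_heads, C // self.num_heads).permute(0, 2, 3, 1)

        q, k, v = map(to_heads, (q, k, v))
        q = nn.functional.normalize(q, dim=-1)
        k = nn.functional.normalize(k, dim=-1)
        attn = (q @ k.transpose(-2, -1)).softmax(dim=-1)
        tokens = (attn @ v).permute(0, 3, 1, 2).reshape(B, -1, C)
        tokens = self.proj(tokens)

        # Residual connection back onto the convolutional feature map.
        return x + tokens.transpose(1, 2).reshape(B, C, H, W)


# Example usage with illustrative shapes:
# x = torch.randn(2, 96, 14, 14)
# y = SDTASketch(dim=96, num_splits=4, num_heads=4)(x)   # -> (2, 96, 14, 14)
```

Because the attention is taken over channels, its cost grows with the channel count rather than with the number of spatial positions, which is why this style of global mixing stays affordable at edge-device resolutions.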