RelTR code reading #43

Open · wants to merge 1 commit into base: main
6 changes: 6 additions & 0 deletions .lh/.lhignore
@@ -0,0 +1,6 @@
# List of files not to be tracked by the local-history extension. Comment lines start with a '#' character.
# Each line describes a regular expression pattern (search for "JavaScript regex"),
# matched relative to the workspace directory root. For example:
# ".*\.txt" ignores any file with the "txt" extension
# "/test/.*" ignores all files under the "test" directory
# ".*/test/.*" ignores all files under any "test" directory (even in sub-folders)
4 changes: 4 additions & 0 deletions RelTR/.gitignore
@@ -0,0 +1,4 @@
ckpt/
data/
.idea/
__pycache__/
107 changes: 107 additions & 0 deletions RelTR/README.md
@@ -0,0 +1,107 @@
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-U642OoCyb8OSM8nx9lme49dmWa_aUcU?usp=sharing)
# RelTR: Relation Transformer for Scene Graph Generation

We now provide a [[Colab](https://colab.research.google.com/drive/1-U642OoCyb8OSM8nx9lme49dmWa_aUcU?usp=sharing)] demo!

A PyTorch implementation of the paper [**RelTR: Relation Transformer for Scene Graph Generation**](https://arxiv.org/abs/2201.11460).

Unlike most existing advanced approaches, which infer **dense** relationships between all entity proposals, our one-stage method directly generates a **sparse** scene graph by decoding the visual appearance.

<p align="center">
<img src="demo/demo.png">
</p>

# 0. Checklist

- [x] Inference Code :tada:
- [x] Training Code for Visual Genome :tada:
- [x] Evaluation Code for Visual Genome :tada:
- [x] Colab Demo :tada:
- [ ] Training Code for OpenImages V6 :clock9:
- [ ] Evaluation Code for OpenImages V6 :clock9:

# 1. Installation
Download the **RelTR repo** with:
```
git clone https://github.com/yrcong/RelTR.git
cd RelTR
```

## For Inference
:smile: It is super easy to configure the RelTR environment.

If you want to **infer an image**, only Python 3.6, PyTorch 1.6, and matplotlib are required!
You can configure the environment as follows:
```
# create a conda environment
conda create -n reltr python=3.6
conda activate reltr

# install packages
conda install pytorch==1.6.0 torchvision==0.7.0 cudatoolkit=10.1 -c pytorch
conda install matplotlib
```

## Training/Evaluation on Visual Genome
If you want to **train/evaluate** RelTR on Visual Genome, you need a little more preparation:

a) SciPy (we used 1.5.2) and pycocotools are required.
```
conda install scipy
pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
```

b) Download the annotations of [Visual Genome (in COCO-format)](https://drive.google.com/file/d/1aGwEu392DiECGdvwaYr-LgqGLmWhn8yD/view?usp=sharing) and unzip it into the ```data/``` folder.

c) Download the images of VG [Part1](https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip) and [Part2](https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip). Unzip and place all images in the folder ```data/vg/images/```.

d) Some widely used evaluation code (**IoU** computation) needs to be compiled. We will replace it with PyTorch code.
```
# compile the code computing box intersection
cd lib/fpn
sh make.sh
```
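
For reference, the IoU that the compiled extension computes can be expressed in pure PyTorch. A minimal sketch (our illustration of the standard formulation, not the repository's code), for boxes in `[x1, y1, x2, y2]` format:
```python
import torch

def box_iou(boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
    """Pairwise IoU between (N, 4) and (M, 4) boxes in [x1, y1, x2, y2] format -> (N, M)."""
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # (N, M, 2): top-left of intersection
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # (N, M, 2): bottom-right of intersection
    wh = (rb - lt).clamp(min=0)                         # non-overlapping pairs collapse to zero
    inter = wh[..., 0] * wh[..., 1]
    return inter / (area1[:, None] + area2 - inter)
```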

The directory structure looks like:
```
RelTR
│
└───data
│   └───vg
│   │   rel.json
│   │   test.json
│   │   train.json
│   │   val.json
│   │   images
└───datasets
...
```

# 2. Usage

## Inference
a) Download our [RelTR model](https://drive.google.com/file/d/1id6oD_iwiNDD6HyCn2ORgRTIKkPD3tUD/view) pretrained on the Visual Genome dataset and put it under
```
ckpt/checkpoint0149.pth
```
b) Infer the relationships in an image with the command:
```
python inference.py --img_path $IMAGE_PATH --resume $MODEL_PATH
```
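Optionally, sanity-check the checkpoint before running inference. A minimal sketch (the key names are an assumption based on common DETR-style checkpoints, not verified against this file):
```python
import torch

ckpt = torch.load('ckpt/checkpoint0149.pth', map_location='cpu')
print(list(ckpt.keys()))   # assumed to include 'model' (the state dict), possibly 'epoch', 'args', ...
print(len(ckpt['model']))  # number of parameter tensors in the model state dict
```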
We attach 5 images from the **VG** dataset and 1 image from the internet. You can also test with your own images. The result should look like:
<p align="center">
<img src="demo/vg1_pred.png">
</p>

## Training
a) Train RelTR on Visual Genome on a single node with 8 GPUs (2 images per GPU):
```
python -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset vg --img_folder data/vg/images/ --batch_size 2 --output_dir ckpt
```

## Evaluation
Evaluate the pretrained RelTR on Visual Genome with a single GPU (1 image per GPU):
```
python main.py --dataset vg --img_folder data/vg/images/ --eval --batch_size 1 --resume ckpt/checkpoint0149.pth
```
21 changes: 21 additions & 0 deletions RelTR/datasets/__init__.py
@@ -0,0 +1,21 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch.utils.data
import torchvision

from .coco import build as build_coco


def get_coco_api_from_dataset(dataset):
    # unwrap (possibly nested) Subset wrappers to reach the underlying CocoDetection
    for _ in range(10):
        if isinstance(dataset, torch.utils.data.Subset):
            dataset = dataset.dataset
    if isinstance(dataset, torchvision.datasets.CocoDetection):
        return dataset.coco


def build_dataset(image_set, args):
    if args.dataset in ('vg', 'oi'):
        return build_coco(image_set, args)
    raise ValueError(f'dataset {args.dataset} not supported')
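
A hedged usage sketch of the two helpers above: the `Namespace` fields are hypothetical stand-ins mirroring what this diff reads (`args.dataset` here; `args.ann_path`, `args.img_folder`, and `args.eval` in `datasets/coco.py` below), with paths following the README layout:
```python
from argparse import Namespace
import torch.utils.data

from datasets import build_dataset, get_coco_api_from_dataset

# hypothetical arguments; only the fields actually read by this diff are set
args = Namespace(dataset='vg', ann_path='data/vg/', img_folder='data/vg/images/', eval=False)
train_set = build_dataset('train', args)

# get_coco_api_from_dataset unwraps Subset layers to reach the pycocotools COCO object
subset = torch.utils.data.Subset(train_set, list(range(100)))
coco_api = get_coco_api_from_dataset(subset)
```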
182 changes: 182 additions & 0 deletions RelTR/datasets/coco.py
@@ -0,0 +1,182 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Copyright (c) Institute of Information Processing, Leibniz University Hannover.

"""
dataset (COCO-like) which returns image_id for evaluation.

Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
"""
from pathlib import Path
import json
import torch
import torch.utils.data
import torchvision
from pycocotools import mask as coco_mask

import datasets.transforms as T

class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, ann_file, transforms, return_masks):
super(CocoDetection, self).__init__(img_folder, ann_file)
self._transforms = transforms
self.prepare = ConvertCocoPolysToMask(return_masks)

        # load the relationship annotations; rel.json is expected next to the entity annotation file
with open('/'.join(ann_file.split('/')[:-1])+'/rel.json', 'r') as f:
all_rels = json.load(f)
if 'train' in ann_file:
self.rel_annotations = all_rels['train']
elif 'val' in ann_file:
self.rel_annotations = all_rels['val']
else:
self.rel_annotations = all_rels['test']

self.rel_categories = all_rels['rel_categories']

def __getitem__(self, idx):
img, target = super(CocoDetection, self).__getitem__(idx)
image_id = self.ids[idx]
rel_target = self.rel_annotations[str(image_id)]

target = {'image_id': image_id, 'annotations': target, 'rel_annotations': rel_target}

img, target = self.prepare(img, target)
if self._transforms is not None:
img, target = self._transforms(img, target)
return img, target
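    # A sample is thus (img, target): `target` is first tensorized by
    # ConvertCocoPolysToMask below (boxes, labels, rel_annotations, ...) and then
    # passed through the transforms built by make_coco_transforms (resize/flip/normalize).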


def convert_coco_poly_to_mask(segmentations, height, width):
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = torch.as_tensor(mask, dtype=torch.uint8)
mask = mask.any(dim=2)
masks.append(mask)
if masks:
masks = torch.stack(masks, dim=0)
else:
masks = torch.zeros((0, height, width), dtype=torch.uint8)
return masks


class ConvertCocoPolysToMask(object):
def __init__(self, return_masks=False):
self.return_masks = return_masks

def __call__(self, image, target):
w, h = image.size

image_id = target["image_id"]
image_id = torch.tensor([image_id])

anno = target["annotations"]

anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]

boxes = [obj["bbox"] for obj in anno]
# guard against no boxes via resizing
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2].clamp_(min=0, max=w)
boxes[:, 1::2].clamp_(min=0, max=h)

classes = [obj["category_id"] for obj in anno]
classes = torch.tensor(classes, dtype=torch.int64)

if self.return_masks:
segmentations = [obj["segmentation"] for obj in anno]
masks = convert_coco_poly_to_mask(segmentations, h, w)

keypoints = None
if anno and "keypoints" in anno[0]:
keypoints = [obj["keypoints"] for obj in anno]
keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
num_keypoints = keypoints.shape[0]
if num_keypoints:
keypoints = keypoints.view(num_keypoints, -1, 3)

keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
classes = classes[keep]
if self.return_masks:
masks = masks[keep]
if keypoints is not None:
keypoints = keypoints[keep]

        # TODO: add relation ground truth to the target
rel_annotations = target['rel_annotations']

target = {}
target["boxes"] = boxes
target["labels"] = classes
if self.return_masks:
target["masks"] = masks
target["image_id"] = image_id
if keypoints is not None:
target["keypoints"] = keypoints

# for conversion to coco api
area = torch.tensor([obj["area"] for obj in anno])
iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
target["area"] = area[keep]
target["iscrowd"] = iscrowd[keep]

target["orig_size"] = torch.as_tensor([int(h), int(w)])
target["size"] = torch.as_tensor([int(h), int(w)])
        # TODO: add relation ground truth to the target
target['rel_annotations'] = torch.tensor(rel_annotations)

return image, target
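
    # Keys of the returned target: 'boxes' (N, 4) in absolute xyxy format, 'labels' (N,),
    # 'image_id', 'area', 'iscrowd', 'orig_size', 'size', and 'rel_annotations' as an
    # (R, 3) tensor of relation triples (by convention subject index, object index,
    # predicate label; the exact column order is not shown in this diff).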


def make_coco_transforms(image_set):

normalize = T.Compose([
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]

if image_set == 'train':
return T.Compose([
T.RandomHorizontalFlip(),
T.RandomSelect(
T.RandomResize(scales, max_size=1333),
T.Compose([
T.RandomResize([400, 500, 600]),
                #T.RandomSizeCrop(384, 600),  # TODO: cropping can drop boxes, which leaves no tensors for the relation part. What should we do?
T.RandomResize(scales, max_size=1333),
])
),
normalize])

if image_set == 'val':
return T.Compose([
T.RandomResize([800], max_size=1333),
normalize,
])

raise ValueError(f'unknown {image_set}')


def build(image_set, args):

    ann_path = args.ann_path
    img_folder = args.img_folder

    # the Visual Genome annotations are stored in COCO format
    if image_set == 'train':
        ann_file = ann_path + 'train.json'
    elif image_set == 'val':
        # evaluation runs on the test annotations; validation uses val.json
        ann_file = ann_path + 'test.json' if args.eval else ann_path + 'val.json'
    else:
        raise ValueError(f'unknown image_set {image_set}')

    dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=False)
    return dataset
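
To sanity-check the whole pipeline, build a split and inspect one sample. A minimal sketch, assuming the `data/vg/` layout from the README; the `Namespace` fields are hypothetical stand-ins for the real argument parser:
```python
from argparse import Namespace
from datasets.coco import build

args = Namespace(ann_path='data/vg/', img_folder='data/vg/images/', eval=False)
val_set = build('val', args)

img, target = val_set[0]
print(img.shape)                        # e.g. torch.Size([3, 800, ...]) after resizing and ToTensor
print(target['boxes'].shape)            # (N, 4)
print(target['rel_annotations'].shape)  # (R, 3)
```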