From 40505a5b6daa03632691b433197810f528231d26 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Wed, 9 Feb 2022 19:57:13 +0800 Subject: [PATCH 01/27] [Enhancement] Upgrade isort in pre-commit hook (#7130) * upgrade isort to v5.10.1 * replace known_standard_library with extra_standard_library * upgrade isort to v5.10.1 replace known_standard_library with extra_standard_library * imports order changes --- .dev_scripts/benchmark_inference_fps.py | 1 + .pre-commit-config.yaml | 8 ++------ mmdet/core/bbox/assigners/assign_result.py | 1 + mmdet/core/bbox/samplers/sampling_result.py | 4 ++-- mmdet/core/mask/structures.py | 1 + mmdet/datasets/api_wrappers/panoptic_evaluation.py | 2 +- mmdet/datasets/builder.py | 4 ++-- mmdet/datasets/lvis.py | 2 +- mmdet/models/backbones/pvt.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- .../test_pipelines/test_transform/test_img_augment.py | 7 ++++--- tests/test_models/test_forward.py | 3 ++- tests/test_models/test_roi_heads/test_bbox_head.py | 4 ++-- tests/test_models/test_roi_heads/test_mask_head.py | 1 + tests/test_runtime/test_config.py | 4 +++- tests/test_utils/test_anchor.py | 2 ++ tools/analysis_tools/test_robustness.py | 2 +- 18 files changed, 29 insertions(+), 23 deletions(-) diff --git a/.dev_scripts/benchmark_inference_fps.py b/.dev_scripts/benchmark_inference_fps.py index 5d46d52a8e3..81dcd6b1a1f 100644 --- a/.dev_scripts/benchmark_inference_fps.py +++ b/.dev_scripts/benchmark_inference_fps.py @@ -7,6 +7,7 @@ from mmcv import Config, DictAction from mmcv.runner import init_dist from terminaltables import GithubFlavoredMarkdownTable + from tools.analysis_tools.benchmark import repeat_measure_inference_speed diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 13e4cc406c4..8c21d42d8b0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,12 +3,8 @@ repos: rev: 3.8.3 hooks: - id: flake8 - - repo: https://github.com/asottile/seed-isort-config - rev: v2.2.0 - hooks: - - id: seed-isort-config - - repo: https://github.com/timothycrosley/isort - rev: 4.3.21 + - repo: https://github.com/PyCQA/isort + rev: 5.10.1 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-yapf diff --git a/mmdet/core/bbox/assigners/assign_result.py b/mmdet/core/bbox/assigners/assign_result.py index eda0a01d56f..c1a2d5f371f 100644 --- a/mmdet/core/bbox/assigners/assign_result.py +++ b/mmdet/core/bbox/assigners/assign_result.py @@ -140,6 +140,7 @@ def random(cls, **kwargs): labels = None else: import numpy as np + # Create an overlap for each predicted box max_overlaps = torch.from_numpy(rng.rand(num_preds)) diff --git a/mmdet/core/bbox/samplers/sampling_result.py b/mmdet/core/bbox/samplers/sampling_result.py index 6c03c7bb4d7..7d106cbeffd 100644 --- a/mmdet/core/bbox/samplers/sampling_result.py +++ b/mmdet/core/bbox/samplers/sampling_result.py @@ -112,9 +112,9 @@ def random(cls, rng=None, **kwargs): >>> self = SamplingResult.random() >>> print(self.__dict__) """ - from mmdet.core.bbox.samplers.random_sampler import RandomSampler - from mmdet.core.bbox.assigners.assign_result import AssignResult from mmdet.core.bbox import demodata + from mmdet.core.bbox.assigners.assign_result import AssignResult + from mmdet.core.bbox.samplers.random_sampler import RandomSampler rng = demodata.ensure_rng(rng) # make probabalistic? 
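The import reshuffles that make up most of this first patch are what isort 5.10.1 produces once `setuptools` is declared through `extra_standard_library` instead of the removed `known_standard_library` (see the `setup.cfg` hunk below). The same ordering can be reproduced through isort's Python API; the following is only a minimal sketch, assuming isort >= 5.10 is installed, and is not part of the patch itself:

```python
# Minimal sketch: reproduce the ordering enforced by the updated setup.cfg.
# The config values mirror the [isort] section in this patch; the sample
# imports are taken from the sampling_result.py hunk above.
import isort

config = isort.Config(
    line_length=79,
    extra_standard_library=frozenset({'setuptools'}),
    known_first_party=frozenset({'mmdet'}),
)

snippet = (
    'from mmdet.core.bbox.samplers.random_sampler import RandomSampler\n'
    'from mmdet.core.bbox.assigners.assign_result import AssignResult\n'
    'from mmdet.core.bbox import demodata\n'
)

print(isort.code(snippet, config=config))                       # re-sorted imports
print(isort.check_code(snippet, config=config, show_diff=True))  # True once sorted
```

In day-to-day use the pre-commit hook (or a plain `isort .`) applies the same ordering in place.
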
diff --git a/mmdet/core/mask/structures.py b/mmdet/core/mask/structures.py index 8fd30680a67..285e9cc1e7e 100644 --- a/mmdet/core/mask/structures.py +++ b/mmdet/core/mask/structures.py @@ -960,6 +960,7 @@ def _gen_polygon(n, irregularity, spikeyness): a list of vertices, in CCW order. """ from scipy.stats import truncnorm + # Generate around the unit circle cx, cy = (0.0, 0.0) radius = 1 diff --git a/mmdet/datasets/api_wrappers/panoptic_evaluation.py b/mmdet/datasets/api_wrappers/panoptic_evaluation.py index 1a21fe8f098..9b34201e1c5 100644 --- a/mmdet/datasets/api_wrappers/panoptic_evaluation.py +++ b/mmdet/datasets/api_wrappers/panoptic_evaluation.py @@ -11,7 +11,7 @@ import numpy as np try: - from panopticapi.evaluation import PQStat, VOID, OFFSET + from panopticapi.evaluation import OFFSET, VOID, PQStat from panopticapi.utils import rgb2id except ImportError: PQStat = None diff --git a/mmdet/datasets/builder.py b/mmdet/datasets/builder.py index ab0d3a801b5..30e1ee91a05 100644 --- a/mmdet/datasets/builder.py +++ b/mmdet/datasets/builder.py @@ -56,8 +56,8 @@ def _concat_dataset(cfg, default_args=None): def build_dataset(cfg, default_args=None): - from .dataset_wrappers import (ConcatDataset, RepeatDataset, - ClassBalancedDataset, MultiImageMixDataset) + from .dataset_wrappers import (ClassBalancedDataset, ConcatDataset, + MultiImageMixDataset, RepeatDataset) if isinstance(cfg, (list, tuple)): dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) elif cfg['type'] == 'ConcatDataset': diff --git a/mmdet/datasets/lvis.py b/mmdet/datasets/lvis.py index d91138effca..511e31aebec 100644 --- a/mmdet/datasets/lvis.py +++ b/mmdet/datasets/lvis.py @@ -343,7 +343,7 @@ def evaluate(self, warnings.warn( 'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"', # noqa: E501 UserWarning) - from lvis import LVISResults, LVISEval + from lvis import LVISEval, LVISResults except ImportError: raise ImportError( 'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".' 
# noqa: E501 diff --git a/mmdet/models/backbones/pvt.py b/mmdet/models/backbones/pvt.py index 9443273a62e..8b7d5d5344a 100644 --- a/mmdet/models/backbones/pvt.py +++ b/mmdet/models/backbones/pvt.py @@ -158,7 +158,7 @@ def __init__(self, self.norm = build_norm_layer(norm_cfg, embed_dims)[1] # handle the BC-breaking from https://github.com/open-mmlab/mmcv/pull/1418 # noqa - from mmdet import mmcv_version, digit_version + from mmdet import digit_version, mmcv_version if mmcv_version < digit_version('1.3.17'): warnings.warn('The legacy version of forward function in' 'SpatialReductionAttention is deprecated in' diff --git a/setup.cfg b/setup.cfg index 18adf687165..c298a0e6e3b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [isort] line_length = 79 multi_line_output = 0 -known_standard_library = setuptools +extra_standard_library = setuptools known_first_party = mmdet known_third_party = PIL,asynctest,cityscapesscripts,cv2,gather_models,matplotlib,mmcv,numpy,onnx,onnxruntime,pycocotools,pytest,pytorch_sphinx_theme,requests,scipy,seaborn,six,terminaltables,torch,ts,yaml no_lines_before = STDLIB,LOCALFOLDER diff --git a/setup.py b/setup.py index e5cb8bea6fc..6bc78853ac1 100755 --- a/setup.py +++ b/setup.py @@ -67,9 +67,9 @@ def parse_requirements(fname='requirements.txt', with_version=True): CommandLine: python -c "import setup; print(setup.parse_requirements())" """ + import re import sys from os.path import exists - import re require_fpath = fname def parse_line(line): diff --git a/tests/test_data/test_pipelines/test_transform/test_img_augment.py b/tests/test_data/test_pipelines/test_transform/test_img_augment.py index 2a65fc2e81c..f28030e9689 100644 --- a/tests/test_data/test_pipelines/test_transform/test_img_augment.py +++ b/tests/test_data/test_pipelines/test_transform/test_img_augment.py @@ -45,7 +45,7 @@ def test_imequalize(nb_rand_test=100): def _imequalize(img): # equalize the image using PIL.ImageOps.equalize - from PIL import ImageOps, Image + from PIL import Image, ImageOps img = Image.fromarray(img) equalized_img = np.asarray(ImageOps.equalize(img)) return equalized_img @@ -81,8 +81,8 @@ def test_adjust_brightness(nb_rand_test=100): def _adjust_brightness(img, factor): # adjust the brightness of image using # PIL.ImageEnhance.Brightness - from PIL.ImageEnhance import Brightness from PIL import Image + from PIL.ImageEnhance import Brightness img = Image.fromarray(img) brightened_img = Brightness(img).enhance(factor) return np.asarray(brightened_img) @@ -124,8 +124,9 @@ def _adjust_brightness(img, factor): def test_adjust_contrast(nb_rand_test=100): def _adjust_contrast(img, factor): - from PIL.ImageEnhance import Contrast from PIL import Image + from PIL.ImageEnhance import Contrast + # Image.fromarray defaultly supports RGB, not BGR. 
# convert from BGR to RGB img = Image.fromarray(img[..., ::-1], mode='RGB') diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py index 9e51c17176a..db75b2fd418 100644 --- a/tests/test_models/test_forward.py +++ b/tests/test_models/test_forward.py @@ -616,9 +616,10 @@ def test_detr_forward(): def test_inference_detector(): + from mmcv import ConfigDict + from mmdet.apis import inference_detector from mmdet.models import build_detector - from mmcv import ConfigDict # small RetinaNet num_class = 3 diff --git a/tests/test_models/test_roi_heads/test_bbox_head.py b/tests/test_models/test_roi_heads/test_bbox_head.py index 07753b83470..e839d0672c5 100644 --- a/tests/test_models/test_roi_heads/test_bbox_head.py +++ b/tests/test_models/test_roi_heads/test_bbox_head.py @@ -213,8 +213,8 @@ def _demodata_refine_boxes(n_roi, n_img, rng=0): """Create random test data for the ``mmdet.models.bbox_heads.bbox_head.BBoxHead.refine_boxes`` method.""" import numpy as np - from mmdet.core.bbox.demodata import random_boxes - from mmdet.core.bbox.demodata import ensure_rng + + from mmdet.core.bbox.demodata import ensure_rng, random_boxes try: import kwarray except ImportError: diff --git a/tests/test_models/test_roi_heads/test_mask_head.py b/tests/test_models/test_roi_heads/test_mask_head.py index 0aa20099a72..89a476dc7a1 100644 --- a/tests/test_models/test_roi_heads/test_mask_head.py +++ b/tests/test_models/test_roi_heads/test_mask_head.py @@ -30,6 +30,7 @@ def test_mask_head_loss(): # create dummy mask import numpy as np + from mmdet.core import BitmapMasks dummy_mask = np.random.randint(0, 2, (1, 160, 240), dtype=np.uint8) gt_masks = [BitmapMasks(dummy_mask, 160, 240)] diff --git a/tests/test_runtime/test_config.py b/tests/test_runtime/test_config.py index 2f9c33ba087..dce88f41bf4 100644 --- a/tests/test_runtime/test_config.py +++ b/tests/test_runtime/test_config.py @@ -115,6 +115,7 @@ def _check_roi_head(config, head): def _check_roi_extractor(config, roi_extractor, prev_roi_extractor=None): import torch.nn as nn + # Separate roi_extractor and prev_roi_extractor checks for flexibility if isinstance(roi_extractor, nn.ModuleList): roi_extractor = roi_extractor[0] @@ -259,9 +260,10 @@ def test_config_data_pipeline(config_rpath): xdoctest -m tests/test_runtime/ test_config.py test_config_build_data_pipeline """ + import numpy as np from mmcv import Config + from mmdet.datasets.pipelines import Compose - import numpy as np config_dpath = _get_config_directory() print(f'Found config_dpath = {config_dpath}') diff --git a/tests/test_utils/test_anchor.py b/tests/test_utils/test_anchor.py index 83a1befb290..a9aef721203 100644 --- a/tests/test_utils/test_anchor.py +++ b/tests/test_utils/test_anchor.py @@ -11,6 +11,7 @@ def test_standard_points_generator(): from mmdet.core.anchor import build_prior_generator + # teat init anchor_generator_cfg = dict( type='MlvlPointGenerator', strides=[4, 8], offset=0) @@ -286,6 +287,7 @@ def test_standard_anchor_generator(): def test_strides(): from mmdet.core import AnchorGenerator + # Square strides self = AnchorGenerator([10], [1.], [1.], [10]) anchors = self.grid_anchors([(2, 2)], device='cpu') diff --git a/tools/analysis_tools/test_robustness.py b/tools/analysis_tools/test_robustness.py index 29cb41da657..0c1ddbeec54 100644 --- a/tools/analysis_tools/test_robustness.py +++ b/tools/analysis_tools/test_robustness.py @@ -12,13 +12,13 @@ wrap_fp16_model) from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval -from 
tools.analysis_tools.robustness_eval import get_results from mmdet import datasets from mmdet.apis import multi_gpu_test, set_random_seed, single_gpu_test from mmdet.core import eval_map from mmdet.datasets import build_dataloader, build_dataset from mmdet.models import build_detector +from tools.analysis_tools.robustness_eval import get_results def coco_eval_with_return(result_files, From 82e5cce9ad550571f3a1c55c29203c09682a0079 Mon Sep 17 00:00:00 2001 From: LuooChen <33743370+LuooChen@users.noreply.github.com> Date: Wed, 9 Feb 2022 19:57:49 +0800 Subject: [PATCH 02/27] [Fix] cannot to save the best checkpoint when the key_score is None (#7101) --- mmdet/core/evaluation/eval_hooks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mmdet/core/evaluation/eval_hooks.py b/mmdet/core/evaluation/eval_hooks.py index a20e2cff00b..7c1fbe968d2 100644 --- a/mmdet/core/evaluation/eval_hooks.py +++ b/mmdet/core/evaluation/eval_hooks.py @@ -56,7 +56,9 @@ def _do_evaluate(self, runner): results = single_gpu_test(runner.model, self.dataloader, show=False) runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.evaluate(runner, results) - if self.save_best: + # the key_score may be `None` so it needs to skip the action to save + # the best checkpoint + if self.save_best and key_score: self._save_ckpt(runner, key_score) @@ -122,5 +124,7 @@ def _do_evaluate(self, runner): runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.evaluate(runner, results) - if self.save_best: + # the key_score may be `None` so it needs to skip + # the action to save the best checkpoint + if self.save_best and key_score: self._save_ckpt(runner, key_score) From c9e1906f00505621b0247ab364feb0ca862fa98c Mon Sep 17 00:00:00 2001 From: Daniel van Sabben Alsina <13849741+dvansa@users.noreply.github.com> Date: Wed, 9 Feb 2022 13:00:28 +0100 Subject: [PATCH 03/27] [Fix] Fix MixUp transform filter boxes failing case. 
Added test case (#7080) --- mmdet/datasets/pipelines/transforms.py | 5 ++-- .../test_transform/test_transform.py | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/mmdet/datasets/pipelines/transforms.py b/mmdet/datasets/pipelines/transforms.py index 06c27bfa8c3..fb51922886f 100644 --- a/mmdet/datasets/pipelines/transforms.py +++ b/mmdet/datasets/pipelines/transforms.py @@ -2426,9 +2426,8 @@ def _mixup_transform(self, results): keep_list = self._filter_box_candidates(retrieve_gt_bboxes.T, cp_retrieve_gt_bboxes.T) - if keep_list.sum() >= 1.0: - retrieve_gt_labels = retrieve_gt_labels[keep_list] - cp_retrieve_gt_bboxes = cp_retrieve_gt_bboxes[keep_list] + retrieve_gt_labels = retrieve_gt_labels[keep_list] + cp_retrieve_gt_bboxes = cp_retrieve_gt_bboxes[keep_list] mixup_gt_bboxes = np.concatenate( (results['gt_bboxes'], cp_retrieve_gt_bboxes), axis=0) diff --git a/tests/test_data/test_pipelines/test_transform/test_transform.py b/tests/test_data/test_pipelines/test_transform/test_transform.py index ba848dad9a8..d256ef1bb68 100644 --- a/tests/test_data/test_pipelines/test_transform/test_transform.py +++ b/tests/test_data/test_pipelines/test_transform/test_transform.py @@ -967,6 +967,33 @@ def test_mixup(): assert results['gt_bboxes'].dtype == np.float32 assert results['gt_bboxes_ignore'].dtype == np.float32 + # test filter bbox : + # 2 boxes with sides 1 and 3 are filtered as min_bbox_size=5 + gt_bboxes = np.array([[0, 0, 1, 1], [0, 0, 3, 3]], dtype=np.float32) + results['gt_labels'] = np.ones(gt_bboxes.shape[0], dtype=np.int64) + results['gt_bboxes'] = gt_bboxes + results['gt_bboxes_ignore'] = np.array([], dtype=np.float32) + mixresults = results['mix_results'][0] + mixresults['gt_labels'] = copy.deepcopy(results['gt_labels']) + mixresults['gt_bboxes'] = copy.deepcopy(results['gt_bboxes']) + mixresults['gt_bboxes_ignore'] = copy.deepcopy(results['gt_bboxes_ignore']) + transform = dict( + type='MixUp', + img_scale=(10, 12), + ratio_range=(1.5, 1.5), + min_bbox_size=5, + skip_filter=False) + mixup_module = build_from_cfg(transform, PIPELINES) + + results = mixup_module(results) + + assert results['gt_bboxes'].shape[0] == 2 + assert results['gt_labels'].shape[0] == 2 + assert results['gt_labels'].shape[0] == results['gt_bboxes'].shape[0] + assert results['gt_labels'].dtype == np.int64 + assert results['gt_bboxes'].dtype == np.float32 + assert results['gt_bboxes_ignore'].dtype == np.float32 + def test_photo_metric_distortion(): img = mmcv.imread( From 951996c51d0d0ac7f8eada84a0cc64bf65ba590e Mon Sep 17 00:00:00 2001 From: jbwang1997 Date: Wed, 9 Feb 2022 20:03:02 +0800 Subject: [PATCH 04/27] [Fix] Update the version limitation of mmcv-full and pytorch in CI. 
(#7133) * Update * Update build.yml * Update build.yml --- .github/workflows/build.yml | 20 ++++++++++---------- docs/en/get_started.md | 7 +++++++ docs/zh_cn/get_started.md | 7 +++++++ 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4666f244981..08f0de96f1a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -37,19 +37,19 @@ jobs: include: - torch: 1.5.1 torchvision: 0.6.1 - mmcv: 1.5.0 + mmcv: 1.5 - torch: 1.6.0 torchvision: 0.7.0 - mmcv: 1.6.0 + mmcv: 1.6 - torch: 1.7.0 torchvision: 0.8.1 - mmcv: 1.7.0 + mmcv: 1.7 - torch: 1.8.0 torchvision: 0.9.0 - mmcv: 1.8.0 + mmcv: 1.8 - torch: 1.9.0 torchvision: 0.10.0 - mmcv: 1.9.0 + mmcv: 1.9 steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} @@ -91,19 +91,19 @@ jobs: - torch: 1.5.1+cu101 torch_version: torch1.5.1 torchvision: 0.6.1+cu101 - mmcv: 1.5.0 + mmcv: 1.5 - torch: 1.6.0+cu101 torch_version: torch1.6.0 torchvision: 0.7.0+cu101 - mmcv: 1.6.0 + mmcv: 1.6 - torch: 1.7.0+cu101 torch_version: torch1.7.0 torchvision: 0.8.1+cu101 - mmcv: 1.7.0 + mmcv: 1.7 - torch: 1.8.0+cu101 torch_version: torch1.8.0 torchvision: 0.9.0+cu101 - mmcv: 1.8.0 + mmcv: 1.8 steps: - uses: actions/checkout@v2 @@ -165,7 +165,7 @@ jobs: - torch: 1.9.0+cu102 torch_version: torch1.9.0 torchvision: 0.10.0+cu102 - mmcv: 1.9.0 + mmcv: 1.9 steps: - uses: actions/checkout@v2 diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 153ecfa85e3..9fbee42127d 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -107,6 +107,13 @@ Or you can still install MMDetection manually: Optionally you can compile mmcv from source if you need to develop both mmcv and mmdet. Refer to the [guide](https://github.com/open-mmlab/mmcv#installation) for details. + mmcv-full is only compiled on PyTorch 1.x.0 because the compatibility usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you can install mmcv-full compiled with PyTorch 1.x.0 and it usually works well. + + ``` + # We can ignore the micro version of PyTorch + pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7/index.html + ``` + 2. Install MMDetection. You can simply install mmdetection with the following command: diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index 8408007c707..8c74b0afbf9 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -115,6 +115,13 @@ MIM 能够自动地安装 OpenMMLab 的项目以及对应的依赖包。 pip install mmcv-full ``` + PyTorch 在 1.x.0 和 1.x.1 之间通常是兼容的,故 mmcv-full 只提供 1.x.0 的编译包。如果你的 PyTorch 版本是 1.x.1,你可以放心地安装在 1.x.0 版本编译的 mmcv-full。 + + ``` + # 我们可以忽略 PyTorch 的小版本号 + pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7/index.html + ``` + 2. 
安装 MMDetection: 你可以直接通过如下命令从 pip 安装使用 mmdetection: From ffff556edc5a96ce72ce5b4d27d1fbcaa0d22122 Mon Sep 17 00:00:00 2001 From: Yosuke Shinya <42844407+shinya7y@users.noreply.github.com> Date: Fri, 11 Feb 2022 17:23:44 +0900 Subject: [PATCH 05/27] [Feature] Support TIMMBackbone (#7020) * add TIMMBackbone based on https://github.com/open-mmlab/mmclassification/pull/427 https://github.com/open-mmlab/mmsegmentation/pull/998 * update and clean * fix unit test * Revert * add example configs --- configs/timm_example/README.md | 62 +++++++++++++++++++ ...inanet_timm_efficientnet_b1_fpn_1x_coco.py | 20 ++++++ .../retinanet_timm_tv_resnet50_fpn_1x_coco.py | 19 ++++++ requirements/optional.txt | 1 + 4 files changed, 102 insertions(+) create mode 100644 configs/timm_example/README.md create mode 100644 configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py create mode 100644 configs/timm_example/retinanet_timm_tv_resnet50_fpn_1x_coco.py diff --git a/configs/timm_example/README.md b/configs/timm_example/README.md new file mode 100644 index 00000000000..0eb30cb5c08 --- /dev/null +++ b/configs/timm_example/README.md @@ -0,0 +1,62 @@ +# Timm Example + +> [PyTorch Image Models](https://github.com/rwightman/pytorch-image-models) + + + +## Abstract + +Py**T**orch **Im**age **M**odels (`timm`) is a collection of image models, layers, utilities, optimizers, schedulers, data-loaders / augmentations, and reference training / validation scripts that aim to pull together a wide variety of SOTA models with ability to reproduce ImageNet training results. + + + +## Results and Models + +### RetinaNet + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:---------------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| R-50 | pytorch | 1x | | | | [config](./retinanet_timm_tv_resnet50_fpn_1x_coco.py) | | +| EfficientNet-B1 | - | 1x | | | | [config](./retinanet_timm_efficientnet_b1_fpn_1x_coco.py) | | + +## Usage + +### Install additional requirements + +MMDetection supports timm backbones via `TIMMBackbone`, a wrapper class in MMClassification. +Thus, you need to install `mmcls` in addition to timm. +If you have already installed requirements for mmdet, run + +```shell +pip install 'dataclasses; python_version<"3.7"' +pip install timm +pip install 'mmcls>=0.20.0' +``` + +See [this document](https://mmclassification.readthedocs.io/en/latest/install.html) for the details of MMClassification installation. + +### Edit config + +* See example configs for basic usage. +* See the documents of [timm feature extraction](https://rwightman.github.io/pytorch-image-models/feature_extraction/#multi-scale-feature-maps-feature-pyramid) and [TIMMBackbone](https://mmclassification.readthedocs.io/en/latest/api.html#mmcls.models.backbones.TIMMBackbone) for details. +* Which feature map is output depends on the backbone. + Please check `backbone out_channels` and `backbone out_strides` in your log, and modify `model.neck.in_channels` and `model.backbone.out_indices` if necessary. +* If you use Vision Transformer models that do not support `features_only=True`, add `custom_hooks = []` to your config to disable `NumClassCheckHook`. 
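
The `out_channels` and `out_strides` that the list above refers to can also be queried from timm directly before touching the config. The snippet below is a rough helper, assuming `timm` is installed and using `efficientnet_b1` purely as an example; it is illustrative rather than part of the patch:

```python
# Inspect which feature maps a timm backbone exposes, to decide
# model.backbone.out_indices and model.neck.in_channels in the config.
import timm

m = timm.create_model('efficientnet_b1', features_only=True, pretrained=False)
print(m.feature_info.channels())    # e.g. [16, 24, 40, 112, 320]
print(m.feature_info.reduction())   # e.g. [2, 4, 8, 16, 32] (strides)

# Keeping out_indices=(1, 2, 3, 4) selects the stride 4/8/16/32 maps, which is
# why the example config below sets neck.in_channels=[24, 40, 112, 320].
```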
+ +## Citation + +```latex +@misc{rw2019timm, + author = {Ross Wightman}, + title = {PyTorch Image Models}, + year = {2019}, + publisher = {GitHub}, + journal = {GitHub repository}, + doi = {10.5281/zenodo.4414861}, + howpublished = {\url{https://github.com/rwightman/pytorch-image-models}} +} +``` diff --git a/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py b/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py new file mode 100644 index 00000000000..65001167cbf --- /dev/null +++ b/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py @@ -0,0 +1,20 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# please install mmcls>=0.20.0 +# import mmcls.models to trigger register_module in mmcls +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) +model = dict( + backbone=dict( + _delete_=True, + type='mmcls.TIMMBackbone', + model_name='efficientnet_b1', + features_only=True, + pretrained=True, + out_indices=(1, 2, 3, 4)), + neck=dict(in_channels=[24, 40, 112, 320])) + +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/timm_example/retinanet_timm_tv_resnet50_fpn_1x_coco.py b/configs/timm_example/retinanet_timm_tv_resnet50_fpn_1x_coco.py new file mode 100644 index 00000000000..0c5b7a89f65 --- /dev/null +++ b/configs/timm_example/retinanet_timm_tv_resnet50_fpn_1x_coco.py @@ -0,0 +1,19 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# please install mmcls>=0.20.0 +# import mmcls.models to trigger register_module in mmcls +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) +model = dict( + backbone=dict( + _delete_=True, + type='mmcls.TIMMBackbone', + model_name='tv_resnet50', # ResNet-50 with torchvision weights + features_only=True, + pretrained=True, + out_indices=(1, 2, 3, 4))) + +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/requirements/optional.txt b/requirements/optional.txt index da554cf35ec..6782747cd96 100644 --- a/requirements/optional.txt +++ b/requirements/optional.txt @@ -2,3 +2,4 @@ cityscapesscripts imagecorruptions scipy sklearn +timm From 0c97cb3bf0ab7fba5f6d0d663f4a7fd23cd66719 Mon Sep 17 00:00:00 2001 From: siatwangmin Date: Sun, 13 Feb 2022 17:32:54 +0800 Subject: [PATCH 06/27] Create 2_new_data_model.md (#6476) fix some typo Co-authored-by: PJLAB\huanghaian <1286304229@qq.com> --- docs/zh_cn/2_new_data_model.md | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/zh_cn/2_new_data_model.md b/docs/zh_cn/2_new_data_model.md index 6973404e411..e691a6df153 100644 --- a/docs/zh_cn/2_new_data_model.md +++ b/docs/zh_cn/2_new_data_model.md @@ -4,17 +4,17 @@ 基本步骤如下: -1. 准备自定义数据集 -2. 准备配置文件 -3. 在自定义数据集上进行训练,测试和推理。 +1. 准备自定义数据集 +2. 准备配置文件 +3. 在自定义数据集上进行训练,测试和推理。 ## 准备自定义数据集 MMDetection 一共支持三种形式应用新数据集: -1. 将数据集重新组织为 COCO 格式。 -2. 将数据集重新组织为一个中间格式。 -3. 实现一个新的数据集。 +1. 将数据集重新组织为 COCO 格式。 +2. 将数据集重新组织为一个中间格式。 +3. 
实现一个新的数据集。 我们通常建议使用前面两种方法,因为它们通常来说比第三种方法要简单。 @@ -27,6 +27,7 @@ MMDetection 一共支持三种形式应用新数据集: ### COCO标注格式 用于实例分割的 COCO 数据集格式如下所示,其中的键(key)都是必要的,参考[这里](https://cocodataset.org/#format-data)来获取更多细节。 + ```json { "images": [image], @@ -58,6 +59,7 @@ categories = [{ "supercategory": str, }] ``` + 现在假设我们使用 balloon dataset。 下载了数据集之后,我们需要实现一个函数将标注格式转化为 COCO 格式。然后我们就可以使用已经实现的 `COCODataset` 类来加载数据并进行训练以及评测。 @@ -147,6 +149,7 @@ categories = [{ 'name': 'polygon'}}}, 'size': 1115004} ``` + 标注文件时是 JSON 格式的,其中所有键(key)组成了一张图片的所有标注。 其中将 balloon dataset 转化为 COCO 格式的代码如下所示。 @@ -204,14 +207,13 @@ def convert_balloon_to_coco(ann_file, out_file, image_prefix): annotations=annotations, categories=[{'id':0, 'name': 'balloon'}]) mmcv.dump(coco_format_json, out_file) - ``` 使用如上的函数,用户可以成功将标注文件转化为 JSON 格式,之后可以使用 `CocoDataset` 对模型进行训练和评测。 ## 准备配置文件 -第二步需要准备一个配置文件来成功加载数据集。假设我们想要用 balloon dataset 来训练配备了 FPN 的 Mask R-CNN ,如下是我们的配置文件。假设配置文件命名为 `mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_balloon.py`,相应保存路径为 `configs/ballon/`,配置文件内容如下所示。 +第二步需要准备一个配置文件来成功加载数据集。假设我们想要用 balloon dataset 来训练配备了 FPN 的 Mask R-CNN ,如下是我们的配置文件。假设配置文件命名为 `mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_balloon.py`,相应保存路径为 `configs/balloon/`,配置文件内容如下所示。 ```python # 这个新的配置文件继承自一个原始配置文件,只需要突出必要的修改部分即可 @@ -224,7 +226,7 @@ model = dict( mask_head=dict(num_classes=1))) # 修改数据集相关设置 -dataset_type = 'COCODataset' +dataset_type = 'CocoDataset' classes = ('balloon',) data = dict( train=dict( From 723a7a4bb1735a2426e78b0c53a8622fcd69a70b Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Sun, 13 Feb 2022 17:33:18 +0800 Subject: [PATCH 07/27] [FIX] add Ci of pytorch 1.10 and comments for bbox clamp (#7081) (#7083) * add comments for bbox clamp * add CI of pytorch1.10 * add ci of pytorch1.10.1 * mmcv1.9.0->mmcv1.9 * add ci of pytorch1.10 --- .github/workflows/build.yml | 11 +++++++++-- mmdet/models/dense_heads/autoassign_head.py | 3 +++ mmdet/models/dense_heads/fcos_head.py | 3 +++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 08f0de96f1a..85ad6be0a69 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,7 +33,7 @@ jobs: strategy: matrix: python-version: [3.7] - torch: [1.5.1, 1.6.0, 1.7.0, 1.8.0, 1.9.0] + torch: [1.5.1, 1.6.0, 1.7.0, 1.8.0, 1.9.0, 1.10.1] include: - torch: 1.5.1 torchvision: 0.6.1 @@ -50,6 +50,9 @@ jobs: - torch: 1.9.0 torchvision: 0.10.0 mmcv: 1.9 + - torch: 1.10.1 + torchvision: 0.11.2 + mmcv: 1.10 steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} @@ -160,12 +163,16 @@ jobs: strategy: matrix: python-version: [3.6, 3.7, 3.8, 3.9] - torch: [1.9.0+cu102] + torch: [1.9.0+cu102, 1.10.1+cu102] include: - torch: 1.9.0+cu102 torch_version: torch1.9.0 torchvision: 0.10.0+cu102 mmcv: 1.9 + - torch: 1.10.1+cu102 + torch_version: torch1.10.1 + torchvision: 0.11.2+cu102 + mmcv: 1.10 steps: - uses: actions/checkout@v2 diff --git a/mmdet/models/dense_heads/autoassign_head.py b/mmdet/models/dense_heads/autoassign_head.py index 347c9b1f9d7..446da244b9e 100644 --- a/mmdet/models/dense_heads/autoassign_head.py +++ b/mmdet/models/dense_heads/autoassign_head.py @@ -197,6 +197,9 @@ def forward_single(self, x, scale, stride): # scale the bbox_pred of different level # float to avoid overflow when enabling FP16 bbox_pred = scale(bbox_pred).float() + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. 
So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) bbox_pred = bbox_pred.clamp(min=0) bbox_pred *= stride return cls_score, bbox_pred, centerness diff --git a/mmdet/models/dense_heads/fcos_head.py b/mmdet/models/dense_heads/fcos_head.py index 8d8b85a6237..d72fb56caa1 100644 --- a/mmdet/models/dense_heads/fcos_head.py +++ b/mmdet/models/dense_heads/fcos_head.py @@ -154,6 +154,9 @@ def forward_single(self, x, scale, stride): # float to avoid overflow when enabling FP16 bbox_pred = scale(bbox_pred).float() if self.norm_on_bbox: + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) bbox_pred = bbox_pred.clamp(min=0) if not self.training: bbox_pred *= stride From caa2abb3918171e1146f705cff1d0b299757b810 Mon Sep 17 00:00:00 2001 From: del-zhenwu Date: Wed, 16 Feb 2022 08:58:28 +0800 Subject: [PATCH 08/27] Add daily issue owners (#7163) * Add code owners Signed-off-by: del-zhenwu * Update code owners Signed-off-by: del-zhenwu --- .owners.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .owners.yml diff --git a/.owners.yml b/.owners.yml new file mode 100644 index 00000000000..97296aabf69 --- /dev/null +++ b/.owners.yml @@ -0,0 +1,14 @@ +assign: + strategy: + # random + daily-shift-based + scedule: + '*/1 * * * *' + assignees: + - Czm369 + - hhaAndroid + - jbwang1997 + - RangiLyu + - BIGWangYuDong + - chhluo + - ZwwWayne From 301d4a2d4cfe1cdb62608e2892924be3e67e3098 Mon Sep 17 00:00:00 2001 From: Guangchen Lin <347630870@qq.com> Date: Wed, 16 Feb 2022 19:55:55 +0800 Subject: [PATCH 09/27] [Feature] Support visualization for Panoptic Segmentation (#7041) * First commit of v2 * split the functions * Support to show panoptic result * temp * Support to show gt * support show gt * fix lint * Support to browse datasets * Fix unit tests * Fix findContours * fix comments * Fix pre-commit * fix lint * Add the type of an argument --- mmdet/apis/inference.py | 2 +- mmdet/core/evaluation/__init__.py | 3 +- mmdet/core/evaluation/panoptic_utils.py | 6 + mmdet/core/mask/structures.py | 30 ++ mmdet/core/visualization/image.py | 397 ++++++++++++++---- mmdet/core/visualization/palette.py | 41 +- mmdet/datasets/coco_panoptic.py | 44 +- mmdet/datasets/pipelines/__init__.py | 5 +- mmdet/datasets/utils.py | 6 +- .../detectors/panoptic_two_stage_segmentor.py | 78 +++- .../heuristic_fusion_head.py | 2 +- tests/test_utils/test_visualization.py | 34 +- tools/analysis_tools/analyze_results.py | 6 + tools/misc/browse_dataset.py | 32 +- 14 files changed, 546 insertions(+), 140 deletions(-) create mode 100644 mmdet/core/evaluation/panoptic_utils.py diff --git a/mmdet/apis/inference.py b/mmdet/apis/inference.py index 495327a9802..7c1ddd0c4e7 100644 --- a/mmdet/apis/inference.py +++ b/mmdet/apis/inference.py @@ -238,5 +238,5 @@ def show_result_pyplot(model, wait_time=wait_time, win_name=title, bbox_color=palette, - text_color=palette, + text_color=(200, 200, 200), mask_color=palette) diff --git a/mmdet/core/evaluation/__init__.py b/mmdet/core/evaluation/__init__.py index 888af6196d9..0aa94c9fe0e 100644 --- a/mmdet/core/evaluation/__init__.py +++ b/mmdet/core/evaluation/__init__.py @@ -4,6 +4,7 @@ imagenet_vid_classes, voc_classes) from .eval_hooks import DistEvalHook, EvalHook from .mean_ap import average_precision, eval_map, print_map_summary +from .panoptic_utils import INSTANCE_OFFSET from .recall import (eval_recalls, plot_iou_recall, plot_num_recall, 
print_recall_summary) @@ -12,5 +13,5 @@ 'coco_classes', 'cityscapes_classes', 'dataset_aliases', 'get_classes', 'DistEvalHook', 'EvalHook', 'average_precision', 'eval_map', 'print_map_summary', 'eval_recalls', 'print_recall_summary', - 'plot_num_recall', 'plot_iou_recall' + 'plot_num_recall', 'plot_iou_recall', 'INSTANCE_OFFSET' ] diff --git a/mmdet/core/evaluation/panoptic_utils.py b/mmdet/core/evaluation/panoptic_utils.py new file mode 100644 index 00000000000..10c9ad934e0 --- /dev/null +++ b/mmdet/core/evaluation/panoptic_utils.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# A custom value to distinguish instance ID and category ID; need to +# be greater than the number of categories. +# For a pixel in the panoptic result map: +# pan_id = ins_id * INSTANCE_OFFSET + cat_id +INSTANCE_OFFSET = 1000 diff --git a/mmdet/core/mask/structures.py b/mmdet/core/mask/structures.py index 285e9cc1e7e..a9d0ebb4ba4 100644 --- a/mmdet/core/mask/structures.py +++ b/mmdet/core/mask/structures.py @@ -1070,3 +1070,33 @@ def polygon_to_bitmap(polygons, height, width): rle = maskUtils.merge(rles) bitmap_mask = maskUtils.decode(rle).astype(np.bool) return bitmap_mask + + +def bitmap_to_polygon(bitmap): + """Convert masks from the form of bitmaps to polygons. + + Args: + bitmap (ndarray): masks in bitmap representation. + + Return: + list[ndarray]: the converted mask in polygon representation. + bool: whether the mask has holes. + """ + bitmap = np.ascontiguousarray(bitmap).astype(np.uint8) + # cv2.RETR_CCOMP: retrieves all of the contours and organizes them + # into a two-level hierarchy. At the top level, there are external + # boundaries of the components. At the second level, there are + # boundaries of the holes. If there is another contour inside a hole + # of a connected component, it is still put at the top level. + # cv2.CHAIN_APPROX_NONE: stores absolutely all the contour points. + outs = cv2.findContours(bitmap, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) + contours = outs[-2] + hierarchy = outs[-1] + if hierarchy is None: + return [], False + # hierarchy[i]: 4 elements, for the indexes of next, previous, + # parent, or nested contours. If there is no corresponding contour, + # it will be -1. + with_hole = (hierarchy.reshape(-1, 4)[:, 3] >= 0).any() + contours = [c.reshape(-1, 2) for c in contours] + return contours, with_hole diff --git a/mmdet/core/visualization/image.py b/mmdet/core/visualization/image.py index fd883de4250..c574b2d46fe 100644 --- a/mmdet/core/visualization/image.py +++ b/mmdet/core/visualization/image.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import cv2 import matplotlib.pyplot as plt import mmcv import numpy as np @@ -6,18 +7,25 @@ from matplotlib.collections import PatchCollection from matplotlib.patches import Polygon +from mmdet.core.evaluation.panoptic_utils import INSTANCE_OFFSET +from ..mask.structures import bitmap_to_polygon from ..utils import mask2ndarray from .palette import get_palette, palette_val +__all__ = [ + 'color_val_matplotlib', 'draw_masks', 'draw_bboxes', 'draw_labels', + 'imshow_det_bboxes', 'imshow_gt_det_bboxes' +] + EPS = 1e-2 def color_val_matplotlib(color): """Convert various input in BGR order to normalized RGB matplotlib color - tuples, + tuples. Args: - color (:obj:`Color`/str/tuple/int/ndarray): Color inputs + color (:obj`Color` | str | tuple | int | ndarray): Color inputs. Returns: tuple[float]: A tuple of 3 normalized floats indicating RGB channels. 
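
`INSTANCE_OFFSET` packs the instance id and the category id of every panoptic pixel into a single integer, `pan_id = ins_id * INSTANCE_OFFSET + cat_id`. The sketch below, built on a hypothetical toy map rather than real model output, mirrors how the visualization code later in this patch decodes such a map back into labels and per-segment masks:

```python
# Decode a panoptic map encoded as pan_id = ins_id * INSTANCE_OFFSET + cat_id.
# The map size and the category ids here are made up for the demo.
import numpy as np

INSTANCE_OFFSET = 1000
num_classes = 133                                 # e.g. COCO panoptic (80 thing + 53 stuff)

pan_results = np.full((4, 4), num_classes)        # toy map filled with the VOID label
pan_results[:2, :2] = 1 * INSTANCE_OFFSET + 0     # instance 1 of thing category 0
pan_results[2:, 2:] = 17                          # a stuff region of category 17

ids = np.unique(pan_results)[::-1]                # keep objects ahead
ids = ids[ids != num_classes]                     # drop the VOID label
labels = ids % INSTANCE_OFFSET                    # category id per segment
instance_ids = ids // INSTANCE_OFFSET             # 0 for stuff, > 0 for things
masks = pan_results[None] == ids[:, None, None]   # (n, H, W) boolean masks
```
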
@@ -27,9 +35,177 @@ def color_val_matplotlib(color): return tuple(color) +def _get_adaptive_scales(areas, min_area=800, max_area=30000): + """Get adaptive scales according to areas. + + The scale range is [0.5, 1.0]. When the area is less than + ``'min_area'``, the scale is 0.5 while the area is larger than + ``'max_area'``, the scale is 1.0. + + Args: + areas (ndarray): The areas of bboxes or masks with the + shape of (n, ). + min_area (int): Lower bound areas for adaptive scales. + Default: 800. + max_area (int): Upper bound areas for adaptive scales. + Default: 30000. + + Returns: + ndarray: The adaotive scales with the shape of (n, ). + """ + scales = 0.5 + (areas - min_area) / (max_area - min_area) + scales = np.clip(scales, 0.5, 1.0) + return scales + + +def _get_bias_color(base, max_dist=30): + """Get different colors for each masks. + + Get different colors for each masks by adding a bias + color to the base category color. + Args: + base (ndarray): The base category color with the shape + of (3, ). + max_dist (int): The max distance of bias. Default: 30. + + Returns: + ndarray: The new color for a mask with the shape of (3, ). + """ + new_color = base + np.random.randint( + low=-max_dist, high=max_dist + 1, size=3) + return np.clip(new_color, 0, 255, new_color) + + +def draw_bboxes(ax, bboxes, color='g', alpha=0.8, thickness=2): + """Draw bounding boxes on the axes. + + Args: + ax (matplotlib.Axes): The input axes. + bboxes (ndarray): The input bounding boxes with the shape + of (n, 4). + color (list[tuple] | matplotlib.color): the colors for each + bounding boxes. + alpha (float): Transparency of bounding boxes. Default: 0.8. + thickness (int): Thickness of lines. Default: 2. + + Returns: + matplotlib.Axes: The result axes. + """ + polygons = [] + for i, bbox in enumerate(bboxes): + bbox_int = bbox.astype(np.int32) + poly = [[bbox_int[0], bbox_int[1]], [bbox_int[0], bbox_int[3]], + [bbox_int[2], bbox_int[3]], [bbox_int[2], bbox_int[1]]] + np_poly = np.array(poly).reshape((4, 2)) + polygons.append(Polygon(np_poly)) + p = PatchCollection( + polygons, + facecolor='none', + edgecolors=color, + linewidths=thickness, + alpha=alpha) + ax.add_collection(p) + + return ax + + +def draw_labels(ax, + labels, + positions, + scores=None, + class_names=None, + color='w', + font_size=8, + scales=None, + horizontal_alignment='left'): + """Draw labels on the axes. + + Args: + ax (matplotlib.Axes): The input axes. + labels (ndarray): The labels with the shape of (n, ). + positions (ndarray): The positions to draw each labels. + scores (ndarray): The scores for each labels. + class_names (list[str]): The class names. + color (list[tuple] | matplotlib.color): The colors for labels. + font_size (int): Font size of texts. Default: 8. + scales (list[float]): Scales of texts. Default: None. + horizontal_alignment (str): The horizontal alignment method of + texts. Default: 'left'. + + Returns: + matplotlib.Axes: The result axes. 
+ """ + for i, (pos, label) in enumerate(zip(positions, labels)): + label_text = class_names[ + label] if class_names is not None else f'class {label}' + if scores is not None: + label_text += f'|{scores[i]:.02f}' + text_color = color[i] if isinstance(color, list) else color + + font_size_mask = font_size if scales is None else font_size * scales[i] + ax.text( + pos[0], + pos[1], + f'{label_text}', + bbox={ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }, + color=text_color, + fontsize=font_size_mask, + verticalalignment='top', + horizontalalignment=horizontal_alignment) + + return ax + + +def draw_masks(ax, img, masks, color=None, with_edge=True, alpha=0.8): + """Draw masks on the image and their edges on the axes. + + Args: + ax (matplotlib.Axes): The input axes. + img (ndarray): The image with the shape of (3, h, w). + masks (ndarray): The masks with the shape of (n, h, w). + color (ndarray): The colors for each masks with the shape + of (n, 3). + with_edge (bool): Whether to draw edges. Default: True. + alpha (float): Transparency of bounding boxes. Default: 0.8. + + Returns: + matplotlib.Axes: The result axes. + ndarray: The result image. + """ + taken_colors = set([0, 0, 0]) + if color is None: + random_colors = np.random.randint(0, 255, (masks.size(0), 3)) + color = [tuple(c) for c in random_colors] + color = np.array(color, dtype=np.uint8) + polygons = [] + for i, mask in enumerate(masks): + if with_edge: + contours, _ = bitmap_to_polygon(mask) + polygons += [Polygon(c) for c in contours] + + color_mask = color[i] + while tuple(color_mask) in taken_colors: + color_mask = _get_bias_color(color_mask) + taken_colors.add(tuple(color_mask)) + + mask = mask.astype(bool) + img[mask] = img[mask] * (1 - alpha) + color_mask * alpha + + p = PatchCollection( + polygons, facecolor='none', edgecolors='w', linewidths=1, alpha=0.8) + ax.add_collection(p) + + return ax, img + + def imshow_det_bboxes(img, - bboxes, - labels, + bboxes=None, + labels=None, segms=None, class_names=None, score_thr=0, @@ -37,7 +213,7 @@ def imshow_det_bboxes(img, text_color='green', mask_color=None, thickness=2, - font_size=13, + font_size=8, win_name='', show=True, wait_time=0, @@ -73,18 +249,23 @@ def imshow_det_bboxes(img, Returns: ndarray: The image with bboxes drawn on it. """ - assert bboxes.ndim == 2, \ + assert bboxes is None or bboxes.ndim == 2, \ f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.' assert labels.ndim == 1, \ f' labels ndim should be 1, but its ndim is {labels.ndim}.' - assert bboxes.shape[0] == labels.shape[0], \ - 'bboxes.shape[0] and labels.shape[0] should have the same length.' - assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5, \ + assert bboxes is None or bboxes.shape[1] == 4 or bboxes.shape[1] == 5, \ f' bboxes.shape[1] should be 4 or 5, but its {bboxes.shape[1]}.' + assert bboxes is None or bboxes.shape[0] <= labels.shape[0], \ + 'labels.shape[0] should not be less than bboxes.shape[0].' + assert segms is None or segms.shape[0] == labels.shape[0], \ + 'segms.shape[0] and labels.shape[0] should have the same length.' + assert segms is not None or bboxes is not None, \ + 'segms and bboxes should not be None at the same time.' + img = mmcv.imread(img).astype(np.uint8) if score_thr > 0: - assert bboxes.shape[1] == 5 + assert bboxes is not None and bboxes.shape[1] == 5 scores = bboxes[:, -1] inds = scores > score_thr bboxes = bboxes[inds, :] @@ -92,12 +273,6 @@ def imshow_det_bboxes(img, if segms is not None: segms = segms[inds, ...] 
- max_label = int(max(labels)) if labels.shape[0] > 0 else -1 - bbox_color = palette_val(get_palette(bbox_color, max_label + 1)) - text_color = palette_val(get_palette(text_color, max_label + 1)) - mask_color = get_palette(mask_color, max_label + 1) - mask_color = np.array(mask_color, dtype=np.uint8) - img = mmcv.bgr2rgb(img) width, height = img.shape[1], img.shape[0] img = np.ascontiguousarray(img) @@ -115,44 +290,64 @@ def imshow_det_bboxes(img, ax = plt.gca() ax.axis('off') - polygons = [] - color = [] - for i, (bbox, label) in enumerate(zip(bboxes, labels)): - bbox_int = bbox.astype(np.int32) - poly = [[bbox_int[0], bbox_int[1]], [bbox_int[0], bbox_int[3]], - [bbox_int[2], bbox_int[3]], [bbox_int[2], bbox_int[1]]] - np_poly = np.array(poly).reshape((4, 2)) - polygons.append(Polygon(np_poly)) - color.append(bbox_color[label]) - label_text = class_names[ - label] if class_names is not None else f'class {label}' - if len(bbox) > 4: - label_text += f'|{bbox[-1]:.02f}' - ax.text( - bbox_int[0], - bbox_int[1], - f'{label_text}', - bbox={ - 'facecolor': 'black', - 'alpha': 0.8, - 'pad': 0.7, - 'edgecolor': 'none' - }, - color=text_color[label], - fontsize=font_size, - verticalalignment='top', - horizontalalignment='left') - if segms is not None: - color_mask = mask_color[labels[i]] - mask = segms[i].astype(bool) - img[mask] = img[mask] * 0.5 + color_mask * 0.5 + max_label = int(max(labels) if len(labels) > 0 else 0) + text_palette = palette_val(get_palette(text_color, max_label + 1)) + text_colors = [text_palette[label] for label in labels] + + num_bboxes = 0 + if bboxes is not None: + num_bboxes = bboxes.shape[0] + bbox_palette = palette_val(get_palette(bbox_color, max_label + 1)) + colors = [bbox_palette[label] for label in labels[:num_bboxes]] + draw_bboxes(ax, bboxes, colors, alpha=0.8, thickness=thickness) + + horizontal_alignment = 'left' + positions = bboxes[:, :2].astype(np.int32) + thickness + areas = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]) + scales = _get_adaptive_scales(areas) + scores = bboxes[:, 4] if bboxes.shape[1] == 5 else None + draw_labels( + ax, + labels[:num_bboxes], + positions, + scores=scores, + class_names=class_names, + color=text_colors, + font_size=font_size, + scales=scales, + horizontal_alignment=horizontal_alignment) + + if segms is not None: + mask_palette = get_palette(mask_color, max_label + 1) + colors = [mask_palette[label] for label in labels] + colors = np.array(colors, dtype=np.uint8) + draw_masks(ax, img, segms, colors, with_edge=True) + + if num_bboxes < segms.shape[0]: + segms = segms[num_bboxes:] + horizontal_alignment = 'center' + areas = [] + positions = [] + for mask in segms: + _, _, stats, centroids = cv2.connectedComponentsWithStats( + mask.astype(np.uint8), connectivity=8) + largest_id = np.argmax(stats[1:, -1]) + 1 + positions.append(centroids[largest_id]) + areas.append(stats[largest_id, -1]) + areas = np.stack(areas, axis=0) + scales = _get_adaptive_scales(areas) + draw_labels( + ax, + labels[num_bboxes:], + positions, + class_names=class_names, + color=text_colors, + font_size=font_size, + scales=scales, + horizontal_alignment=horizontal_alignment) plt.imshow(img) - p = PatchCollection( - polygons, facecolor='none', edgecolors=color, linewidths=thickness) - ax.add_collection(p) - stream, _ = canvas.print_to_buffer() buffer = np.frombuffer(stream, dtype='uint8') img_rgba = buffer.reshape(height, width, 4) @@ -183,12 +378,12 @@ def imshow_gt_det_bboxes(img, result, class_names=None, score_thr=0, - gt_bbox_color=(255, 
102, 61), - gt_text_color=(255, 102, 61), - gt_mask_color=(255, 102, 61), - det_bbox_color=(72, 101, 241), - det_text_color=(72, 101, 241), - det_mask_color=(72, 101, 241), + gt_bbox_color=(61, 102, 255), + gt_text_color=(200, 200, 200), + gt_mask_color=(61, 102, 255), + det_bbox_color=(241, 101, 72), + det_text_color=(200, 200, 200), + det_mask_color=(241, 101, 72), thickness=2, font_size=13, win_name='', @@ -207,22 +402,22 @@ def imshow_gt_det_bboxes(img, score_thr (float): Minimum score of bboxes to be shown. Default: 0. gt_bbox_color (list[tuple] | tuple | str | None): Colors of bbox lines. If a single color is given, it will be applied to all classes. - The tuple of color should be in RGB order. Default: (255, 102, 61). + The tuple of color should be in RGB order. Default: (61, 102, 255). gt_text_color (list[tuple] | tuple | str | None): Colors of texts. If a single color is given, it will be applied to all classes. - The tuple of color should be in RGB order. Default: (255, 102, 61). + The tuple of color should be in RGB order. Default: (200, 200, 200). gt_mask_color (list[tuple] | tuple | str | None, optional): Colors of masks. If a single color is given, it will be applied to all classes. - The tuple of color should be in RGB order. Default: (255, 102, 61). + The tuple of color should be in RGB order. Default: (61, 102, 255). det_bbox_color (list[tuple] | tuple | str | None):Colors of bbox lines. If a single color is given, it will be applied to all classes. - The tuple of color should be in RGB order. Default: (72, 101, 241). + The tuple of color should be in RGB order. Default: (241, 101, 72). det_text_color (list[tuple] | tuple | str | None):Colors of texts. If a single color is given, it will be applied to all classes. - The tuple of color should be in RGB order. Default: (72, 101, 241). + The tuple of color should be in RGB order. Default: (200, 200, 200). det_mask_color (list[tuple] | tuple | str | None, optional): Color of masks. If a single color is given, it will be applied to all classes. - The tuple of color should be in RGB order. Default: (72, 101, 241). + The tuple of color should be in RGB order. Default: (241, 101, 72). thickness (int): Thickness of lines. Default: 2. font_size (int): Font size of texts. Default: 13. win_name (str): The window name. Default: ''. 
@@ -236,20 +431,37 @@ def imshow_gt_det_bboxes(img, """ assert 'gt_bboxes' in annotation assert 'gt_labels' in annotation - assert isinstance( - result, - (tuple, list)), f'Expected tuple or list, but get {type(result)}' + assert isinstance(result, (tuple, list, dict)), 'Expected ' \ + f'tuple or list or dict, but get {type(result)}' + gt_bboxes = annotation['gt_bboxes'] + gt_labels = annotation['gt_labels'] gt_masks = annotation.get('gt_masks', None) if gt_masks is not None: gt_masks = mask2ndarray(gt_masks) + gt_seg = annotation.get('gt_semantic_seg', None) + if gt_seg is not None: + pad_value = 255 # the padding value of gt_seg + sem_labels = np.unique(gt_seg) + all_labels = np.concatenate((gt_labels, sem_labels), axis=0) + all_labels, counts = np.unique(all_labels, return_counts=True) + stuff_labels = all_labels[np.logical_and(counts < 2, + all_labels != pad_value)] + stuff_masks = gt_seg[None] == stuff_labels[:, None, None] + gt_labels = np.concatenate((gt_labels, stuff_labels), axis=0) + gt_masks = np.concatenate((gt_masks, stuff_masks.astype(np.uint8)), + axis=0) + # If you need to show the bounding boxes, + # please comment the following line + # gt_bboxes = None + img = mmcv.imread(img) img = imshow_det_bboxes( img, - annotation['gt_bboxes'], - annotation['gt_labels'], + gt_bboxes, + gt_labels, gt_masks, class_names=class_names, bbox_color=gt_bbox_color, @@ -260,25 +472,38 @@ def imshow_gt_det_bboxes(img, win_name=win_name, show=False) - if isinstance(result, tuple): - bbox_result, segm_result = result - if isinstance(segm_result, tuple): - segm_result = segm_result[0] # ms rcnn + if not isinstance(result, dict): + if isinstance(result, tuple): + bbox_result, segm_result = result + if isinstance(segm_result, tuple): + segm_result = segm_result[0] # ms rcnn + else: + bbox_result, segm_result = result, None + + bboxes = np.vstack(bbox_result) + labels = [ + np.full(bbox.shape[0], i, dtype=np.int32) + for i, bbox in enumerate(bbox_result) + ] + labels = np.concatenate(labels) + + segms = None + if segm_result is not None and len(labels) > 0: # non empty + segms = mmcv.concat_list(segm_result) + segms = mask_util.decode(segms) + segms = segms.transpose(2, 0, 1) else: - bbox_result, segm_result = result, None - - bboxes = np.vstack(bbox_result) - labels = [ - np.full(bbox.shape[0], i, dtype=np.int32) - for i, bbox in enumerate(bbox_result) - ] - labels = np.concatenate(labels) - - segms = None - if segm_result is not None and len(labels) > 0: # non empty - segms = mmcv.concat_list(segm_result) - segms = mask_util.decode(segms) - segms = segms.transpose(2, 0, 1) + assert class_names is not None, 'We need to know the number ' \ + 'of classes.' + VOID = len(class_names) + bboxes = None + pan_results = result['pan_results'] + # keep objects ahead + ids = np.unique(pan_results)[::-1] + legal_indices = ids != VOID + ids = ids[legal_indices] + labels = np.array([id % INSTANCE_OFFSET for id in ids], dtype=np.int64) + segms = (pan_results[None] == ids[:, None, None]) img = imshow_det_bboxes( img, diff --git a/mmdet/core/visualization/palette.py b/mmdet/core/visualization/palette.py index 282263c267a..11692cdd086 100644 --- a/mmdet/core/visualization/palette.py +++ b/mmdet/core/visualization/palette.py @@ -2,8 +2,6 @@ import mmcv import numpy as np -import mmdet - def palette_val(palette): """Convert palette to matplotlib palette. 
@@ -21,36 +19,45 @@ def palette_val(palette): return new_palette -def get_palette(palette, num_classes=None): +def get_palette(palette, num_classes): """Get palette from various inputs. Args: - palette (list[tuple]/str/tuple/:obj:`Color`): palette inputs + palette (list[tuple] | str | tuple | :obj:`Color`): palette inputs. + num_classes (int): the number of classes. Returns: list[tuple[int]]: A list of color tuples. """ + assert isinstance(num_classes, int) + if isinstance(palette, list): - return palette + dataset_palette = palette elif isinstance(palette, tuple): - assert isinstance(num_classes, int) - return [palette] * num_classes - elif palette == 'coco': - return mmdet.datasets.CocoDataset.PALETTE - elif palette == 'voc': - return mmdet.datasets.VOCDataset.PALETTE - elif palette == 'citys': - return mmdet.datasets.CityscapesDataset.PALETTE + dataset_palette = [palette] * num_classes elif palette == 'random' or palette is None: - assert isinstance(num_classes, int) state = np.random.get_state() # random color np.random.seed(42) palette = np.random.randint(0, 256, size=(num_classes, 3)) np.random.set_state(state) - return [tuple(c) for c in palette] + dataset_palette = [tuple(c) for c in palette] + elif palette == 'coco': + from mmdet.datasets import CocoDataset, CocoPanopticDataset + dataset_palette = CocoDataset.PALETTE + if len(dataset_palette) < num_classes: + dataset_palette = CocoPanopticDataset.PALETTE + elif palette == 'citys': + from mmdet.datasets import CityscapesDataset + dataset_palette = CityscapesDataset.PALETTE + elif palette == 'voc': + from mmdet.datasets import VOCDataset + dataset_palette = VOCDataset.PALETTE elif mmcv.is_str(palette): - assert isinstance(num_classes, int) - return [mmcv.color_val(palette)[::-1]] * num_classes + dataset_palette = [mmcv.color_val(palette)[::-1]] * num_classes else: raise TypeError(f'Invalid type for palette: {type(palette)}') + + assert len(dataset_palette) >= num_classes, \ + 'The length of palette should not be less than `num_classes`.' + return dataset_palette diff --git a/mmdet/datasets/coco_panoptic.py b/mmdet/datasets/coco_panoptic.py index e2b5c636409..cc00c13fcad 100644 --- a/mmdet/datasets/coco_panoptic.py +++ b/mmdet/datasets/coco_panoptic.py @@ -8,6 +8,7 @@ from mmcv.utils import print_log from terminaltables import AsciiTable +from mmdet.core import INSTANCE_OFFSET from .api_wrappers import COCO, pq_compute_multi_core from .builder import DATASETS from .coco import CocoDataset @@ -23,12 +24,6 @@ __all__ = ['CocoPanopticDataset'] -# A custom value to distinguish instance ID and category ID; need to -# be greater than the number of categories. -# For a pixel in the panoptic result map: -# pan_id = ins_id * INSTANCE_OFFSET + cat_id -INSTANCE_OFFSET = 1000 - class COCOPanoptic(COCO): """This wrapper is for loading the panoptic style annotation file. 
@@ -201,6 +196,43 @@ class CocoPanopticDataset(CocoDataset): 'rock-merged', 'wall-other-merged', 'rug-merged' ] + PALETTE = [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), + (106, 0, 228), (0, 60, 100), (0, 80, 100), (0, 0, 70), + (0, 0, 192), (250, 170, 30), (100, 170, 30), (220, 220, 0), + (175, 116, 175), (250, 0, 30), (165, 42, 42), (255, 77, 255), + (0, 226, 252), (182, 182, 255), (0, 82, 0), (120, 166, 157), + (110, 76, 0), (174, 57, 255), (199, 100, 0), (72, 0, 118), + (255, 179, 240), (0, 125, 92), (209, 0, 151), (188, 208, 182), + (0, 220, 176), (255, 99, 164), (92, 0, 73), (133, 129, 255), + (78, 180, 255), (0, 228, 0), (174, 255, 243), (45, 89, 255), + (134, 134, 103), (145, 148, 174), (255, 208, 186), + (197, 226, 255), (171, 134, 1), (109, 63, 54), (207, 138, 255), + (151, 0, 95), (9, 80, 61), (84, 105, 51), (74, 65, 105), + (166, 196, 102), (208, 195, 210), (255, 109, 65), (0, 143, 149), + (179, 0, 194), (209, 99, 106), (5, 121, 0), (227, 255, 205), + (147, 186, 208), (153, 69, 1), (3, 95, 161), (163, 255, 0), + (119, 0, 170), (0, 182, 199), (0, 165, 120), (183, 130, 88), + (95, 32, 0), (130, 114, 135), (110, 129, 133), (166, 74, 118), + (219, 142, 185), (79, 210, 114), (178, 90, 62), (65, 70, 15), + (127, 167, 115), (59, 105, 106), (142, 108, 45), (196, 172, 0), + (95, 54, 80), (128, 76, 255), (201, 57, 1), (246, 0, 122), + (191, 162, 208), (255, 255, 128), (147, 211, 203), + (150, 100, 100), (168, 171, 172), (146, 112, 198), + (210, 170, 100), (92, 136, 89), (218, 88, 184), (241, 129, 0), + (217, 17, 255), (124, 74, 181), (70, 70, 70), (255, 228, 255), + (154, 208, 0), (193, 0, 92), (76, 91, 113), (255, 180, 195), + (106, 154, 176), + (230, 150, 140), (60, 143, 255), (128, 64, 128), (92, 82, 55), + (254, 212, 124), (73, 77, 174), (255, 160, 98), (255, 255, 255), + (104, 84, 109), (169, 164, 131), (225, 199, 255), (137, 54, 74), + (135, 158, 223), (7, 246, 231), (107, 255, 200), (58, 41, 149), + (183, 121, 142), (255, 73, 97), (107, 142, 35), (190, 153, 153), + (146, 139, 141), + (70, 130, 180), (134, 199, 156), (209, 226, 140), (96, 36, 108), + (96, 96, 96), (64, 170, 64), (152, 251, 152), (208, 229, 228), + (206, 186, 171), (152, 161, 64), (116, 112, 0), (0, 114, 143), + (102, 102, 156), (250, 141, 255)] + def load_annotations(self, ann_file): """Load annotation from COCO Panoptic style annotation file. 
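
The dataset `PALETTE` above plugs into the reworked `get_palette`/`palette_val` helpers from earlier in this patch, which resolve a string, a single tuple, or a full list into one color per class. A small usage sketch, assuming an mmdet build with this patch applied (illustrative, not part of the patch):

```python
# Resolve different palette inputs into per-class colors.
from mmdet.core.visualization.palette import get_palette, palette_val

coco_colors = get_palette('coco', 133)      # falls back to the panoptic PALETTE above
mpl_colors = palette_val(coco_colors)       # normalized floats for matplotlib
fixed = get_palette((255, 0, 0), 3)         # one color repeated for 3 classes
rand_colors = get_palette('random', 5)      # deterministic "random" colors (seed 42)
```
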
diff --git a/mmdet/datasets/pipelines/__init__.py b/mmdet/datasets/pipelines/__init__.py index 88c30dbb3bb..c457ee1882f 100644 --- a/mmdet/datasets/pipelines/__init__.py +++ b/mmdet/datasets/pipelines/__init__.py @@ -7,7 +7,8 @@ ToDataContainer, ToTensor, Transpose, to_tensor) from .instaboost import InstaBoost from .loading import (LoadAnnotations, LoadImageFromFile, LoadImageFromWebcam, - LoadMultiChannelImageFromFiles, LoadProposals) + LoadMultiChannelImageFromFiles, LoadPanopticAnnotations, + LoadProposals) from .test_time_aug import MultiScaleFlipAug from .transforms import (Albu, CutOut, Expand, MinIoURandomCrop, MixUp, Mosaic, Normalize, Pad, PhotoMetricDistortion, RandomAffine, @@ -17,7 +18,7 @@ __all__ = [ 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', 'Transpose', 'Collect', 'DefaultFormatBundle', 'LoadAnnotations', - 'LoadImageFromFile', 'LoadImageFromWebcam', + 'LoadImageFromFile', 'LoadImageFromWebcam', 'LoadPanopticAnnotations', 'LoadMultiChannelImageFromFiles', 'LoadProposals', 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 'Normalize', 'SegRescale', 'MinIoURandomCrop', 'Expand', 'PhotoMetricDistortion', 'Albu', diff --git a/mmdet/datasets/utils.py b/mmdet/datasets/utils.py index cda13a6b289..26e922d2ba8 100644 --- a/mmdet/datasets/utils.py +++ b/mmdet/datasets/utils.py @@ -6,7 +6,8 @@ from mmcv.runner.hooks import HOOKS, Hook from mmdet.datasets.builder import PIPELINES -from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile +from mmdet.datasets.pipelines import (LoadAnnotations, LoadImageFromFile, + LoadPanopticAnnotations) from mmdet.models.dense_heads import GARPNHead, RPNHead from mmdet.models.roi_heads.mask_heads import FusedSemanticHead @@ -104,7 +105,8 @@ def get_loading_pipeline(pipeline): obj_cls = PIPELINES.get(cfg['type']) # TODO:use more elegant way to distinguish loading modules if obj_cls is not None and obj_cls in (LoadImageFromFile, - LoadAnnotations): + LoadAnnotations, + LoadPanopticAnnotations): loading_pipeline_cfg.append(cfg) assert len(loading_pipeline_cfg) == 2, \ 'The data pipeline in your config file must include ' \ diff --git a/mmdet/models/detectors/panoptic_two_stage_segmentor.py b/mmdet/models/detectors/panoptic_two_stage_segmentor.py index 9b93e31e17f..5ad49bac705 100644 --- a/mmdet/models/detectors/panoptic_two_stage_segmentor.py +++ b/mmdet/models/detectors/panoptic_two_stage_segmentor.py @@ -1,7 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np import torch -from mmdet.core import bbox2roi, multiclass_nms +from mmdet.core import INSTANCE_OFFSET, bbox2roi, multiclass_nms +from mmdet.core.visualization import imshow_det_bboxes from ..builder import DETECTORS, build_head from ..roi_heads.mask_heads.fcn_mask_head import _do_paste_mask from .two_stage import TwoStageDetector @@ -201,3 +204,76 @@ def simple_test(self, img, img_metas, proposals=None, rescale=False): result = dict(pan_results=pan_results) results.append(result) return results + + def show_result(self, + img, + result, + score_thr=0.3, + bbox_color=(72, 101, 241), + text_color=(72, 101, 241), + mask_color=None, + thickness=2, + font_size=13, + win_name='', + show=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (dict): The results. + + score_thr (float, optional): Minimum score of bboxes to be shown. + Default: 0.3. + bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines. 
+ The tuple of color should be in BGR order. Default: 'green'. + text_color (str or tuple(int) or :obj:`Color`):Color of texts. + The tuple of color should be in BGR order. Default: 'green'. + mask_color (None or str or tuple(int) or :obj:`Color`): + Color of masks. The tuple of color should be in BGR order. + Default: None. + thickness (int): Thickness of lines. Default: 2. + font_size (int): Font size of texts. Default: 13. + win_name (str): The window name. Default: ''. + wait_time (float): Value of waitKey param. + Default: 0. + show (bool): Whether to show the image. + Default: False. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + img (Tensor): Only if not `show` or `out_file`. + """ + img = mmcv.imread(img) + img = img.copy() + pan_results = result['pan_results'] + # keep objects ahead + ids = np.unique(pan_results)[::-1] + legal_indices = ids != self.num_classes # for VOID label + ids = ids[legal_indices] + labels = np.array([id % INSTANCE_OFFSET for id in ids], dtype=np.int64) + segms = (pan_results[None] == ids[:, None, None]) + + # if out_file specified, do not show image in window + if out_file is not None: + show = False + # draw bounding boxes + img = imshow_det_bboxes( + img, + segms=segms, + labels=labels, + class_names=self.CLASSES, + bbox_color=bbox_color, + text_color=text_color, + mask_color=mask_color, + thickness=thickness, + font_size=font_size, + win_name=win_name, + show=show, + wait_time=wait_time, + out_file=out_file) + + if not (show or out_file): + return img diff --git a/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py b/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py index fa6257802ae..06c1de2b901 100644 --- a/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py +++ b/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import torch -from mmdet.datasets.coco_panoptic import INSTANCE_OFFSET +from mmdet.core.evaluation.panoptic_utils import INSTANCE_OFFSET from mmdet.models.builder import HEADS from .base_panoptic_fusion_head import BasePanopticFusionHead diff --git a/tests/test_utils/test_visualization.py b/tests/test_utils/test_visualization.py index 64e36e157f1..1dbdb2b135d 100644 --- a/tests/test_utils/test_visualization.py +++ b/tests/test_utils/test_visualization.py @@ -9,7 +9,8 @@ import torch from mmdet.core import visualization as vis -from mmdet.datasets import CityscapesDataset, CocoDataset, VOCDataset +from mmdet.datasets import (CityscapesDataset, CocoDataset, + CocoPanopticDataset, VOCDataset) def test_color(): @@ -133,42 +134,37 @@ def test_palette(): # test list palette = [(1, 0, 0), (0, 1, 0), (0, 0, 1)] - palette_ = vis.get_palette(palette) + palette_ = vis.get_palette(palette, 3) for color, color_ in zip(palette, palette_): assert color == color_ # test tuple - with pytest.raises(AssertionError): - vis.get_palette((1, 2, 3)) palette = vis.get_palette((1, 2, 3), 3) assert len(palette) == 3 for color in palette: assert color == (1, 2, 3) # test color str - with pytest.raises(AssertionError): - vis.get_palette('red') palette = vis.get_palette('red', 3) assert len(palette) == 3 for color in palette: assert color == (255, 0, 0) # test dataset str - palette = vis.get_palette('coco') - for color, color_ in zip(palette, CocoDataset.PALETTE): - assert color == color_ - palette = vis.get_palette('voc') - for color, color_ in zip(palette, VOCDataset.PALETTE): - assert color == color_ - palette = vis.get_palette('citys') - for color, color_ in zip(palette, CityscapesDataset.PALETTE): - assert color == color_ + palette = vis.get_palette('coco', len(CocoDataset.CLASSES)) + assert len(palette) == len(CocoDataset.CLASSES) + assert palette[0] == (220, 20, 60) + palette = vis.get_palette('coco', len(CocoPanopticDataset.CLASSES)) + assert len(palette) == len(CocoPanopticDataset.CLASSES) + assert palette[-1] == (250, 141, 255) + palette = vis.get_palette('voc', len(VOCDataset.CLASSES)) + assert len(palette) == len(VOCDataset.CLASSES) + assert palette[0] == (106, 0, 228) + palette = vis.get_palette('citys', len(CityscapesDataset.CLASSES)) + assert len(palette) == len(CityscapesDataset.CLASSES) + assert palette[0] == (220, 20, 60) # test random - with pytest.raises(AssertionError): - vis.get_palette('random') - with pytest.raises(AssertionError): - vis.get_palette(None) palette1 = vis.get_palette('random', 3) palette2 = vis.get_palette(None, 3) for color1, color2 in zip(palette1, palette2): diff --git a/tools/analysis_tools/analyze_results.py b/tools/analysis_tools/analyze_results.py index 8265265575c..cb79587a65c 100644 --- a/tools/analysis_tools/analyze_results.py +++ b/tools/analysis_tools/analyze_results.py @@ -82,6 +82,12 @@ def _save_image_gts_results(self, dataset, results, mAPs, out_dir=None): data_info, results[index], dataset.CLASSES, + gt_bbox_color=dataset.PALETTE, + gt_text_color=(200, 200, 200), + gt_mask_color=dataset.PALETTE, + det_bbox_color=dataset.PALETTE, + det_text_color=(200, 200, 200), + det_mask_color=dataset.PALETTE, show=self.show, score_thr=self.score_thr, wait_time=self.wait_time, diff --git a/tools/misc/browse_dataset.py b/tools/misc/browse_dataset.py index aebe9cf9bd8..3e70c8b8741 100644 --- a/tools/misc/browse_dataset.py +++ b/tools/misc/browse_dataset.py @@ -5,6 +5,7 @@ from pathlib import Path import mmcv +import numpy as np from mmcv import Config, DictAction from 
mmdet.core.utils import mask2ndarray @@ -73,6 +74,10 @@ def main(): args = parse_args() cfg = retrieve_data_cfg(args.config, args.skip_type, args.cfg_options) + if 'gt_semantic_seg' in cfg.train_pipeline[-1]['keys']: + cfg.data.train.pipeline = [ + p for p in cfg.data.train.pipeline if p['type'] != 'SegRescale' + ] dataset = build_dataset(cfg.data.train) progress_bar = mmcv.ProgressBar(len(dataset)) @@ -82,21 +87,40 @@ def main(): Path(item['filename']).name ) if args.output_dir is not None else None + gt_bboxes = item['gt_bboxes'] + gt_labels = item['gt_labels'] gt_masks = item.get('gt_masks', None) if gt_masks is not None: gt_masks = mask2ndarray(gt_masks) + gt_seg = item.get('gt_semantic_seg', None) + if gt_seg is not None: + pad_value = 255 # the padding value of gt_seg + sem_labels = np.unique(gt_seg) + all_labels = np.concatenate((gt_labels, sem_labels), axis=0) + all_labels, counts = np.unique(all_labels, return_counts=True) + stuff_labels = all_labels[np.logical_and(counts < 2, + all_labels != pad_value)] + stuff_masks = gt_seg[None] == stuff_labels[:, None, None] + gt_labels = np.concatenate((gt_labels, stuff_labels), axis=0) + gt_masks = np.concatenate((gt_masks, stuff_masks.astype(np.uint8)), + axis=0) + # If you need to show the bounding boxes, + # please comment the following line + gt_bboxes = None + imshow_det_bboxes( item['img'], - item['gt_bboxes'], - item['gt_labels'], + gt_bboxes, + gt_labels, gt_masks, class_names=dataset.CLASSES, show=not args.not_show, wait_time=args.show_interval, out_file=filename, - bbox_color=(255, 102, 61), - text_color=(255, 102, 61)) + bbox_color=dataset.PALETTE, + text_color=(200, 200, 200), + mask_color=dataset.PALETTE) progress_bar.update() From 31bfc07589410c9c0b61ce9aa7b658caf9b73606 Mon Sep 17 00:00:00 2001 From: Daniel van Sabben Alsina <13849741+dvansa@users.noreply.github.com> Date: Wed, 16 Feb 2022 13:28:46 +0100 Subject: [PATCH 10/27] [Fix] confusion_matrix.py analysis tool handling NaNs (#7147) --- tools/analysis_tools/confusion_matrix.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/analysis_tools/confusion_matrix.py b/tools/analysis_tools/confusion_matrix.py index 71e4eb0d9fe..a2531ba81f9 100644 --- a/tools/analysis_tools/confusion_matrix.py +++ b/tools/analysis_tools/confusion_matrix.py @@ -207,7 +207,10 @@ def plot_confusion_matrix(confusion_matrix, ax.text( j, i, - '{}%'.format(int(confusion_matrix[i, j])), + '{}%'.format( + int(confusion_matrix[ + i, + j]) if not np.isnan(confusion_matrix[i, j]) else -1), ha='center', va='center', color='w', From 78b1e71fbf714c21155f7cbbe99e3894de3d9f41 Mon Sep 17 00:00:00 2001 From: VIKASH RANJAN Date: Wed, 16 Feb 2022 16:37:40 +0100 Subject: [PATCH 11/27] [Fix] Added missing property in SABLHead (#7091) * Added missing property in SABLHead * set pre-commit-hooks to v0.1.0 * set maskdownlint to v0.11.0 * pre-commit-hooks Co-authored-by: Cedric Luo --- mmdet/models/roi_heads/bbox_heads/sabl_head.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mmdet/models/roi_heads/bbox_heads/sabl_head.py b/mmdet/models/roi_heads/bbox_heads/sabl_head.py index f3d5e83de84..0ce986b9a29 100644 --- a/mmdet/models/roi_heads/bbox_heads/sabl_head.py +++ b/mmdet/models/roi_heads/bbox_heads/sabl_head.py @@ -206,6 +206,18 @@ def __init__(self, ]) ] + @property + def custom_cls_channels(self): + return getattr(self.loss_cls, 'custom_cls_channels', False) + + @property + def custom_activation(self): + return getattr(self.loss_cls, 'custom_activation', False) + + @property + 
def custom_accuracy(self): + return getattr(self.loss_cls, 'custom_accuracy', False) + def _add_fc_branch(self, num_branch_fcs, in_channels, roi_feat_size, fc_out_channels): in_channels = in_channels * roi_feat_size * roi_feat_size From f814beb7ba2bb659e4e26e7b1bb5714d79ef61fa Mon Sep 17 00:00:00 2001 From: Range King Date: Tue, 22 Feb 2022 09:22:46 +0800 Subject: [PATCH 12/27] Update config.md (#7215) * [Fix] Fix wrong img name in onnx2tensorrt.py (#7157) * [Docs] fix albumentations installed way (#7143) * Update config.md fix some typos Co-authored-by: Jamie Co-authored-by: BigDong --- docs/en/get_started.md | 11 +++++++---- docs/zh_cn/get_started.md | 4 ++-- docs/zh_cn/tutorials/config.md | 14 +++++++------- requirements/albu.txt | 1 + tools/deployment/onnx2tensorrt.py | 2 +- 5 files changed, 18 insertions(+), 14 deletions(-) create mode 100644 requirements/albu.txt diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 9fbee42127d..28bbefaad94 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -141,7 +141,7 @@ Or you can still install MMDetection manually: # for LVIS dataset pip install git+https://github.com/lvis-dataset/lvis-api.git # for albumentations - pip install albumentations>=0.3.2 --no-binary imgaug,albumentations + pip install -r requirements/albu.txt ``` **Note:** @@ -155,9 +155,12 @@ you can install it before installing MMCV. c. Some dependencies are optional. Simply running `pip install -v -e .` will only install the minimum runtime requirements. To use optional dependencies like `albumentations` and `imagecorruptions` either install them manually with `pip install -r requirements/optional.txt` or specify desired extras when calling `pip` (e.g. `pip install -v -e .[optional]`). Valid keys for the extras field are: `all`, `tests`, `build`, and `optional`. -d. If you would like to use `albumentations`, we suggest using -`pip install albumentations>=0.3.2 --no-binary imgaug,albumentations`. If you simply use -`pip install albumentations>=0.3.2`, it will install `opencv-python-headless` simultaneously (even though you have already installed `opencv-python`). We should not allow `opencv-python` and `opencv-python-headless` installed at the same time, because it might cause unexpected issues. Please refer to [official documentation](https://albumentations.ai/docs/getting_started/installation/#note-on-opencv-dependencies) for more details. +d. If you would like to use `albumentations`, we suggest using `pip install -r requirements/albu.txt` or +`pip install -U albumentations --no-binary qudida,albumentations`. If you simply use `pip install albumentations>=0.3.2`, +it will install `opencv-python-headless` simultaneously (even though you have already +installed `opencv-python`). We recommended checking the environment after installing `albumentation` to +ensure that `opencv-python` and `opencv-python-headless` are not installed at the same time, because it might cause unexpected issues if they both installed. Please refer +to [official documentation](https://albumentations.ai/docs/getting_started/installation/#note-on-opencv-dependencies) for more details. 
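The environment check recommended above can also be scripted; a small illustrative sketch (not part of this patch) that only inspects installed distribution names:

```python
# Illustrative check: verify that opencv-python and opencv-python-headless are
# not both installed after setting up albumentations (requires Python >= 3.8).
from importlib.metadata import distributions

names = {dist.metadata['Name'].lower()
         for dist in distributions() if dist.metadata['Name']}
if {'opencv-python', 'opencv-python-headless'} <= names:
    print('Both OpenCV wheels are installed; keep only one of them.')
else:
    print('OpenCV installation looks consistent.')
```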
### Install without GPU support diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index 8c74b0afbf9..b606a0ad4d8 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -149,7 +149,7 @@ MIM 能够自动地安装 OpenMMLab 的项目以及对应的依赖包。 # 安装 LVIS 数据集依赖 pip install git+https://github.com/lvis-dataset/lvis-api.git # 安装 albumentations 依赖 - pip install albumentations>=0.3.2 --no-binary imgaug,albumentations + pip install -r requirements/albu.txt ``` **注意:** @@ -160,7 +160,7 @@ MIM 能够自动地安装 OpenMMLab 的项目以及对应的依赖包。 (3) 一些安装依赖是可以选择的。例如只需要安装最低运行要求的版本,则可以使用 `pip install -v -e .` 命令。如果希望使用可选择的像 `albumentations` 和 `imagecorruptions` 这种依赖项,可以使用 `pip install -r requirements/optional.txt` 进行手动安装,或者在使用 `pip` 时指定所需的附加功能(例如 `pip install -v -e .[optional]`),支持附加功能的有效键值包括 `all`、`tests`、`build` 以及 `optional` 。 -(4) 如果希望使用 `albumentations`,我们建议使用 `pip install albumentations>=0.3.2 --no-binary imgaug,albumentations` 进行安装。 如果简单地使用 `pip install albumentations>=0.3.2` 进行安装,则会同时安装 `opencv-python-headless`(即便已经安装了 `opencv-python` 也会再次安装)。我们不允许同时安装 `opencv-python` 和 `opencv-python-headless`,因为这样可能会导致一些问题。更多细节请参考[官方文档](https://albumentations.ai/docs/getting_started/installation/#note-on-opencv-dependencies)。 +(4) 如果希望使用 `albumentations`,我们建议使用 `pip install -r requirements/albu.txt` 或者 `pip install -U albumentations --no-binary qudida,albumentations` 进行安装。 如果简单地使用 `pip install albumentations>=0.3.2` 进行安装,则会同时安装 `opencv-python-headless`(即便已经安装了 `opencv-python` 也会再次安装)。我们建议在安装 `albumentations` 后检查环境,以确保没有同时安装 `opencv-python` 和 `opencv-python-headless`,因为同时安装可能会导致一些问题。更多细节请参考[官方文档](https://albumentations.ai/docs/getting_started/installation/#note-on-opencv-dependencies)。 ### 只在 CPU 安装 diff --git a/docs/zh_cn/tutorials/config.md b/docs/zh_cn/tutorials/config.md index 5ea664b4caf..19db31a5bdb 100644 --- a/docs/zh_cn/tutorials/config.md +++ b/docs/zh_cn/tutorials/config.md @@ -93,7 +93,7 @@ model = dict( requires_grad=True), # 是否训练归一化里的 gamma 和 beta。 norm_eval=True, # 是否冻结 BN 里的统计项。 style='pytorch', # 主干网络的风格,'pytorch' 意思是步长为2的层为 3x3 卷积, 'caffe' 意思是步长为2的层为 1x1 卷积。 - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), # 加载通过 ImageNet 与训练的模型 + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), # 加载通过 ImageNet 预训练的模型 neck=dict( type='FPN', # 检测器的 neck 是 FPN,我们同样支持 'NASFPN', 'PAFPN' 等,更多细节可以参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/fpn.py#L10。 in_channels=[256, 512, 1024, 2048], # 输入通道数,这与主干网络的输出通道一致 @@ -111,7 +111,7 @@ model = dict( bbox_coder=dict( # 在训练和测试期间对框进行编码和解码。 type='DeltaXYWHBBoxCoder', # 框编码器的类别,'DeltaXYWHBBoxCoder' 是最常用的,更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py#L9。 target_means=[0.0, 0.0, 0.0, 0.0], # 用于编码和解码框的目标均值 - target_stds=[1.0, 1.0, 1.0, 1.0]), # 用于编码和解码框的标准方差 + target_stds=[1.0, 1.0, 1.0, 1.0]), # 用于编码和解码框的标准差 loss_cls=dict( # 分类分支的损失函数配置 type='CrossEntropyLoss', # 分类分支的损失类型,我们也支持 FocalLoss 等。 use_sigmoid=True, # RPN通常进行二分类,所以通常使用sigmoid函数。 @@ -138,7 +138,7 @@ model = dict( bbox_coder=dict( # 第二阶段使用的框编码器。 type='DeltaXYWHBBoxCoder', # 框编码器的类别,大多数情况使用 'DeltaXYWHBBoxCoder'。 target_means=[0.0, 0.0, 0.0, 0.0], # 用于编码和解码框的均值 - target_stds=[0.1, 0.1, 0.2, 0.2]), # 编码和解码的标准方差。因为框更准确,所以值更小,常规设置时 [0.1, 0.1, 0.2, 0.2]。 + target_stds=[0.1, 0.1, 0.2, 0.2]), # 编码和解码的标准差。因为框更准确,所以值更小,常规设置时 [0.1, 0.1, 0.2, 0.2]。 reg_class_agnostic=False, # 回归是否与类别无关。 loss_cls=dict( # 分类分支的损失函数配置 type='CrossEntropyLoss', # 分类分支的损失类型,我们也支持 FocalLoss 等。 @@ -211,7 +211,7 @@ model = dict( 
mask_size=28, # mask 的大小 pos_weight=-1, # 训练期间正样本的权重。 debug=False)) # 是否设置调试模式。 - test_cfg = dict( # 用于测试 rnn 和 rnn 超参数的配置 + test_cfg = dict( # 用于测试 rpn 和 rcnn 超参数的配置 rpn=dict( # 测试阶段生成 proposals 的配置 nms_across_levels=False, # 是否对跨层的 box 做 NMS。仅适用于`GARPNHead`,naive rpn 不支持做 NMS cross levels。 nms_pre=1000, # NMS 前的 box 数 @@ -231,7 +231,7 @@ model = dict( mask_thr_binary=0.5)) # mask 预处的阈值 dataset_type = 'CocoDataset' # 数据集类型,这将被用来定义数据集。 data_root = 'data/coco/' # 数据的根路径。 -img_norm_cfg = dict( #图像归一化配置,用来归一化输入的图像。 +img_norm_cfg = dict( # 图像归一化配置,用来归一化输入的图像。 mean=[123.675, 116.28, 103.53], # 预训练里用于预训练主干网络模型的平均值。 std=[58.395, 57.12, 57.375], # 预训练里用于预训练主干网络模型的标准差。 to_rgb=True @@ -398,7 +398,7 @@ log_level = 'INFO' # 日志的级别。 load_from = None # 从一个给定路径里加载模型作为预训练模型,它并不会消耗训练时间。 resume_from = None # 从给定路径里恢复检查点(checkpoints),训练模式将从检查点保存的轮次开始恢复训练。 workflow = [('train', 1)] # runner 的工作流程,[('train', 1)] 表示只有一个工作流且工作流仅执行一次。根据 total_epochs 工作流训练 12个回合。 -work_dir = 'work_dir' # 用于保存当前实验的模型检查点和日志的目录文件地址。 +work_dir = 'work_dir' # 用于保存当前实验的模型检查点和日志的目录。 ``` ## 常问问题 (FAQ) @@ -466,7 +466,7 @@ model = dict( ### 使用配置文件里的中间变量 -配置文件里会使用一些中间变量,例如数据集里的 `train_pipeline/test_pipeline`。我们在定义新的 `train_pipeline/test_pipeline` 之后,需要将它们传递到 `data` 里。例如,我们想在训练或测试时,改变 Mask R-CNN 的多尺度策略 (multi scale strategy),`train_pipeline/test_pipeline` 是我们想要修改的中间变量。 +配置文件里会使用一些中间变量,例如数据集里的 `train_pipeline`/`test_pipeline`。我们在定义新的 `train_pipeline`/`test_pipeline` 之后,需要将它们传递到 `data` 里。例如,我们想在训练或测试时,改变 Mask R-CNN 的多尺度策略 (multi scale strategy),`train_pipeline`/`test_pipeline` 是我们想要修改的中间变量。 ```python _base_ = './mask_rcnn_r50_fpn_1x_coco.py' diff --git a/requirements/albu.txt b/requirements/albu.txt new file mode 100644 index 00000000000..f421fbbdc47 --- /dev/null +++ b/requirements/albu.txt @@ -0,0 +1 @@ +albumentations>=0.3.2 --no-binary qudida,albumentations diff --git a/tools/deployment/onnx2tensorrt.py b/tools/deployment/onnx2tensorrt.py index 84a9afebda4..e3e9b57d2b4 100644 --- a/tools/deployment/onnx2tensorrt.py +++ b/tools/deployment/onnx2tensorrt.py @@ -201,7 +201,7 @@ def parse_args(): parsed directly from config file and are deprecated and will be \ removed in future releases.') if not args.input_img: - args.input_img = osp.join(osp.dirname(__file__), '../demo/demo.jpg') + args.input_img = osp.join(osp.dirname(__file__), '../../demo/demo.jpg') cfg = Config.fromfile(args.config) From 0b205c6a7193f6d112a1a805fc8d466e9a0a2b06 Mon Sep 17 00:00:00 2001 From: Yosuke Shinya <42844407+shinya7y@users.noreply.github.com> Date: Tue, 22 Feb 2022 10:45:09 +0900 Subject: [PATCH 13/27] [Feature] Support DyHead (#6823) * add DyHead * move and update DYReLU * update * replace stack with sum to reduce memory * clean and update * update to align inference accuracy (incomplete) * fix pad * update to align training accuracy and pick #6867 * add README and metafile * update docs * resolve comments * revert picking 6867 * update README.md * update metafile.yml * resolve comments and update urls --- README.md | 1 + README_zh-CN.md | 1 + configs/dyhead/README.md | 46 +++++ .../atss_r50_caffe_fpn_dyhead_1x_coco.py | 112 +++++++++++ configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py | 65 +++++++ configs/dyhead/metafile.yml | 63 +++++++ mmdet/models/dense_heads/atss_head.py | 17 +- mmdet/models/necks/__init__.py | 3 +- mmdet/models/necks/dyhead.py | 174 ++++++++++++++++++ mmdet/models/utils/__init__.py | 4 +- mmdet/models/utils/se_layer.py | 69 +++++++ model-index.yml | 1 + tests/test_models/test_necks.py | 25 ++- tests/test_models/test_utils/test_se_layer.py | 32 
+++- 14 files changed, 604 insertions(+), 9 deletions(-) create mode 100644 configs/dyhead/README.md create mode 100644 configs/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py create mode 100644 configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py create mode 100644 configs/dyhead/metafile.yml create mode 100644 mmdet/models/necks/dyhead.py diff --git a/README.md b/README.md index 752b350d578..48b28118c6e 100644 --- a/README.md +++ b/README.md @@ -234,6 +234,7 @@ Results and models are available in the [model zoo](docs/en/model_zoo.md).
  • CARAFE (ICCV'2019)
  • FPG (ArXiv'2020)
  • GRoIE (ICPR'2020)
  • +
  • DyHead (CVPR'2021)
  • diff --git a/README_zh-CN.md b/README_zh-CN.md index f4c188aab91..441605a8ab3 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -233,6 +233,7 @@ MMDetection 是一个基于 PyTorch 的目标检测开源工具箱。它是 [Ope
  • CARAFE (ICCV'2019)
  • FPG (ArXiv'2020)
  • GRoIE (ICPR'2020)
  • +
  • DyHead (CVPR'2021)
  • diff --git a/configs/dyhead/README.md b/configs/dyhead/README.md new file mode 100644 index 00000000000..068a35b1189 --- /dev/null +++ b/configs/dyhead/README.md @@ -0,0 +1,46 @@ +# DyHead + +> [Dynamic Head: Unifying Object Detection Heads with Attentions](https://arxiv.org/abs/2106.08322) + + + +## Abstract + +The complex nature of combining localization and classification in object detection has resulted in the flourished development of methods. Previous works tried to improve the performance in various object detection heads but failed to present a unified view. In this paper, we present a novel dynamic head framework to unify object detection heads with attentions. By coherently combining multiple self-attention mechanisms between feature levels for scale-awareness, among spatial locations for spatial-awareness, and within output channels for task-awareness, the proposed approach significantly improves the representation ability of object detection heads without any computational overhead. Further experiments demonstrate that the effectiveness and efficiency of the proposed dynamic head on the COCO benchmark. With a standard ResNeXt-101-DCN backbone, we largely improve the performance over popular object detectors and achieve a new state-of-the-art at 54.0 AP. Furthermore, with latest transformer backbone and extra data, we can push current best COCO result to a new record at 60.6 AP. + +
    + +
+ +## Results and Models + +| Method | Backbone | Style | Setting | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:------:|:--------:|:-------:|:------------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| ATSS | R-50 | caffe | reproduction | 1x | 5.4 | 13.2 | 42.5 | [config](./atss_r50_caffe_fpn_dyhead_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939-162888e6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939.log.json) | +| ATSS | R-50 | pytorch | simple | 1x | 4.9 | 13.7 | 43.3 | [config](./atss_r50_fpn_dyhead_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314.log.json) | + +- We trained the above models with 4 GPUs and 4 `samples_per_gpu`. +- The `reproduction` setting aims to reproduce the official implementation based on Detectron2. +- The `simple` setting serves as a minimal example of using DyHead in MMDetection. Specifically, + - it adds `DyHead` to `neck` after `FPN` + - it sets `stacked_convs=0` in `bbox_head` +- The `simple` setting achieves higher AP than the original implementation. + We have not conducted an ablation study between the two settings. + `dict(type='Pad', size_divisor=128)` may further improve AP by encouraging spatial alignment across pyramid levels, although large padding reduces efficiency. + +## Relation to Other Methods + +- DyHead can be regarded as an improved [SEPC](https://arxiv.org/abs/2005.03101) with [DyReLU modules](https://arxiv.org/abs/2003.10027) and simplified [SE blocks](https://arxiv.org/abs/1709.01507). +- Xiyang Dai et al., the author team of DyHead, adopt it for [Dynamic DETR](https://openaccess.thecvf.com/content/ICCV2021/html/Dai_Dynamic_DETR_End-to-End_Object_Detection_With_Dynamic_Attention_ICCV_2021_paper.html). + The description of Dynamic Encoder in Sec. 3.2 will help you understand DyHead.
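For a quick sanity check outside of a full detector, the new neck can also be exercised on its own, along the lines of the unit test added in this PR. A minimal sketch (it assumes the patched `mmdet` is importable and that the installed `mmcv-full` build provides the modulated deformable convolution op):

```python
import torch

from mmdet.models.necks import DyHead

# Toy 5-level feature pyramid with 256 channels per level.
feats = [torch.rand(1, 256, 64 // 2**i, 64 // 2**i) for i in range(5)]
neck = DyHead(in_channels=256, out_channels=256, num_blocks=6)
outs = neck(feats)

assert len(outs) == len(feats)                    # one refined map per pyramid level
assert all(out.shape[1] == 256 for out in outs)   # channel count is preserved
```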
+ +## Citation + +```latex +@inproceedings{DyHead_CVPR2021, + author = {Dai, Xiyang and Chen, Yinpeng and Xiao, Bin and Chen, Dongdong and Liu, Mengchen and Yuan, Lu and Zhang, Lei}, + title = {Dynamic Head: Unifying Object Detection Heads With Attentions}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2021} +} +``` diff --git a/configs/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py b/configs/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py new file mode 100644 index 00000000000..223b6532607 --- /dev/null +++ b/configs/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py @@ -0,0 +1,112 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='ATSS', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + dict( + type='DyHead', + in_channels=256, + out_channels=256, + num_blocks=6, + # disable zero_init_offset to follow official implementation + zero_init_offset=False) + ], + bbox_head=dict( + type='ATSSHead', + num_classes=80, + in_channels=256, + pred_kernel_size=1, # follow DyHead official implementation + stacked_convs=0, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128], + center_offset=0.5), # follow DyHead official implementation + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) + +# use caffe img_norm, size_divisor=128, pillow resize +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=(1333, 800), + keep_ratio=True, + backend='pillow'), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=128), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True, backend='pillow'), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=128), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + 
train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py b/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py new file mode 100644 index 00000000000..8c5109d0aff --- /dev/null +++ b/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py @@ -0,0 +1,65 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='ATSS', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + dict(type='DyHead', in_channels=256, out_channels=256, num_blocks=6) + ], + bbox_head=dict( + type='ATSSHead', + num_classes=80, + in_channels=256, + stacked_convs=0, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/dyhead/metafile.yml b/configs/dyhead/metafile.yml new file mode 100644 index 00000000000..a2e9504e61c --- /dev/null +++ b/configs/dyhead/metafile.yml @@ -0,0 +1,63 @@ +Collections: + - Name: DyHead + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 4x T4 GPUs + Architecture: + - ATSS + - DyHead + - FPN + - ResNet + - Deformable Convolution + - Pyramid Convolution + Paper: + URL: https://arxiv.org/abs/2106.08322 + Title: 'Dynamic Head: Unifying Object Detection Heads with Attentions' + README: configs/dyhead/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/mmdet/models/necks/dyhead.py#L130 + Version: v2.22.0 + +Models: + - Name: atss_r50_caffe_fpn_dyhead_1x_coco + In Collection: DyHead + Config: configs/dyhead/atss_r50_caffe_fpn_dyhead_1x_coco.py + Metadata: + Training Memory (GB): 5.4 + inference time (ms/im): + - value: 75.7 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939-162888e6.pth + + - Name: atss_r50_fpn_dyhead_1x_coco + In Collection: DyHead + Config: configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py + Metadata: + Training Memory (GB): 4.9 + inference time (ms/im): + - value: 73.1 + hardware: V100 + 
backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth diff --git a/mmdet/models/dense_heads/atss_head.py b/mmdet/models/dense_heads/atss_head.py index 00687a2fec8..e8f401caa1a 100644 --- a/mmdet/models/dense_heads/atss_head.py +++ b/mmdet/models/dense_heads/atss_head.py @@ -24,6 +24,7 @@ class ATSSHead(AnchorHead): def __init__(self, num_classes, in_channels, + pred_kernel_size=3, stacked_convs=4, conv_cfg=None, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), @@ -42,6 +43,7 @@ def __init__(self, std=0.01, bias_prob=0.01)), **kwargs): + self.pred_kernel_size = pred_kernel_size self.stacked_convs = stacked_convs self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg @@ -85,15 +87,22 @@ def _init_layers(self): padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg)) + pred_pad_size = self.pred_kernel_size // 2 self.atss_cls = nn.Conv2d( self.feat_channels, self.num_anchors * self.cls_out_channels, - 3, - padding=1) + self.pred_kernel_size, + padding=pred_pad_size) self.atss_reg = nn.Conv2d( - self.feat_channels, self.num_base_priors * 4, 3, padding=1) + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=pred_pad_size) self.atss_centerness = nn.Conv2d( - self.feat_channels, self.num_base_priors * 1, 3, padding=1) + self.feat_channels, + self.num_base_priors * 1, + self.pred_kernel_size, + padding=pred_pad_size) self.scales = nn.ModuleList( [Scale(1.0) for _ in self.prior_generator.strides]) diff --git a/mmdet/models/necks/__init__.py b/mmdet/models/necks/__init__.py index fac8397c379..6f2fa823fb3 100644 --- a/mmdet/models/necks/__init__.py +++ b/mmdet/models/necks/__init__.py @@ -3,6 +3,7 @@ from .channel_mapper import ChannelMapper from .ct_resnet_neck import CTResNetNeck from .dilated_encoder import DilatedEncoder +from .dyhead import DyHead from .fpg import FPG from .fpn import FPN from .fpn_carafe import FPN_CARAFE @@ -18,5 +19,5 @@ __all__ = [ 'FPN', 'BFP', 'ChannelMapper', 'HRFPN', 'NASFPN', 'FPN_CARAFE', 'PAFPN', 'NASFCOS_FPN', 'RFP', 'YOLOV3Neck', 'FPG', 'DilatedEncoder', - 'CTResNetNeck', 'SSDNeck', 'YOLOXPAFPN' + 'CTResNetNeck', 'SSDNeck', 'YOLOXPAFPN', 'DyHead' ] diff --git a/mmdet/models/necks/dyhead.py b/mmdet/models/necks/dyhead.py new file mode 100644 index 00000000000..5d752c348dc --- /dev/null +++ b/mmdet/models/necks/dyhead.py @@ -0,0 +1,174 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (build_activation_layer, build_norm_layer, constant_init, + normal_init) +from mmcv.ops.modulated_deform_conv import ModulatedDeformConv2d +from mmcv.runner import BaseModule + +from ..builder import NECKS +from ..utils import DyReLU + +# Reference: +# https://github.com/microsoft/DynamicHead +# https://github.com/jshilong/SEPC + + +class DyDCNv2(nn.Module): + """ModulatedDeformConv2d with normalization layer used in DyHead. + + This module cannot be configured with `conv_cfg=dict(type='DCNv2')` + because DyHead calculates offset and mask from middle-level feature. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + stride (int | tuple[int], optional): Stride of the convolution. + Default: 1. 
+ norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='GN', num_groups=16, requires_grad=True). + """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True)): + super().__init__() + self.with_norm = norm_cfg is not None + bias = not self.with_norm + self.conv = ModulatedDeformConv2d( + in_channels, out_channels, 3, stride=stride, padding=1, bias=bias) + if self.with_norm: + self.norm = build_norm_layer(norm_cfg, out_channels)[1] + + def forward(self, x, offset, mask): + """Forward function.""" + x = self.conv(x.contiguous(), offset, mask) + if self.with_norm: + x = self.norm(x) + return x + + +class DyHeadBlock(nn.Module): + """DyHead Block with three types of attention. + + HSigmoid arguments in default act_cfg follow official code, not paper. + https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + zero_init_offset (bool, optional): Whether to use zero init for + `spatial_conv_offset`. Default: True. + act_cfg (dict, optional): Config dict for the last activation layer of + scale-aware attention. Default: dict(type='HSigmoid', bias=3.0, + divisor=6.0). + """ + + def __init__(self, + in_channels, + out_channels, + zero_init_offset=True, + act_cfg=dict(type='HSigmoid', bias=3.0, divisor=6.0)): + super().__init__() + self.zero_init_offset = zero_init_offset + # (offset_x, offset_y, mask) * kernel_size_y * kernel_size_x + self.offset_and_mask_dim = 3 * 3 * 3 + self.offset_dim = 2 * 3 * 3 + + self.spatial_conv_high = DyDCNv2(in_channels, out_channels) + self.spatial_conv_mid = DyDCNv2(in_channels, out_channels) + self.spatial_conv_low = DyDCNv2(in_channels, out_channels, stride=2) + self.spatial_conv_offset = nn.Conv2d( + in_channels, self.offset_and_mask_dim, 3, padding=1) + self.scale_attn_module = nn.Sequential( + nn.AdaptiveAvgPool2d(1), nn.Conv2d(out_channels, 1, 1), + nn.ReLU(inplace=True), build_activation_layer(act_cfg)) + self.task_attn_module = DyReLU(out_channels) + self._init_weights() + + def _init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, 0, 0.01) + if self.zero_init_offset: + constant_init(self.spatial_conv_offset, 0) + + def forward(self, x): + """Forward function.""" + outs = [] + for level in range(len(x)): + # calculate offset and mask of DCNv2 from middle-level feature + offset_and_mask = self.spatial_conv_offset(x[level]) + offset = offset_and_mask[:, :self.offset_dim, :, :] + mask = offset_and_mask[:, self.offset_dim:, :, :].sigmoid() + + mid_feat = self.spatial_conv_mid(x[level], offset, mask) + sum_feat = mid_feat * self.scale_attn_module(mid_feat) + summed_levels = 1 + if level > 0: + low_feat = self.spatial_conv_low(x[level - 1], offset, mask) + sum_feat += low_feat * self.scale_attn_module(low_feat) + summed_levels += 1 + if level < len(x) - 1: + # this upsample order is weird, but faster than natural order + # https://github.com/microsoft/DynamicHead/issues/25 + high_feat = F.interpolate( + self.spatial_conv_high(x[level + 1], offset, mask), + size=x[level].shape[-2:], + mode='bilinear', + align_corners=True) + sum_feat += high_feat * self.scale_attn_module(high_feat) + summed_levels += 1 + outs.append(self.task_attn_module(sum_feat / summed_levels)) + + return outs + + +@NECKS.register_module() +class DyHead(BaseModule): + """DyHead neck consisting of multiple DyHead Blocks. 
+ + See `Dynamic Head: Unifying Object Detection Heads with Attentions + `_ for details. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_blocks (int, optional): Number of DyHead Blocks. Default: 6. + zero_init_offset (bool, optional): Whether to use zero init for + `spatial_conv_offset`. Default: True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + num_blocks=6, + zero_init_offset=True, + init_cfg=None): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_blocks = num_blocks + self.zero_init_offset = zero_init_offset + + dyhead_blocks = [] + for i in range(num_blocks): + in_channels = self.in_channels if i == 0 else self.out_channels + dyhead_blocks.append( + DyHeadBlock( + in_channels, + self.out_channels, + zero_init_offset=zero_init_offset)) + self.dyhead_blocks = nn.Sequential(*dyhead_blocks) + + def forward(self, inputs): + """Forward function.""" + assert isinstance(inputs, (tuple, list)) + outs = self.dyhead_blocks(inputs) + return tuple(outs) diff --git a/mmdet/models/utils/__init__.py b/mmdet/models/utils/__init__.py index 98678af5860..84dc141e850 100644 --- a/mmdet/models/utils/__init__.py +++ b/mmdet/models/utils/__init__.py @@ -12,7 +12,7 @@ from .positional_encoding import (LearnedPositionalEncoding, SinePositionalEncoding) from .res_layer import ResLayer, SimplifiedBasicBlock -from .se_layer import SELayer +from .se_layer import DyReLU, SELayer from .transformer import (DetrTransformerDecoder, DetrTransformerDecoderLayer, DynamicConv, PatchEmbed, Transformer, nchw_to_nlc, nlc_to_nchw) @@ -25,5 +25,5 @@ 'NormedLinear', 'NormedConv2d', 'make_divisible', 'InvertedResidual', 'SELayer', 'interpolate_as', 'ConvUpsample', 'CSPLayer', 'adaptive_avg_pool2d', 'AdaptiveAvgPool2d', 'PatchEmbed', 'nchw_to_nlc', - 'nlc_to_nchw', 'pvt_convert', 'sigmoid_geometric_mean' + 'nlc_to_nchw', 'pvt_convert', 'sigmoid_geometric_mean', 'DyReLU' ] diff --git a/mmdet/models/utils/se_layer.py b/mmdet/models/utils/se_layer.py index 8e55a9e494e..a2492103b15 100644 --- a/mmdet/models/utils/se_layer.py +++ b/mmdet/models/utils/se_layer.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import mmcv +import torch import torch.nn as nn from mmcv.cnn import ConvModule from mmcv.runner import BaseModule @@ -56,3 +57,71 @@ def forward(self, x): out = self.conv1(out) out = self.conv2(out) return x * out + + +class DyReLU(BaseModule): + """Dynamic ReLU (DyReLU) module. + + See `Dynamic ReLU `_ for details. + Current implementation is specialized for task-aware attention in DyHead. + HSigmoid arguments in default act_cfg follow DyHead official code. + https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py + + Args: + channels (int): The input (and output) channels of DyReLU module. + ratio (int): Squeeze ratio in Squeeze-and-Excitation-like module, + the intermediate channel will be ``int(channels/ratio)``. + Default: 4. + conv_cfg (None or dict): Config dict for convolution layer. + Default: None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configurated + by this dict. 
If act_cfg is a sequence of dicts, the first + activation layer will be configurated by the first dict and the + second activation layer will be configurated by the second dict. + Default: (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, + divisor=6.0)) + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + channels, + ratio=4, + conv_cfg=None, + act_cfg=(dict(type='ReLU'), + dict(type='HSigmoid', bias=3.0, divisor=6.0)), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.channels = channels + self.expansion = 4 # for a1, b1, a2, b2 + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels * self.expansion, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + """Forward function.""" + coeffs = self.global_avgpool(x) + coeffs = self.conv1(coeffs) + coeffs = self.conv2(coeffs) - 0.5 # value range: [-0.5, 0.5] + a1, b1, a2, b2 = torch.split(coeffs, self.channels, dim=1) + a1 = a1 * 2.0 + 1.0 # [-1.0, 1.0] + 1.0 + a2 = a2 * 2.0 # [-1.0, 1.0] + out = torch.max(x * a1 + b1, x * a2 + b2) + return out diff --git a/model-index.yml b/model-index.yml index b6ec18b682f..e05ab8d2964 100644 --- a/model-index.yml +++ b/model-index.yml @@ -13,6 +13,7 @@ Import: - configs/detectors/metafile.yml - configs/detr/metafile.yml - configs/double_heads/metafile.yml + - configs/dyhead/metafile.yml - configs/dynamic_rcnn/metafile.yml - configs/empirical_attention/metafile.yml - configs/faster_rcnn/metafile.yml diff --git a/tests/test_models/test_necks.py b/tests/test_models/test_necks.py index f68476d6c7b..301e51a85a3 100644 --- a/tests/test_models/test_necks.py +++ b/tests/test_models/test_necks.py @@ -4,7 +4,7 @@ from torch.nn.modules.batchnorm import _BatchNorm from mmdet.models.necks import (FPN, YOLOXPAFPN, ChannelMapper, CTResNetNeck, - DilatedEncoder, SSDNeck, YOLOV3Neck) + DilatedEncoder, DyHead, SSDNeck, YOLOV3Neck) def test_fpn(): @@ -404,3 +404,26 @@ def test_yolox_pafpn(): for i in range(len(feats)): assert outs[i].shape[1] == out_channels assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + +def test_dyhead(): + s = 64 + in_channels = 8 + out_channels = 16 + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + feats = [ + torch.rand(1, in_channels, feat_sizes[i], feat_sizes[i]) + for i in range(len(feat_sizes)) + ] + neck = DyHead( + in_channels=in_channels, out_channels=out_channels, num_blocks=3) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(outs)): + assert outs[i].shape[1] == out_channels + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + feat = torch.rand(1, 8, 4, 4) + # input feat must be tuple or list + with pytest.raises(AssertionError): + neck(feat) diff --git a/tests/test_models/test_utils/test_se_layer.py b/tests/test_models/test_utils/test_se_layer.py index ae7ec7f1090..b525b91116e 100644 --- a/tests/test_models/test_utils/test_se_layer.py +++ b/tests/test_models/test_utils/test_se_layer.py @@ -1,8 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import pytest import torch +import torch.nn.functional as F +from mmcv.cnn import constant_init -from mmdet.models.utils import SELayer +from mmdet.models.utils import DyReLU, SELayer def test_se_layer(): @@ -22,3 +24,31 @@ def test_se_layer(): x = torch.randn((1, 32, 10, 10)) x_out = layer(x) assert x_out.shape == torch.Size((1, 32, 10, 10)) + + +def test_dyrelu(): + with pytest.raises(AssertionError): + # act_cfg sequence length must equal to 2 + DyReLU(channels=32, act_cfg=(dict(type='ReLU'), )) + + with pytest.raises(AssertionError): + # act_cfg sequence must be a tuple of dict + DyReLU(channels=32, act_cfg=[dict(type='ReLU'), dict(type='ReLU')]) + + # Test DyReLU forward + layer = DyReLU(channels=32) + layer.init_weights() + layer.train() + x = torch.randn((1, 32, 10, 10)) + x_out = layer(x) + assert x_out.shape == torch.Size((1, 32, 10, 10)) + + # DyReLU should act as standard (static) ReLU + # when eliminating the effect of SE-like module + layer = DyReLU(channels=32) + constant_init(layer.conv2.conv, 0) + layer.train() + x = torch.randn((1, 32, 10, 10)) + x_out = layer(x) + relu_out = F.relu(x) + assert torch.equal(x_out, relu_out) From 613dacad3ab68ff24c10497b2cfb5e9314b89cb5 Mon Sep 17 00:00:00 2001 From: Haofan Wang Date: Tue, 22 Feb 2022 17:36:59 +0800 Subject: [PATCH 14/27] Fix broken colab link (#7218) * [Fix] Fix wrong img name in onnx2tensorrt.py (#7157) * [Docs] fix albumentations installed way (#7143) * Fix broken colab link Co-authored-by: Jamie Co-authored-by: BigDong --- demo/MMDet_Tutorial.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/MMDet_Tutorial.ipynb b/demo/MMDet_Tutorial.ipynb index 02d785bbfc1..4b8604a878b 100644 --- a/demo/MMDet_Tutorial.ipynb +++ b/demo/MMDet_Tutorial.ipynb @@ -363,7 +363,7 @@ "\n", "Usually we recommend to use the first two methods which are usually easier than the third.\n", "\n", - "In this tutorial, we gives an example that converting the data into the format of existing datasets like COCO, VOC, etc. Other methods and more advanced usages can be found in the [doc](https://mmdetection.readthedocs.io/en/latest/tutorials/new_dataset.html#).\n", + "In this tutorial, we gives an example that converting the data into the format of existing datasets like COCO, VOC, etc. Other methods and more advanced usages can be found in the [doc](https://mmdetection.readthedocs.io/en/latest/tutorials/customize_dataset.html).\n", "\n", "Firstly, let's download a tiny dataset obtained from [KITTI](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d). We select the first 75 images and their annotations from the 3D object detection dataset (it is the same dataset as the 2D object detection dataset but has 3D annotations). We convert the original images from PNG to JPEG format with 80% quality to reduce the size of dataset." 
] From 220c0da0288380ca8797d1fe2b1746fa203796a8 Mon Sep 17 00:00:00 2001 From: Zhijian Liu Date: Tue, 22 Feb 2022 06:18:08 -0500 Subject: [PATCH 15/27] Remove the inplace addition in `FPN` (#7175) * [Fix] Fix wrong img name in onnx2tensorrt.py (#7157) * [Docs] fix albumentations installed way (#7143) * Remove the inplace addition in `FPN` * update Co-authored-by: Jamie Co-authored-by: BigDong Co-authored-by: PJLAB\huanghaian <1286304229@qq.com> --- mmdet/models/necks/fpn.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mmdet/models/necks/fpn.py b/mmdet/models/necks/fpn.py index 2f065ed0dbb..9f601386549 100644 --- a/mmdet/models/necks/fpn.py +++ b/mmdet/models/necks/fpn.py @@ -165,11 +165,12 @@ def forward(self, inputs): # In some cases, fixing `scale factor` (e.g. 2) is preferred, but # it cannot co-exist with `size` in `F.interpolate`. if 'scale_factor' in self.upsample_cfg: - laterals[i - 1] += F.interpolate(laterals[i], - **self.upsample_cfg) + # fix runtime error of "+=" inplace operation in PyTorch 1.10 + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], **self.upsample_cfg) else: prev_shape = laterals[i - 1].shape[2:] - laterals[i - 1] += F.interpolate( + laterals[i - 1] = laterals[i - 1] + F.interpolate( laterals[i], size=prev_shape, **self.upsample_cfg) # build outputs From 1516986a616fee8bb741d0ab2be40683045efccd Mon Sep 17 00:00:00 2001 From: BigDong Date: Tue, 22 Feb 2022 19:25:09 +0800 Subject: [PATCH 16/27] [Feature] Support OpenImages Dataset (#6331) * [Feature] support openimage group of eval * [Feature] support openimage group of eval * support openimage dataset * support openimage challenge dataset * fully support OpenImages-V6 and OpenImages Challenge 2019 * Fix some logic error * update config file * fix get data_infos error * fully support OpenImages evaluation * update OpenImages config files * [Feature] support OpenImages datasets * fix bug * support load image metas from pipeline * fix bug * fix get classes logic error * update code * support get image metas * support openimags * support collect image metas * support Open Images * fix openimages logic * minor fix * add a new function to compute openimages tpfp * minor fix * fix ci error * minor fix * fix indication * minor fix * fix returns * fix returns * fix returns * fix returns * fix returns * minor fix * update readme * support loading image level labels and fix some logic * minor fix * minor fix * add class names * minor fix * minor fix * minor fix * add openimages test unit * minor fix * minor fix * fix test unit * minor fix * fix logic error * minor fix * fully support openimages * minor fix * fix docstring * fix docstrings in readthedocs * update get image metas script * label_description_file -> label_file * update openimages readme * fix test unit * fix test unit * minor fix * update readme file * Update get_image_metas.py --- .dev_scripts/gather_models.py | 4 +- .../_base_/datasets/openimages_detection.py | 65 ++ configs/openimages/README.md | 125 +++ .../faster_rcnn_r50_fpn_32x2_1x_openimages.py | 18 + ...nn_r50_fpn_32x2_1x_openimages_challenge.py | 42 + configs/openimages/metafile.yml | 62 ++ .../retinanet_r50_fpn_32x2_1x_openimages.py | 17 + .../openimages/ssd300_32x8_36e_openimages.py | 78 ++ mmdet/core/evaluation/__init__.py | 6 +- mmdet/core/evaluation/class_names.py | 217 ++++- mmdet/core/evaluation/mean_ap.py | 256 +++++- mmdet/datasets/__init__.py | 4 +- mmdet/datasets/custom.py | 2 +- mmdet/datasets/openimages.py | 841 ++++++++++++++++++ 
mmdet/datasets/pipelines/loading.py | 18 + .../test_datasets/test_openimages_dataset.py | 348 ++++++++ tests/test_metrics/test_mean_ap.py | 82 +- tools/misc/get_image_metas.py | 116 +++ 18 files changed, 2288 insertions(+), 13 deletions(-) create mode 100644 configs/_base_/datasets/openimages_detection.py create mode 100644 configs/openimages/README.md create mode 100644 configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py create mode 100644 configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py create mode 100644 configs/openimages/metafile.yml create mode 100644 configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py create mode 100644 configs/openimages/ssd300_32x8_36e_openimages.py create mode 100644 mmdet/datasets/openimages.py create mode 100644 tests/test_data/test_datasets/test_openimages_dataset.py create mode 100644 tools/misc/get_image_metas.py diff --git a/.dev_scripts/gather_models.py b/.dev_scripts/gather_models.py index 7404ae4639b..9dbce0d8a0c 100644 --- a/.dev_scripts/gather_models.py +++ b/.dev_scripts/gather_models.py @@ -98,7 +98,9 @@ def get_dataset_name(config): LVISV05Dataset='LVIS v0.5', LVISV1Dataset='LVIS v1', VOCDataset='Pascal VOC', - WIDERFaceDataset='WIDER Face') + WIDERFaceDataset='WIDER Face', + OpenImagesDataset='OpenImagesDataset', + OpenImagesChallengeDataset='OpenImagesChallengeDataset') cfg = mmcv.Config.fromfile('./configs/' + config) return name_map[cfg.dataset_type] diff --git a/configs/_base_/datasets/openimages_detection.py b/configs/_base_/datasets/openimages_detection.py new file mode 100644 index 00000000000..a65d30634ad --- /dev/null +++ b/configs/_base_/datasets/openimages_detection.py @@ -0,0 +1,65 @@ +# dataset settings +dataset_type = 'OpenImagesDataset' +data_root = 'data/OpenImages/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, denorm_bbox=True), + dict(type='Resize', img_scale=(1024, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1024, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ], + ), +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=0, # workers_per_gpu > 0 may occur out of memory + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/oidv6-train-annotations-bbox.csv', + img_prefix=data_root + 'OpenImages/train/', + label_file=data_root + 'annotations/class-descriptions-boxable.csv', + hierarchy_file=data_root + + 'annotations/bbox_labels_600_hierarchy.json', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/validation-annotations-bbox.csv', + img_prefix=data_root + 'OpenImages/validation/', + label_file=data_root + 'annotations/class-descriptions-boxable.csv', + hierarchy_file=data_root + + 'annotations/bbox_labels_600_hierarchy.json', + meta_file=data_root + 'annotations/validation-image-metas.pkl', + image_level_ann_file=data_root + + 
'annotations/validation-annotations-human-imagelabels-boxable.csv', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/validation-annotations-bbox.csv', + img_prefix=data_root + 'OpenImages/validation/', + label_file=data_root + 'annotations/class-descriptions-boxable.csv', + hierarchy_file=data_root + + 'annotations/bbox_labels_600_hierarchy.json', + meta_file=data_root + 'annotations/validation-image-metas.pkl', + image_level_ann_file=data_root + + 'annotations/validation-annotations-human-imagelabels-boxable.csv', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='mAP') diff --git a/configs/openimages/README.md b/configs/openimages/README.md new file mode 100644 index 00000000000..6d954217280 --- /dev/null +++ b/configs/openimages/README.md @@ -0,0 +1,125 @@ +# Open Images Dataset + + +## Abstract + + +#### Open Images v6 + +[Open Images](https://storage.googleapis.com/openimages/web/index.html) is a dataset of ~9M images annotated with image-level labels, +object bounding boxes, object segmentation masks, visual relationships, +and localized narratives: + +- It contains a total of 16M bounding boxes for 600 object classes on +1.9M images, making it the largest existing dataset with object location +annotations. The boxes have been largely manually drawn by professional +annotators to ensure accuracy and consistency. The images are very diverse +and often contain complex scenes with several objects (8.3 per image on +average). + +- Open Images also offers visual relationship annotations, indicating pairs +of objects in particular relations (e.g. "woman playing guitar", "beer on +table"), object properties (e.g. "table is wooden"), and human actions (e.g. +"woman is jumping"). In total it has 3.3M annotations from 1,466 distinct +relationship triplets. + +- In V5 we added segmentation masks for 2.8M object instances in 350 classes. +Segmentation masks mark the outline of objects, which characterizes their +spatial extent to a much higher level of detail. + +- In V6 we added 675k localized narratives: multimodal descriptions of images +consisting of synchronized voice, text, and mouse traces over the objects being +described. (Note we originally launched localized narratives only on train in V6, +but since July 2020 we also have validation and test covered.) + +- Finally, the dataset is annotated with 59.9M image-level labels spanning 19,957 +classes. + +We believe that having a single dataset with unified annotations for image +classification, object detection, visual relationship detection, instance +segmentation, and multimodal image descriptions will enable to study these +tasks jointly and stimulate progress towards genuine scene understanding. + + +
    + +
    + +#### Open Images Challenge 2019 + +[Open Images Challenges 2019](https://storage.googleapis.com/openimages/web/challenge2019.html) is based on the V5 release of the Open +Images dataset. The images of the dataset are very varied and +often contain complex scenes with several objects (explore the dataset). + +## Citation + +``` +@article{OpenImages, + author = {Alina Kuznetsova and Hassan Rom and Neil Alldrin and Jasper Uijlings and Ivan Krasin and Jordi Pont-Tuset and Shahab Kamali and Stefan Popov and Matteo Malloci and Alexander Kolesnikov and Tom Duerig and Vittorio Ferrari}, + title = {The Open Images Dataset V4: Unified image classification, object detection, and visual relationship detection at scale}, + year = {2020}, + journal = {IJCV} +} +``` + +## Prepare Dataset + +1. You need to download and extract Open Images dataset. + +2. The Open Images dataset does not have image metas (width and height of the image), +which will be used during evaluation. We suggest to get test image metas before +training/testing by using `tools/misc/get_image_metas.py`. + + **Usage** + ```shell + python tools/misc/get_image_metas.py ${CONFIG} \ + --out ${OUTPUT FILE NAME} + ``` + +3. The directory should be like this: + + ```none + mmdetection + ├── mmdet + ├── tools + ├── configs + ├── data + │ ├── OpenImages + │ │ ├── annotations + │ │ │ ├── bbox_labels_600_hierarchy.json + │ │ │ ├── class-descriptions-boxable.csv + │ │ │ ├── oidv6-train-annotations-bbox.scv + │ │ │ ├── validation-annotations-bbox.csv + │ │ │ ├── validation-annotations-human-imagelabels-boxable.csv # is not necessary + │ │ │ ├── validation-image-metas.pkl # get from script + │ │ ├── challenge2019 + │ │ │ ├── challenge-2019-train-detection-bbox.txt + │ │ │ ├── challenge-2019-validation-detection-bbox.txt + │ │ │ ├── class_label_tree.np + │ │ │ ├── class_sample_train.pkl + │ │ │ ├── challenge-2019-validation-detection-human-imagelabels.csv # download from official website, not necessary + │ │ │ ├── challenge-2019-validation-metas.pkl # get from script + │ │ ├── OpenImages + │ │ │ ├── train # training images + │ │ │ ├── test # testing images + │ │ │ ├── validation # validation images + ``` + +**Note**: +1. The training and validation images of Open Images Challenge dataset are based on +Open Images v6, but the test images are different. +2. The Open Images Challenges annotations are obtained from [TSD](https://github.com/Sense-X/TSD). +You can also download the annotations from [official website](https://storage.googleapis.com/openimages/web/challenge2019_downloads.html), +and set data.train.type=OpenImagesDataset, data.val.type=OpenImagesDataset, and data.test.type=OpenImagesDataset in the config +3. If users do not want to use `validation-annotations-human-imagelabels-boxable.csv` and `challenge-2019-validation-detection-human-imagelabels.csv` +users can should set `data.val.load_image_level_labels=False` and `data.test.load_image_level_labels=False` in the config . 
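
As a concrete illustration of notes 2 and 3 above, here is a hypothetical config override (an editorial sketch, not shipped with this patch). The `_base_` file is the Faster R-CNN OpenImages config added later in this patch, and only the fields mentioned in the notes are touched; the file name is invented.

```python
# configs/openimages/my_openimages_no_image_labels.py  (hypothetical file name)
_base_ = ['./faster_rcnn_r50_fpn_32x2_1x_openimages.py']

data = dict(
    # Note 3: evaluate without the human-verified image-level label CSVs.
    val=dict(load_image_level_labels=False),
    test=dict(load_image_level_labels=False))
# Note 2: if the official challenge annotations are used instead of the TSD
# ones, data.train/val/test.type would be set to 'OpenImagesDataset' in the
# same way, with ann_file/label_file pointing at the corresponding CSV files.
```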
+ + +## Results and Models + +| Architecture | Backbone | Style | Lr schd | Sampler | Mem (GB) | Inf time (fps) | box AP | Config | Download | +|:------------:|:---------:|:-------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:--------:| +| Faster R-CNN | R-50 | pytorch | 1x | Group Sampler | 7.7 | - | 51.6 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159-e87ab7ce.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159.log.json) | +| Faster R-CNN (Challenge 2019) | R-50 | pytorch | 1x | Group Sampler | 7.7 | - | 54.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20211229_071252-46380cde.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20211229_071252.log.json) | +| Retinanet | R-50 | pytorch | 1x | Group Sampler | 6.6 | - | 61.5 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954-d2ae5462.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954.log.json) | +| SSD | VGG16 | pytorch | 36e | Group Sampler | 10.8 | - | 35.4 |[config](https://github.com/open-mmlab/mmdetection/tree/master/configs/openimages/ssd300_32x8_36e_openimages.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232-dce93846.pth) | [log](ttps://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232.log.json) | diff --git a/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py b/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py new file mode 100644 index 00000000000..4e719e3cd31 --- /dev/null +++ b/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/openimages_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict(roi_head=dict(bbox_head=dict(num_classes=601))) + +# Using 32 GPUS while training +optimizer = dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=26000, + warmup_ratio=1.0 / 64, + step=[8, 11]) diff --git a/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py b/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py new file mode 100644 index 00000000000..39ce28b1588 --- /dev/null +++ 
b/configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py @@ -0,0 +1,42 @@ +_base_ = ['faster_rcnn_r50_fpn_32x2_1x_openimages.py'] + +model = dict( + roi_head=dict(bbox_head=dict(num_classes=500)), + test_cfg=dict(rcnn=dict(score_thr=0.01))) + +# dataset settings +dataset_type = 'OpenImagesChallengeDataset' +data_root = 'data/OpenImages/' +data = dict( + train=dict( + type=dataset_type, + ann_file=data_root + + 'challenge2019/challenge-2019-train-detection-bbox.txt', + img_prefix=data_root + 'OpenImages/', + label_file=data_root + 'challenge2019/cls-label-description.csv', + hierarchy_file=data_root + 'challenge2019/class_label_tree.np'), + val=dict( + type=dataset_type, + ann_file=data_root + + 'challenge2019/challenge-2019-validation-detection-bbox.txt', + img_prefix=data_root + 'OpenImages/', + label_file=data_root + 'challenge2019/cls-label-description.csv', + hierarchy_file=data_root + 'challenge2019/class_label_tree.np', + meta_file=data_root + + 'challenge2019/challenge-2019-validation-metas.pkl', + image_level_ann_file=data_root + + 'challenge2019/challenge-2019-validation-detection-' + 'human-imagelabels.csv'), + test=dict( + type=dataset_type, + ann_file=data_root + + 'challenge2019/challenge-2019-validation-detection-bbox.txt', + img_prefix=data_root + 'OpenImages/', + label_file=data_root + 'challenge2019/cls-label-description.csv', + hierarchy_file=data_root + 'challenge2019/class_label_tree.np', + meta_file=data_root + + 'challenge2019/challenge-2019-validation-metas.pkl', + image_level_ann_file=data_root + + 'challenge2019/challenge-2019-validation-detection-' + 'human-imagelabels.csv')) +evaluation = dict(interval=1, metric='mAP') diff --git a/configs/openimages/metafile.yml b/configs/openimages/metafile.yml new file mode 100644 index 00000000000..a3e7a8acfbe --- /dev/null +++ b/configs/openimages/metafile.yml @@ -0,0 +1,62 @@ +Collections: + - Name: Open Images Dataset + Paper: + URL: https://arxiv.org/abs/1811.00982 + Title: 'The Open Images Dataset V4: Unified image classification, object detection, and visual relationship detection at scale' + README: configs/openimages/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.20.0/mmdet/datasets/openimages.py#L21 + Version: v2.20.0 + +Models: + - Name: faster_rcnn_r50_fpn_32x2_1x_openimages + In Collection: Open Images Dataset + Config: configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages.py + Metadata: + Training Memory (GB): 7.7 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: Open Images v6 + Metrics: + box AP: 51.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159-e87ab7ce.pth + + - Name: retinanet_r50_fpn_32x2_1x_openimages + In Collection: Open Images Dataset + Config: configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py + Metadata: + Training Memory (GB): 6.6 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: Open Images v6 + Metrics: + box AP: 61.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954-d2ae5462.pth + + - Name: ssd300_32x8_36e_openimages + In Collection: Open Images Dataset + Config: configs/openimages/ssd300_32x8_36e_openimages + Metadata: + Training Memory (GB): 10.8 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: Open Images v6 + Metrics: + box AP: 35.4 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232-dce93846.pth + + - Name: faster_rcnn_r50_fpn_32x2_1x_openimages_challenge + In Collection: Open Images Dataset + Config: configs/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge.py + Metadata: + Training Memory (GB): 7.7 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: Open Images Challenge 2019W + Metrics: + box AP: 54.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20211229_071252-46380cde.pth diff --git a/configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py b/configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py new file mode 100644 index 00000000000..f45be19e4d9 --- /dev/null +++ b/configs/openimages/retinanet_r50_fpn_32x2_1x_openimages.py @@ -0,0 +1,17 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/openimages_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict(bbox_head=dict(num_classes=601)) + +optimizer = dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=26000, + warmup_ratio=1.0 / 64, + step=[8, 11]) diff --git a/configs/openimages/ssd300_32x8_36e_openimages.py b/configs/openimages/ssd300_32x8_36e_openimages.py new file mode 100644 index 00000000000..60143db4203 --- /dev/null +++ b/configs/openimages/ssd300_32x8_36e_openimages.py @@ -0,0 +1,78 @@ +_base_ = [ + '../_base_/models/ssd300.py', '../_base_/datasets/openimages_detection.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py' +] +model = dict( + bbox_head=dict( + num_classes=601, + anchor_generator=dict(basesize_ratio_range=(0.2, 0.9)))) +# dataset settings +dataset_type = 'OpenImagesDataset' +data_root = 'data/OpenImages/' +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, normed_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', img_scale=(300, 300), keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(300, 300), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=False), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=8, # using 32 GPUS while training. 
+ workers_per_gpu=0, # workers_per_gpu > 0 may occur out of memory + train=dict( + _delete_=True, + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + ann_file=data_root + + 'annotations/oidv6-train-annotations-bbox.csv', + img_prefix=data_root + 'OpenImages/train/', + label_file=data_root + + 'annotations/class-descriptions-boxable.csv', + hierarchy_file=data_root + + 'annotations/bbox_labels_600_hierarchy.json', + pipeline=train_pipeline)), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.04, momentum=0.9, weight_decay=5e-4) +optimizer_config = dict() +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=20000, + warmup_ratio=0.001, + step=[8, 11]) diff --git a/mmdet/core/evaluation/__init__.py b/mmdet/core/evaluation/__init__.py index 0aa94c9fe0e..67e7c55b3ca 100644 --- a/mmdet/core/evaluation/__init__.py +++ b/mmdet/core/evaluation/__init__.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. from .class_names import (cityscapes_classes, coco_classes, dataset_aliases, get_classes, imagenet_det_classes, - imagenet_vid_classes, voc_classes) + imagenet_vid_classes, oid_challenge_classes, + oid_v6_classes, voc_classes) from .eval_hooks import DistEvalHook, EvalHook from .mean_ap import average_precision, eval_map, print_map_summary from .panoptic_utils import INSTANCE_OFFSET @@ -13,5 +14,6 @@ 'coco_classes', 'cityscapes_classes', 'dataset_aliases', 'get_classes', 'DistEvalHook', 'EvalHook', 'average_precision', 'eval_map', 'print_map_summary', 'eval_recalls', 'print_recall_summary', - 'plot_num_recall', 'plot_iou_recall', 'INSTANCE_OFFSET' + 'plot_num_recall', 'plot_iou_recall', 'oid_v6_classes', + 'oid_challenge_classes', 'INSTANCE_OFFSET' ] diff --git a/mmdet/core/evaluation/class_names.py b/mmdet/core/evaluation/class_names.py index c7122b5ca95..737971182d3 100644 --- a/mmdet/core/evaluation/class_names.py +++ b/mmdet/core/evaluation/class_names.py @@ -90,13 +90,228 @@ def cityscapes_classes(): ] +def oid_challenge_classes(): + return [ + 'Footwear', 'Jeans', 'House', 'Tree', 'Woman', 'Man', 'Land vehicle', + 'Person', 'Wheel', 'Bus', 'Human face', 'Bird', 'Dress', 'Girl', + 'Vehicle', 'Building', 'Cat', 'Car', 'Belt', 'Elephant', 'Dessert', + 'Butterfly', 'Train', 'Guitar', 'Poster', 'Book', 'Boy', 'Bee', + 'Flower', 'Window', 'Hat', 'Human head', 'Dog', 'Human arm', 'Drink', + 'Human mouth', 'Human hair', 'Human nose', 'Human hand', 'Table', + 'Marine invertebrates', 'Fish', 'Sculpture', 'Rose', 'Street light', + 'Glasses', 'Fountain', 'Skyscraper', 'Swimwear', 'Brassiere', 'Drum', + 'Duck', 'Countertop', 'Furniture', 'Ball', 'Human leg', 'Boat', + 'Balloon', 'Bicycle helmet', 'Goggles', 'Door', 'Human eye', 'Shirt', + 'Toy', 'Teddy bear', 'Pasta', 'Tomato', 'Human ear', + 'Vehicle registration plate', 'Microphone', 'Musical keyboard', + 'Tower', 'Houseplant', 'Flowerpot', 'Fruit', 'Vegetable', + 'Musical instrument', 'Suit', 'Motorcycle', 'Bagel', 'French fries', + 'Hamburger', 'Chair', 'Salt and pepper shakers', 'Snail', 'Airplane', + 'Horse', 'Laptop', 'Computer keyboard', 'Football helmet', 'Cocktail', + 'Juice', 'Tie', 'Computer monitor', 'Human beard', 'Bottle', + 'Saxophone', 'Lemon', 'Mouse', 'Sock', 'Cowboy hat', 'Sun hat', + 'Football', 'Porch', 'Sunglasses', 'Lobster', 'Crab', 'Picture frame', + 'Van', 'Crocodile', 'Surfboard', 'Shorts', 'Helicopter', 'Helmet', + 'Sports uniform', 'Taxi', 'Swan', 'Goose', 'Coat', 'Jacket', 'Handbag', 
+ 'Flag', 'Skateboard', 'Television', 'Tire', 'Spoon', 'Palm tree', + 'Stairs', 'Salad', 'Castle', 'Oven', 'Microwave oven', 'Wine', + 'Ceiling fan', 'Mechanical fan', 'Cattle', 'Truck', 'Box', 'Ambulance', + 'Desk', 'Wine glass', 'Reptile', 'Tank', 'Traffic light', 'Billboard', + 'Tent', 'Insect', 'Spider', 'Treadmill', 'Cupboard', 'Shelf', + 'Seat belt', 'Human foot', 'Bicycle', 'Bicycle wheel', 'Couch', + 'Bookcase', 'Fedora', 'Backpack', 'Bench', 'Oyster', + 'Moths and butterflies', 'Lavender', 'Waffle', 'Fork', 'Animal', + 'Accordion', 'Mobile phone', 'Plate', 'Coffee cup', 'Saucer', + 'Platter', 'Dagger', 'Knife', 'Bull', 'Tortoise', 'Sea turtle', 'Deer', + 'Weapon', 'Apple', 'Ski', 'Taco', 'Traffic sign', 'Beer', 'Necklace', + 'Sunflower', 'Piano', 'Organ', 'Harpsichord', 'Bed', 'Cabinetry', + 'Nightstand', 'Curtain', 'Chest of drawers', 'Drawer', 'Parrot', + 'Sandal', 'High heels', 'Tableware', 'Cart', 'Mushroom', 'Kite', + 'Missile', 'Seafood', 'Camera', 'Paper towel', 'Toilet paper', + 'Sombrero', 'Radish', 'Lighthouse', 'Segway', 'Pig', 'Watercraft', + 'Golf cart', 'studio couch', 'Dolphin', 'Whale', 'Earrings', 'Otter', + 'Sea lion', 'Whiteboard', 'Monkey', 'Gondola', 'Zebra', + 'Baseball glove', 'Scarf', 'Adhesive tape', 'Trousers', 'Scoreboard', + 'Lily', 'Carnivore', 'Power plugs and sockets', 'Office building', + 'Sandwich', 'Swimming pool', 'Headphones', 'Tin can', 'Crown', 'Doll', + 'Cake', 'Frog', 'Beetle', 'Ant', 'Gas stove', 'Canoe', 'Falcon', + 'Blue jay', 'Egg', 'Fire hydrant', 'Raccoon', 'Muffin', 'Wall clock', + 'Coffee', 'Mug', 'Tea', 'Bear', 'Waste container', 'Home appliance', + 'Candle', 'Lion', 'Mirror', 'Starfish', 'Marine mammal', 'Wheelchair', + 'Umbrella', 'Alpaca', 'Violin', 'Cello', 'Brown bear', 'Canary', 'Bat', + 'Ruler', 'Plastic bag', 'Penguin', 'Watermelon', 'Harbor seal', 'Pen', + 'Pumpkin', 'Harp', 'Kitchen appliance', 'Roller skates', 'Bust', + 'Coffee table', 'Tennis ball', 'Tennis racket', 'Ladder', 'Boot', + 'Bowl', 'Stop sign', 'Volleyball', 'Eagle', 'Paddle', 'Chicken', + 'Skull', 'Lamp', 'Beehive', 'Maple', 'Sink', 'Goldfish', 'Tripod', + 'Coconut', 'Bidet', 'Tap', 'Bathroom cabinet', 'Toilet', + 'Filing cabinet', 'Pretzel', 'Table tennis racket', 'Bronze sculpture', + 'Rocket', 'Mouse', 'Hamster', 'Lizard', 'Lifejacket', 'Goat', + 'Washing machine', 'Trumpet', 'Horn', 'Trombone', 'Sheep', + 'Tablet computer', 'Pillow', 'Kitchen & dining room table', + 'Parachute', 'Raven', 'Glove', 'Loveseat', 'Christmas tree', + 'Shellfish', 'Rifle', 'Shotgun', 'Sushi', 'Sparrow', 'Bread', + 'Toaster', 'Watch', 'Asparagus', 'Artichoke', 'Suitcase', 'Antelope', + 'Broccoli', 'Ice cream', 'Racket', 'Banana', 'Cookie', 'Cucumber', + 'Dragonfly', 'Lynx', 'Caterpillar', 'Light bulb', 'Office supplies', + 'Miniskirt', 'Skirt', 'Fireplace', 'Potato', 'Light switch', + 'Croissant', 'Cabbage', 'Ladybug', 'Handgun', 'Luggage and bags', + 'Window blind', 'Snowboard', 'Baseball bat', 'Digital clock', + 'Serving tray', 'Infant bed', 'Sofa bed', 'Guacamole', 'Fox', 'Pizza', + 'Snowplow', 'Jet ski', 'Refrigerator', 'Lantern', 'Convenience store', + 'Sword', 'Rugby ball', 'Owl', 'Ostrich', 'Pancake', 'Strawberry', + 'Carrot', 'Tart', 'Dice', 'Turkey', 'Rabbit', 'Invertebrate', 'Vase', + 'Stool', 'Swim cap', 'Shower', 'Clock', 'Jellyfish', 'Aircraft', + 'Chopsticks', 'Orange', 'Snake', 'Sewing machine', 'Kangaroo', 'Mixer', + 'Food processor', 'Shrimp', 'Towel', 'Porcupine', 'Jaguar', 'Cannon', + 'Limousine', 'Mule', 'Squirrel', 'Kitchen knife', 'Tiara', 'Tiger', + 'Bow 
and arrow', 'Candy', 'Rhinoceros', 'Shark', 'Cricket ball', + 'Doughnut', 'Plumbing fixture', 'Camel', 'Polar bear', 'Coin', + 'Printer', 'Blender', 'Giraffe', 'Billiard table', 'Kettle', + 'Dinosaur', 'Pineapple', 'Zucchini', 'Jug', 'Barge', 'Teapot', + 'Golf ball', 'Binoculars', 'Scissors', 'Hot dog', 'Door handle', + 'Seahorse', 'Bathtub', 'Leopard', 'Centipede', 'Grapefruit', 'Snowman', + 'Cheetah', 'Alarm clock', 'Grape', 'Wrench', 'Wok', 'Bell pepper', + 'Cake stand', 'Barrel', 'Woodpecker', 'Flute', 'Corded phone', + 'Willow', 'Punching bag', 'Pomegranate', 'Telephone', 'Pear', + 'Common fig', 'Bench', 'Wood-burning stove', 'Burrito', 'Nail', + 'Turtle', 'Submarine sandwich', 'Drinking straw', 'Peach', 'Popcorn', + 'Frying pan', 'Picnic basket', 'Honeycomb', 'Envelope', 'Mango', + 'Cutting board', 'Pitcher', 'Stationary bicycle', 'Dumbbell', + 'Personal care', 'Dog bed', 'Snowmobile', 'Oboe', 'Briefcase', + 'Squash', 'Tick', 'Slow cooker', 'Coffeemaker', 'Measuring cup', + 'Crutch', 'Stretcher', 'Screwdriver', 'Flashlight', 'Spatula', + 'Pressure cooker', 'Ring binder', 'Beaker', 'Torch', 'Winter melon' + ] + + +def oid_v6_classes(): + return [ + 'Tortoise', 'Container', 'Magpie', 'Sea turtle', 'Football', + 'Ambulance', 'Ladder', 'Toothbrush', 'Syringe', 'Sink', 'Toy', + 'Organ (Musical Instrument)', 'Cassette deck', 'Apple', 'Human eye', + 'Cosmetics', 'Paddle', 'Snowman', 'Beer', 'Chopsticks', 'Human beard', + 'Bird', 'Parking meter', 'Traffic light', 'Croissant', 'Cucumber', + 'Radish', 'Towel', 'Doll', 'Skull', 'Washing machine', 'Glove', 'Tick', + 'Belt', 'Sunglasses', 'Banjo', 'Cart', 'Ball', 'Backpack', 'Bicycle', + 'Home appliance', 'Centipede', 'Boat', 'Surfboard', 'Boot', + 'Headphones', 'Hot dog', 'Shorts', 'Fast food', 'Bus', 'Boy', + 'Screwdriver', 'Bicycle wheel', 'Barge', 'Laptop', 'Miniskirt', + 'Drill (Tool)', 'Dress', 'Bear', 'Waffle', 'Pancake', 'Brown bear', + 'Woodpecker', 'Blue jay', 'Pretzel', 'Bagel', 'Tower', 'Teapot', + 'Person', 'Bow and arrow', 'Swimwear', 'Beehive', 'Brassiere', 'Bee', + 'Bat (Animal)', 'Starfish', 'Popcorn', 'Burrito', 'Chainsaw', + 'Balloon', 'Wrench', 'Tent', 'Vehicle registration plate', 'Lantern', + 'Toaster', 'Flashlight', 'Billboard', 'Tiara', 'Limousine', 'Necklace', + 'Carnivore', 'Scissors', 'Stairs', 'Computer keyboard', 'Printer', + 'Traffic sign', 'Chair', 'Shirt', 'Poster', 'Cheese', 'Sock', + 'Fire hydrant', 'Land vehicle', 'Earrings', 'Tie', 'Watercraft', + 'Cabinetry', 'Suitcase', 'Muffin', 'Bidet', 'Snack', 'Snowmobile', + 'Clock', 'Medical equipment', 'Cattle', 'Cello', 'Jet ski', 'Camel', + 'Coat', 'Suit', 'Desk', 'Cat', 'Bronze sculpture', 'Juice', 'Gondola', + 'Beetle', 'Cannon', 'Computer mouse', 'Cookie', 'Office building', + 'Fountain', 'Coin', 'Calculator', 'Cocktail', 'Computer monitor', + 'Box', 'Stapler', 'Christmas tree', 'Cowboy hat', 'Hiking equipment', + 'Studio couch', 'Drum', 'Dessert', 'Wine rack', 'Drink', 'Zucchini', + 'Ladle', 'Human mouth', 'Dairy Product', 'Dice', 'Oven', 'Dinosaur', + 'Ratchet (Device)', 'Couch', 'Cricket ball', 'Winter melon', 'Spatula', + 'Whiteboard', 'Pencil sharpener', 'Door', 'Hat', 'Shower', 'Eraser', + 'Fedora', 'Guacamole', 'Dagger', 'Scarf', 'Dolphin', 'Sombrero', + 'Tin can', 'Mug', 'Tap', 'Harbor seal', 'Stretcher', 'Can opener', + 'Goggles', 'Human body', 'Roller skates', 'Coffee cup', + 'Cutting board', 'Blender', 'Plumbing fixture', 'Stop sign', + 'Office supplies', 'Volleyball (Ball)', 'Vase', 'Slow cooker', + 'Wardrobe', 'Coffee', 'Whisk', 'Paper towel', 
'Personal care', 'Food', + 'Sun hat', 'Tree house', 'Flying disc', 'Skirt', 'Gas stove', + 'Salt and pepper shakers', 'Mechanical fan', 'Face powder', 'Fax', + 'Fruit', 'French fries', 'Nightstand', 'Barrel', 'Kite', 'Tart', + 'Treadmill', 'Fox', 'Flag', 'French horn', 'Window blind', + 'Human foot', 'Golf cart', 'Jacket', 'Egg (Food)', 'Street light', + 'Guitar', 'Pillow', 'Human leg', 'Isopod', 'Grape', 'Human ear', + 'Power plugs and sockets', 'Panda', 'Giraffe', 'Woman', 'Door handle', + 'Rhinoceros', 'Bathtub', 'Goldfish', 'Houseplant', 'Goat', + 'Baseball bat', 'Baseball glove', 'Mixing bowl', + 'Marine invertebrates', 'Kitchen utensil', 'Light switch', 'House', + 'Horse', 'Stationary bicycle', 'Hammer', 'Ceiling fan', 'Sofa bed', + 'Adhesive tape', 'Harp', 'Sandal', 'Bicycle helmet', 'Saucer', + 'Harpsichord', 'Human hair', 'Heater', 'Harmonica', 'Hamster', + 'Curtain', 'Bed', 'Kettle', 'Fireplace', 'Scale', 'Drinking straw', + 'Insect', 'Hair dryer', 'Kitchenware', 'Indoor rower', 'Invertebrate', + 'Food processor', 'Bookcase', 'Refrigerator', 'Wood-burning stove', + 'Punching bag', 'Common fig', 'Cocktail shaker', 'Jaguar (Animal)', + 'Golf ball', 'Fashion accessory', 'Alarm clock', 'Filing cabinet', + 'Artichoke', 'Table', 'Tableware', 'Kangaroo', 'Koala', 'Knife', + 'Bottle', 'Bottle opener', 'Lynx', 'Lavender (Plant)', 'Lighthouse', + 'Dumbbell', 'Human head', 'Bowl', 'Humidifier', 'Porch', 'Lizard', + 'Billiard table', 'Mammal', 'Mouse', 'Motorcycle', + 'Musical instrument', 'Swim cap', 'Frying pan', 'Snowplow', + 'Bathroom cabinet', 'Missile', 'Bust', 'Man', 'Waffle iron', 'Milk', + 'Ring binder', 'Plate', 'Mobile phone', 'Baked goods', 'Mushroom', + 'Crutch', 'Pitcher (Container)', 'Mirror', 'Personal flotation device', + 'Table tennis racket', 'Pencil case', 'Musical keyboard', 'Scoreboard', + 'Briefcase', 'Kitchen knife', 'Nail (Construction)', 'Tennis ball', + 'Plastic bag', 'Oboe', 'Chest of drawers', 'Ostrich', 'Piano', 'Girl', + 'Plant', 'Potato', 'Hair spray', 'Sports equipment', 'Pasta', + 'Penguin', 'Pumpkin', 'Pear', 'Infant bed', 'Polar bear', 'Mixer', + 'Cupboard', 'Jacuzzi', 'Pizza', 'Digital clock', 'Pig', 'Reptile', + 'Rifle', 'Lipstick', 'Skateboard', 'Raven', 'High heels', 'Red panda', + 'Rose', 'Rabbit', 'Sculpture', 'Saxophone', 'Shotgun', 'Seafood', + 'Submarine sandwich', 'Snowboard', 'Sword', 'Picture frame', 'Sushi', + 'Loveseat', 'Ski', 'Squirrel', 'Tripod', 'Stethoscope', 'Submarine', + 'Scorpion', 'Segway', 'Training bench', 'Snake', 'Coffee table', + 'Skyscraper', 'Sheep', 'Television', 'Trombone', 'Tea', 'Tank', 'Taco', + 'Telephone', 'Torch', 'Tiger', 'Strawberry', 'Trumpet', 'Tree', + 'Tomato', 'Train', 'Tool', 'Picnic basket', 'Cooking spray', + 'Trousers', 'Bowling equipment', 'Football helmet', 'Truck', + 'Measuring cup', 'Coffeemaker', 'Violin', 'Vehicle', 'Handbag', + 'Paper cutter', 'Wine', 'Weapon', 'Wheel', 'Worm', 'Wok', 'Whale', + 'Zebra', 'Auto part', 'Jug', 'Pizza cutter', 'Cream', 'Monkey', 'Lion', + 'Bread', 'Platter', 'Chicken', 'Eagle', 'Helicopter', 'Owl', 'Duck', + 'Turtle', 'Hippopotamus', 'Crocodile', 'Toilet', 'Toilet paper', + 'Squid', 'Clothing', 'Footwear', 'Lemon', 'Spider', 'Deer', 'Frog', + 'Banana', 'Rocket', 'Wine glass', 'Countertop', 'Tablet computer', + 'Waste container', 'Swimming pool', 'Dog', 'Book', 'Elephant', 'Shark', + 'Candle', 'Leopard', 'Axe', 'Hand dryer', 'Soap dispenser', + 'Porcupine', 'Flower', 'Canary', 'Cheetah', 'Palm tree', 'Hamburger', + 'Maple', 'Building', 'Fish', 'Lobster', 'Garden 
Asparagus', + 'Furniture', 'Hedgehog', 'Airplane', 'Spoon', 'Otter', 'Bull', + 'Oyster', 'Horizontal bar', 'Convenience store', 'Bomb', 'Bench', + 'Ice cream', 'Caterpillar', 'Butterfly', 'Parachute', 'Orange', + 'Antelope', 'Beaker', 'Moths and butterflies', 'Window', 'Closet', + 'Castle', 'Jellyfish', 'Goose', 'Mule', 'Swan', 'Peach', 'Coconut', + 'Seat belt', 'Raccoon', 'Chisel', 'Fork', 'Lamp', 'Camera', + 'Squash (Plant)', 'Racket', 'Human face', 'Human arm', 'Vegetable', + 'Diaper', 'Unicycle', 'Falcon', 'Chime', 'Snail', 'Shellfish', + 'Cabbage', 'Carrot', 'Mango', 'Jeans', 'Flowerpot', 'Pineapple', + 'Drawer', 'Stool', 'Envelope', 'Cake', 'Dragonfly', 'Common sunflower', + 'Microwave oven', 'Honeycomb', 'Marine mammal', 'Sea lion', 'Ladybug', + 'Shelf', 'Watch', 'Candy', 'Salad', 'Parrot', 'Handgun', 'Sparrow', + 'Van', 'Grinder', 'Spice rack', 'Light bulb', 'Corded phone', + 'Sports uniform', 'Tennis racket', 'Wall clock', 'Serving tray', + 'Kitchen & dining room table', 'Dog bed', 'Cake stand', + 'Cat furniture', 'Bathroom accessory', 'Facial tissue holder', + 'Pressure cooker', 'Kitchen appliance', 'Tire', 'Ruler', + 'Luggage and bags', 'Microphone', 'Broccoli', 'Umbrella', 'Pastry', + 'Grapefruit', 'Band-aid', 'Animal', 'Bell pepper', 'Turkey', 'Lily', + 'Pomegranate', 'Doughnut', 'Glasses', 'Human nose', 'Pen', 'Ant', + 'Car', 'Aircraft', 'Human hand', 'Skunk', 'Teddy bear', 'Watermelon', + 'Cantaloupe', 'Dishwasher', 'Flute', 'Balance beam', 'Sandwich', + 'Shrimp', 'Sewing machine', 'Binoculars', 'Rays and skates', 'Ipod', + 'Accordion', 'Willow', 'Crab', 'Crown', 'Seahorse', 'Perfume', + 'Alpaca', 'Taxi', 'Canoe', 'Remote control', 'Wheelchair', + 'Rugby ball', 'Armadillo', 'Maracas', 'Helmet' + ] + + dataset_aliases = { 'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'], 'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'], 'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'], 'coco': ['coco', 'mscoco', 'ms_coco'], 'wider_face': ['WIDERFaceDataset', 'wider_face', 'WIDERFace'], - 'cityscapes': ['cityscapes'] + 'cityscapes': ['cityscapes'], + 'oid_challenge': ['oid_challenge', 'openimages_challenge'], + 'oid_v6': ['oid_v6', 'openimages_v6'] } diff --git a/mmdet/core/evaluation/mean_ap.py b/mmdet/core/evaluation/mean_ap.py index ea9a72b4103..fc1274aefea 100644 --- a/mmdet/core/evaluation/mean_ap.py +++ b/mmdet/core/evaluation/mean_ap.py @@ -82,7 +82,7 @@ def tpfp_imagenet(det_bboxes, Returns: tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of - each array is (num_scales, m). + each array is (num_scales, m). """ if not use_legacy_coordinate: @@ -190,7 +190,7 @@ def tpfp_default(det_bboxes, Returns: tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of - each array is (num_scales, m). + each array is (num_scales, m). """ if not use_legacy_coordinate: @@ -267,6 +267,210 @@ def tpfp_default(det_bboxes, return tp, fp +def tpfp_openimages(det_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + iou_thr=0.5, + area_ranges=None, + use_legacy_coordinate=False, + gt_bboxes_group_of=None, + use_group_of=True, + ioa_thr=0.5): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. 
+ area_ranges (list[tuple] | None): Range of bbox areas to be + evaluated, in the format [(min1, max1), (min2, max2), ...]. + Default: None. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + gt_bboxes_group_of (ndarray): GT group_of of this image, of shape + (k, 1). Default: None + use_group_of (bool): Whether to use group of when calculate TP and FP, + which only used in OpenImages evaluation. Default: True. + ioa_thr (float | None): IoA threshold to be considered as matched, + which only used in OpenImages evaluation. Default: 0.5. + + Returns: + tuple[np.ndarray]: Returns a tuple (tp, fp, det_bboxes), where + (tp, fp) whose elements are 0 and 1. The shape of each array is + (num_scales, m). (det_bboxes) whose will filter those are not + matched by group of gts when processing Open Images evaluation. + The shape is (num_scales, m). + """ + + if not use_legacy_coordinate: + extra_length = 0. + else: + extra_length = 1. + + # an indicator of ignored gts + gt_ignore_inds = np.concatenate( + (np.zeros(gt_bboxes.shape[0], dtype=np.bool), + np.ones(gt_bboxes_ignore.shape[0], dtype=np.bool))) + # stack gt_bboxes and gt_bboxes_ignore for convenience + gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore)) + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of + # a certain scale + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] = 1 + else: + det_areas = ( + det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * ( + det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp, det_bboxes + + if gt_bboxes_group_of is not None and use_group_of: + # if handle group-of boxes, divided gt boxes into two parts: + # non-group-of and group-of.Then calculate ious and ioas through + # non-group-of group-of gts respectively. This only used in + # OpenImages evaluation. 
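+        # Editorial note (added for clarity, not part of the original patch):
+        # 'ious' below is plain IoU against the non-group-of gt boxes, while
+        # 'ioas' uses mode='iof', i.e. intersection over the detection's own
+        # area, against the group-of gt boxes. This follows the Open Images
+        # protocol, where a single group-of box may cover many instances.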
+ assert gt_bboxes_group_of.shape[0] == gt_bboxes.shape[0] + non_group_gt_bboxes = gt_bboxes[~gt_bboxes_group_of] + group_gt_bboxes = gt_bboxes[gt_bboxes_group_of] + num_gts_group = group_gt_bboxes.shape[0] + ious = bbox_overlaps(det_bboxes, non_group_gt_bboxes) + ioas = bbox_overlaps(det_bboxes, group_gt_bboxes, mode='iof') + else: + # if not consider group-of boxes, only calculate ious through gt boxes + ious = bbox_overlaps( + det_bboxes, gt_bboxes, use_legacy_coordinate=use_legacy_coordinate) + ioas = None + + if ious.shape[1] > 0: + # for each det, the max iou with all gts + ious_max = ious.max(axis=1) + # for each det, which gt overlaps most with it + ious_argmax = ious.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = ( + gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + if ious_max[i] >= iou_thr: + matched_gt = ious_argmax[i] + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[k, i] = 1 + else: + fp[k, i] = 1 + # otherwise ignore this detected bbox, tp = 0, fp = 0 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0] + extra_length) * ( + bbox[3] - bbox[1] + extra_length) + if area >= min_area and area < max_area: + fp[k, i] = 1 + else: + # if there is no no-group-of gt bboxes in this image, + # then all det bboxes within area range are false positives. + # Only used in OpenImages evaluation. + if area_ranges == [(None, None)]: + fp[...] = 1 + else: + det_areas = ( + det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * ( + det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + + if ioas is None or ioas.shape[1] <= 0: + return tp, fp, det_bboxes + else: + # The evaluation of group-of TP and FP are done in two stages: + # 1. All detections are first matched to non group-of boxes; true + # positives are determined. + # 2. Detections that are determined as false positives are matched + # against group-of boxes and calculated group-of TP and FP. + # Only used in OpenImages evaluation. 
+ det_bboxes_group = np.zeros( + (num_scales, ioas.shape[1], det_bboxes.shape[1]), dtype=float) + match_group_of = np.zeros((num_scales, num_dets), dtype=bool) + tp_group = np.zeros((num_scales, num_gts_group), dtype=np.float32) + ioas_max = ioas.max(axis=1) + # for each det, which gt overlaps most with it + ioas_argmax = ioas.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + box_is_covered = tp[k] + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + matched_gt = ioas_argmax[i] + if not box_is_covered[i]: + if ioas_max[i] >= ioa_thr: + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + if not tp_group[k, matched_gt]: + tp_group[k, matched_gt] = 1 + match_group_of[k, i] = True + else: + match_group_of[k, i] = True + + if det_bboxes_group[k, matched_gt, -1] < \ + det_bboxes[i, -1]: + det_bboxes_group[k, matched_gt] = \ + det_bboxes[i] + + fp_group = (tp_group <= 0).astype(float) + tps = [] + fps = [] + # concatenate tp, fp, and det-boxes which not matched group of + # gt boxes and tp_group, fp_group, and det_bboxes_group which + # matched group of boxes respectively. + for i in range(num_scales): + tps.append( + np.concatenate((tp[i][~match_group_of[i]], tp_group[i]))) + fps.append( + np.concatenate((fp[i][~match_group_of[i]], fp_group[i]))) + det_bboxes = np.concatenate( + (det_bboxes[~match_group_of[i]], det_bboxes_group[i])) + + tp = np.vstack(tps) + fp = np.vstack(fps) + return tp, fp, det_bboxes + + def get_cls_results(det_results, annotations, class_id): """Get det results and gt information of a certain class. @@ -294,15 +498,38 @@ def get_cls_results(det_results, annotations, class_id): return cls_dets, cls_gts, cls_gts_ignore +def get_cls_group_ofs(annotations, class_id): + """Get `gt_group_of` of a certain class, which is used in Open Images. + + Args: + annotations (list[dict]): Same as `eval_map()`. + class_id (int): ID of a specific class. + + Returns: + list[np.ndarray]: `gt_group_of` of a certain class. + """ + gt_group_ofs = [] + for ann in annotations: + gt_inds = ann['labels'] == class_id + if ann.get('gt_is_group_ofs', None) is not None: + gt_group_ofs.append(ann['gt_is_group_ofs'][gt_inds]) + else: + gt_group_ofs.append(np.empty((0, 1), dtype=np.bool)) + + return gt_group_ofs + + def eval_map(det_results, annotations, scale_ranges=None, iou_thr=0.5, + ioa_thr=None, dataset=None, logger=None, tpfp_fn=None, nproc=4, - use_legacy_coordinate=False): + use_legacy_coordinate=False, + use_group_of=False): """Evaluate mAP of a dataset. Args: @@ -322,6 +549,8 @@ def eval_map(det_results, Default: None. iou_thr (float): IoU threshold to be considered as matched. Default: 0.5. + ioa_thr (float | None): IoA threshold to be considered as matched, + which only used in OpenImages evaluation. Default: None. dataset (list[str] | str | None): Dataset name or dataset classes, there are minor differences in metrics for different datasets, e.g. "voc07", "imagenet_det", etc. Default: None. @@ -338,6 +567,8 @@ def eval_map(det_results, mmdet v1.x. which means width, height should be calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. Default: False. 
+ use_group_of (bool): Whether to use group of when calculate TP and FP, + which only used in OpenImages evaluation. Default: False. Returns: tuple: (mAP, [dict, dict, ...]) @@ -364,20 +595,33 @@ def eval_map(det_results, if tpfp_fn is None: if dataset in ['det', 'vid']: tpfp_fn = tpfp_imagenet + elif dataset in ['oid_challenge', 'oid_v6'] \ + or use_group_of is True: + tpfp_fn = tpfp_openimages else: tpfp_fn = tpfp_default if not callable(tpfp_fn): raise ValueError( f'tpfp_fn has to be a function or None, but got {tpfp_fn}') - + args = [] + if use_group_of: + # used in Open Images Dataset evaluation + gt_group_ofs = get_cls_group_ofs(annotations, i) + args.append(gt_group_ofs) + args.append([use_group_of for _ in range(num_imgs)]) + if ioa_thr is not None: + args.append([ioa_thr for _ in range(num_imgs)]) # compute tp and fp for each image with multiple processes tpfp = pool.starmap( tpfp_fn, zip(cls_dets, cls_gts, cls_gts_ignore, [iou_thr for _ in range(num_imgs)], [area_ranges for _ in range(num_imgs)], - [use_legacy_coordinate for _ in range(num_imgs)])) - tp, fp = tuple(zip(*tpfp)) + [use_legacy_coordinate for _ in range(num_imgs)], *args)) + if use_group_of: + tp, fp, cls_dets = tuple(zip(*tpfp)) + else: + tp, fp = tuple(zip(*tpfp)) # calculate gt number of each scale # ignored gts or gts beyond the specific scale are not counted num_gts = np.zeros(num_scales, dtype=int) diff --git a/mmdet/datasets/__init__.py b/mmdet/datasets/__init__.py index 8a0d2e51b8f..f251d07e174 100644 --- a/mmdet/datasets/__init__.py +++ b/mmdet/datasets/__init__.py @@ -8,6 +8,7 @@ MultiImageMixDataset, RepeatDataset) from .deepfashion import DeepFashionDataset from .lvis import LVISDataset, LVISV1Dataset, LVISV05Dataset +from .openimages import OpenImagesChallengeDataset, OpenImagesDataset from .samplers import DistributedGroupSampler, DistributedSampler, GroupSampler from .utils import (NumClassCheckHook, get_loading_pipeline, replace_ImageToTensor) @@ -22,5 +23,6 @@ 'DistributedSampler', 'build_dataloader', 'ConcatDataset', 'RepeatDataset', 'ClassBalancedDataset', 'WIDERFaceDataset', 'DATASETS', 'PIPELINES', 'build_dataset', 'replace_ImageToTensor', 'get_loading_pipeline', - 'NumClassCheckHook', 'CocoPanopticDataset', 'MultiImageMixDataset' + 'NumClassCheckHook', 'CocoPanopticDataset', 'MultiImageMixDataset', + 'OpenImagesDataset', 'OpenImagesChallengeDataset' ] diff --git a/mmdet/datasets/custom.py b/mmdet/datasets/custom.py index 676872c1fb3..e449150abce 100644 --- a/mmdet/datasets/custom.py +++ b/mmdet/datasets/custom.py @@ -241,7 +241,7 @@ def prepare_train_img(self, idx): return self.pipeline(results) def prepare_test_img(self, idx): - """Get testing data after pipeline. + """Get testing data after pipeline. Args: idx (int): Index of data. diff --git a/mmdet/datasets/openimages.py b/mmdet/datasets/openimages.py new file mode 100644 index 00000000000..517105cc65f --- /dev/null +++ b/mmdet/datasets/openimages.py @@ -0,0 +1,841 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import csv +import json +import os.path as osp +import warnings +from collections import OrderedDict, defaultdict + +import mmcv +import numpy as np +import torch.distributed as dist +from mmcv.runner import get_dist_info +from mmcv.utils import print_log + +from mmdet.core import eval_map +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class OpenImagesDataset(CustomDataset): + """Open Images dataset for detection. 
+ + Args: + label_file (str): File path of the label description file that + maps the classes names in MID format to their short + descriptions. + image_level_ann_file (str): Image level annotation, which is used + in evaluation. + get_supercategory (bool): Whether to get parent class of the + current class. Default: True. + hierarchy_file (str): The file path of the class hierarchy. + Default: None. + get_metas (bool): Whether to get image metas in testing or + validation time. This should be `True` during evaluation. + Default: True. The OpenImages annotations do not have image + metas (width and height of the image), which will be used + during evaluation. We provide two ways to get image metas + in `OpenImagesDataset`: + + - 1. `load from file`: Load image metas from pkl file, which + is suggested to use. We provided a script to get image metas: + `tools/misc/get_image_metas.py`, which need to run + this script before training/testing. Please refer to + `config/openimages/README.md` for more details. + + - 2. `load from pipeline`, which will get image metas during + test time. However, this may reduce the inference speed, + especially when using distribution. + + load_from_file (bool): Whether to get image metas from pkl file. + meta_file (str): File path to get image metas. + filter_labels (bool): Whether filter unannotated classes. + Default: True. + load_image_level_labels (bool): Whether load and consider image + level labels during evaluation. Default: True. + """ + + def __init__(self, + label_file='', + image_level_ann_file='', + get_supercategory=True, + hierarchy_file=None, + get_metas=True, + load_from_file=True, + meta_file='', + filter_labels=True, + load_image_level_labels=True, + **kwargs): + self.cat2label = defaultdict(str) + self.index_dict = {} + # need get `index_dict` before load annotations + class_names = self.get_classes_from_csv(label_file) + super(OpenImagesDataset, self).__init__(**kwargs) + self.CLASSES = class_names + self.image_level_ann_file = image_level_ann_file + self.load_image_level_labels = load_image_level_labels + if get_supercategory is True: + assert hierarchy_file is not None + self.class_label_tree = self.get_relation_matrix(hierarchy_file) + self.get_supercategory = get_supercategory + self.get_metas = get_metas + self.load_from_file = load_from_file + self.meta_file = meta_file + if self.data_root is not None: + if not osp.isabs(self.meta_file): + self.meta_file = osp.join(self.data_root, self.meta_file) + self.filter_labels = filter_labels + self.rank, self.world_size = get_dist_info() + self.temp_img_metas = [] + self.test_img_metas = [] + self.test_img_shapes = [] + self.load_from_pipeline = False if load_from_file else True + + def get_classes_from_csv(self, label_file): + """Get classes name from file. + + Args: + label_file (str): File path of the label description file that + maps the classes names in MID format to their short + descriptions. + + Returns: + list[str]: Class name of OpenImages. + """ + + index_list = [] + classes_names = [] + with open(label_file, 'r') as f: + reader = csv.reader(f) + for line in reader: + self.cat2label[line[0]] = line[1] + classes_names.append(line[1]) + index_list.append(line[0]) + self.index_dict = {index: i for i, index in enumerate(index_list)} + return classes_names + + def load_annotations(self, ann_file): + """Load annotation from annotation file. 
+ + Special described `self.data_infos` (defaultdict[list[dict]]) + in this function: Annotations where item of the defaultdict + indicates an image, each of which has (n) dicts. Keys of dicts are: + + - `bbox` (list): coordinates of the box, in normalized image + coordinates, of shape 4. + - `label` (int): the label id. + - `is_group_of` (bool): Indicates that the box spans a group + of objects (e.g., a bed of flowers or a crowd of people). + - `is_occluded` (bool): Indicates that the object is occluded + by another object in the image. + - `is_truncated` (bool): Indicates that the object extends + beyond the boundary of the image. + - `is_depiction` (bool): Indicates that the object is a + depiction. + - `is_inside` (bool): Indicates a picture taken from the + inside of the object. + + Args: + ann_file (str): CSV style annotation file path. + + Returns: + list[dict]: Data infos where each item of the list + indicates an image. Keys of annotations are: + + - `img_id` (str): Image name. + - `filename` (str): Image name with suffix. + """ + self.ann_infos = defaultdict(list) + data_infos = [] + cp_filename = None + with open(ann_file, 'r') as f: + reader = csv.reader(f) + for i, line in enumerate(reader): + if i == 0: + continue + img_id = line[0] + filename = f'{img_id}.jpg' + label_id = line[2] + assert label_id in self.index_dict + label = int(self.index_dict[label_id]) + bbox = [ + float(line[4]), # xmin + float(line[6]), # ymin + float(line[5]), # xmax + float(line[7]) # ymax + ] + is_occluded = True if int(line[8]) == 1 else False + is_truncated = True if int(line[9]) == 1 else False + is_group_of = True if int(line[10]) == 1 else False + is_depiction = True if int(line[11]) == 1 else False + is_inside = True if int(line[12]) == 1 else False + + self.ann_infos[img_id].append( + dict( + bbox=bbox, + label=label, + is_occluded=is_occluded, + is_truncated=is_truncated, + is_group_of=is_group_of, + is_depiction=is_depiction, + is_inside=is_inside)) + if filename != cp_filename: + data_infos.append(dict(img_id=img_id, filename=filename)) + cp_filename = filename + return data_infos + + def get_ann_info(self, idx): + """Get OpenImages annotation by index. + + Args: + idx (int): Index of data. + + Returns: + dict: Annotation info of specified index. + """ + img_id = self.data_infos[idx]['img_id'] + bboxes = [] + labels = [] + bboxes_ignore = [] + labels_ignore = [] + is_occludeds = [] + is_truncateds = [] + is_group_ofs = [] + is_depictions = [] + is_insides = [] + for obj in self.ann_infos[img_id]: + label = int(obj['label']) + bbox = [ + float(obj['bbox'][0]), + float(obj['bbox'][1]), + float(obj['bbox'][2]), + float(obj['bbox'][3]) + ] + bboxes.append(bbox) + labels.append(label) + + # Other parameters + is_occludeds.append(obj['is_occluded']) + is_truncateds.append(obj['is_truncated']) + is_group_ofs.append(obj['is_group_of']) + is_depictions.append(obj['is_depiction']) + is_insides.append(obj['is_inside']) + if not bboxes: + bboxes = np.zeros((0, 4)) + labels = np.zeros((0, )) + else: + bboxes = np.array(bboxes) + labels = np.array(labels) + if not bboxes_ignore: + bboxes_ignore = np.zeros((0, 4)) + labels_ignore = np.zeros((0, )) + else: + bboxes_ignore = np.array(bboxes_ignore) + labels_ignore = np.array(labels_ignore) + + assert len(is_group_ofs) == len(labels) == len(bboxes) + gt_is_group_ofs = np.array(is_group_ofs, dtype=np.bool) + + # These parameters is not used yet. 
+ is_occludeds = np.array(is_occludeds, dtype=np.bool) + is_truncateds = np.array(is_truncateds, dtype=np.bool) + is_depictions = np.array(is_depictions, dtype=np.bool) + is_insides = np.array(is_insides, dtype=np.bool) + + ann = dict( + bboxes=bboxes.astype(np.float32), + labels=labels.astype(np.int64), + bboxes_ignore=bboxes_ignore.astype(np.float32), + labels_ignore=labels_ignore.astype(np.int64), + gt_is_group_ofs=gt_is_group_ofs, + is_occludeds=is_occludeds, + is_truncateds=is_truncateds, + is_depictions=is_depictions, + is_insides=is_insides) + + return ann + + def get_meta_from_file(self, meta_file=''): + """Get image metas from pkl file.""" + assert meta_file.endswith('pkl'), 'File name must be pkl suffix' + metas = mmcv.load(meta_file) + assert len(metas) == len(self) + for i in range(len(metas)): + file_name = metas[i]['filename'].split('/')[-1] + img_info = self.data_infos[i].get('img_info', None) + if img_info is not None: + assert file_name == img_info['filename'].split('/')[-1] + else: + assert file_name == self.data_infos[i]['filename'] + hw = metas[i]['ori_shape'][:2] + self.test_img_shapes.append(hw) + + def get_meta_from_pipeline(self, results): + """Get image metas from pipeline.""" + self.temp_img_metas.extend(results['img_metas']) + if dist.is_available() and self.world_size > 1: + from mmdet.apis.test import collect_results_cpu + + self.test_img_metas = collect_results_cpu(self.temp_img_metas, + len(self)) + else: + self.test_img_metas = self.temp_img_metas + + def get_img_shape(self, metas): + """Set images original shape into data_infos.""" + assert len(metas) == len(self) + for i in range(len(metas)): + file_name = metas[i].data['filename'].split('/')[-1] + img_info = self.data_infos[i].get('img_info', None) + if img_info is not None: + assert file_name == img_info['filename'].split('/')[-1] + else: + assert file_name == self.data_infos[i]['filename'] + hw = metas[i].data['ori_shape'][:2] + self.test_img_shapes.append(hw) + + def prepare_test_img(self, idx): + """Get testing data after pipeline.""" + img_info = self.data_infos[idx] + results = dict(img_info=img_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + results = self.pipeline(results) + if self.get_metas and self.load_from_pipeline: + self.get_meta_from_pipeline(results) + return results + + def _filter_imgs(self, min_size=32): + """Filter images too small.""" + if self.filter_empty_gt: + warnings.warn('OpenImageDatasets does not support ' + 'filtering empty gt images.') + valid_inds = [i for i in range(len(self))] + return valid_inds + + def _set_group_flag(self): + """Set flag according to image aspect ratio.""" + self.flag = np.zeros(len(self), dtype=np.uint8) + # TODO: set flag without width and height + + def get_relation_matrix(self, hierarchy_file): + """Get hierarchy for classes. + + Args: + hierarchy_file (sty): File path to the hierarchy for classes. + + Returns: + ndarray: The matrix of the corresponding relationship between + the parent class and the child class, of shape + (class_num, class_num). 
+ """ + + assert hierarchy_file.endswith('json') + if self.data_root is not None: + if not osp.isabs(hierarchy_file): + hierarchy_file = osp.join(self.data_root, hierarchy_file) + with open(hierarchy_file, 'r') as f: + hierarchy = json.load(f) + class_num = len(self.CLASSES) + class_label_tree = np.eye(class_num, class_num) + class_label_tree = self._convert_hierarchy_tree( + hierarchy, class_label_tree) + return class_label_tree + + def _convert_hierarchy_tree(self, + hierarchy_map, + class_label_tree, + parents=[], + get_all_parents=True): + """Get matrix of the corresponding relationship between the parent + class and the child class. + + Args: + hierarchy_map (dict): Including label name and corresponding + subcategory. Keys of dicts are: + + - `LabeName` (str): Name of the label. + - `Subcategory` (dict | list): Corresponding subcategory(ies). + class_label_tree (ndarray): The matrix of the corresponding + relationship between the parent class and the child class, + of shape (class_num, class_num). + parents (list): Corresponding parent class. + get_all_parents (bool): Whether get all parent names. + Default: True + + Returns: + ndarray: The matrix of the corresponding relationship between + the parent class and the child class, of shape + (class_num, class_num). + """ + + if 'Subcategory' in hierarchy_map: + for node in hierarchy_map['Subcategory']: + if 'LabelName' in node: + children_name = node['LabelName'] + children_index = self.index_dict[children_name] + children = [children_index] + else: + continue + if len(parents) > 0: + for parent_index in parents: + if get_all_parents: + children.append(parent_index) + class_label_tree[children_index, parent_index] = 1 + + class_label_tree = self._convert_hierarchy_tree( + node, class_label_tree, parents=children) + + return class_label_tree + + def add_supercategory_ann(self, annotations): + """Add parent classes of the corresponding class of the ground truth + bboxes.""" + for i, ann in enumerate(annotations): + assert len(ann['labels']) == len(ann['bboxes']) == \ + len(ann['gt_is_group_ofs']) + gt_bboxes = [] + gt_is_group_ofs = [] + gt_labels = [] + for j in range(len(ann['labels'])): + label = ann['labels'][j] + bbox = ann['bboxes'][j] + is_group = ann['gt_is_group_ofs'][j] + label = np.where(self.class_label_tree[label])[0] + if len(label) > 1: + for k in range(len(label)): + gt_bboxes.append(bbox) + gt_is_group_ofs.append(is_group) + gt_labels.append(label[k]) + else: + gt_bboxes.append(bbox) + gt_is_group_ofs.append(is_group) + gt_labels.append(label[0]) + annotations[i] = dict( + bboxes=np.array(gt_bboxes).astype(np.float32), + labels=np.array(gt_labels).astype(np.int64), + bboxes_ignore=ann['bboxes_ignore'], + gt_is_group_ofs=np.array(gt_is_group_ofs).astype(np.bool)) + + return annotations + + def process_results(self, det_results, annotations, + image_level_annotations): + """Process results of the corresponding class of the detection bboxes. + + Note: It will choose to do the following two processing according to + the parameters: + + 1. Whether to add parent classes of the corresponding class of the + detection bboxes. + + 2. Whether to ignore the classes that unannotated on that image. 
+ """ + if image_level_annotations is not None: + assert len(annotations) == \ + len(image_level_annotations) == \ + len(det_results) + else: + assert len(annotations) == len(det_results) + for i in range(len(det_results)): + results = copy.deepcopy(det_results[i]) + valid_classes = np.where( + np.array([[bbox.shape[0]] for bbox in det_results[i]]) != 0)[0] + if image_level_annotations is not None: + labels = annotations[i]['labels'] + image_level_labels = \ + image_level_annotations[i]['image_level_labels'] + allowed_labeles = np.unique( + np.append(labels, image_level_labels)) + else: + allowed_labeles = np.unique(annotations[i]['labels']) + + for valid_class in valid_classes: + det_cls = np.where(self.class_label_tree[valid_class])[0] + for index in det_cls: + if index in allowed_labeles and \ + index != valid_class and \ + self.get_supercategory: + det_results[i][index] = \ + np.concatenate((det_results[i][index], + results[valid_class])) + elif index not in allowed_labeles and self.filter_labels: + # Remove useless parts + det_results[i][index] = np.empty( + (0, 5)).astype(np.float32) + return det_results + + def load_image_label_from_csv(self, image_level_ann_file): + """Load image level annotations from csv style ann_file. + + Args: + image_level_ann_file (str): CSV style image level annotation + file path. + + Returns: + defaultdict[list[dict]]: Annotations where item of the defaultdict + indicates an image, each of which has (n) dicts. + Keys of dicts are: + + - `image_level_label` (int): Label id. + - `confidence` (float): Labels that are human-verified to be + present in an image have confidence = 1 (positive labels). + Labels that are human-verified to be absent from an image + have confidence = 0 (negative labels). Machine-generated + labels have fractional confidences, generally >= 0.5. + The higher the confidence, the smaller the chance for + the label to be a false positive. + """ + + item_lists = defaultdict(list) + with open(image_level_ann_file, 'r') as f: + reader = csv.reader(f) + for i, line in enumerate(reader): + if i == 0: + continue + img_id = line[0] + item_lists[img_id].append( + dict( + image_level_label=int(self.index_dict[line[2]]), + confidence=float(line[3]))) + return item_lists + + def get_image_level_ann(self, image_level_ann_file): + """Get OpenImages annotation by index. + + Args: + image_level_ann_file (str): CSV style image level annotation + file path. + + Returns: + dict: Annotation info of specified index. 
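        Example (editorial note, not part of the original patch):

            The value is a list with one dict per image in the dataset,
            e.g. for an image carrying one positive and one negative
            human-verified label::

                [dict(image_level_labels=np.array([2, 3], dtype=np.int64),
                      confidences=np.array([1., 0.], dtype=np.float32))]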
+ """ + + item_lists = self.load_image_label_from_csv(image_level_ann_file) + image_level_annotations = [] + for i in range(len(self)): + img_info = self.data_infos[i].get('img_info', None) + if img_info is not None: + # for Open Images Challenges + img_id = img_info['filename'].split('/')[-1][:-4] + else: + # for Open Images v6 + img_id = self.data_infos[i]['img_id'] + item_list = item_lists.get(img_id, None) + if item_list is not None: + image_level_labels = [] + confidences = [] + for obj in item_list: + image_level_label = int(obj['image_level_label']) + confidence = float(obj['confidence']) + + image_level_labels.append(image_level_label) + confidences.append(confidence) + + if not image_level_labels: + image_level_labels = np.zeros((0, )) + confidences = np.zeros((0, )) + else: + image_level_labels = np.array(image_level_labels) + confidences = np.array(confidences) + else: + image_level_labels = np.zeros((0, )) + confidences = np.zeros((0, )) + ann = dict( + image_level_labels=image_level_labels.astype(np.int64), + confidences=confidences.astype(np.float32)) + image_level_annotations.append(ann) + + return image_level_annotations + + def denormalize_gt_bboxes(self, annotations): + """Convert ground truth bboxes from relative position to absolute + position. + + Only used in evaluating time. + """ + assert len(self.test_img_shapes) == len(annotations) + for i in range(len(annotations)): + h, w = self.test_img_shapes[i] + annotations[i]['bboxes'][:, 0::2] *= w + annotations[i]['bboxes'][:, 1::2] *= h + return annotations + + def evaluate(self, + results, + metric='mAP', + logger=None, + iou_thr=0.5, + ioa_thr=0.5, + scale_ranges=None, + denorm_gt_bbox=True, + use_group_of=True): + """Evaluate in OpenImages. + + Args: + results (list[list | tuple]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. Option is + 'mAP'. Default: 'mAP'. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + iou_thr (float | list[float]): IoU threshold. Default: 0.5. + ioa_thr (float | list[float]): IoA threshold. Default: 0.5. + scale_ranges (list[tuple], optional): Scale ranges for evaluating + mAP. If not specified, all bounding boxes would be included in + evaluation. Default: None + denorm_gt_bbox (bool): Whether to denorm ground truth bboxes from + relative position to absolute position. Default: True + use_group_of (bool): Whether consider group of groud truth bboxes + during evaluating. Default: True. + + Returns: + dict[str, float]: AP metrics. 
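        Example (editorial sketch, not part of the original patch; file
        paths and the test pipeline are placeholders)::

            dataset = OpenImagesDataset(
                ann_file='annotations/bbox-annotations.csv',
                label_file='annotations/class-descriptions.csv',
                image_level_ann_file='annotations/image-labels.csv',
                hierarchy_file='annotations/class-hierarchy.json',
                meta_file='validation-image-metas.pkl',
                pipeline=test_pipeline)
            results = ...  # per image: one (n, 5) array per class
            metrics = dataset.evaluate(results, iou_thr=0.5, ioa_thr=0.5)
            # metrics contains 'AP50' and 'mAP'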
+ """ + + if not isinstance(metric, str): + assert len(metric) == 1 + metric = metric[0] + allowed_metrics = ['mAP'] + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + annotations = [self.get_ann_info(i) for i in range(len(self))] + + if self.load_image_level_labels: + image_level_annotations = \ + self.get_image_level_ann(self.image_level_ann_file) + else: + image_level_annotations = None + + # load metas from file + if self.get_metas and self.load_from_file: + self.get_meta_from_file(self.meta_file) + # load metas from pipeline + else: + self.get_img_shape(self.test_img_metas) + + if len(self.test_img_shapes) > len(self): + self.test_img_shapes = self.test_img_shapes[:len(self)] + + if denorm_gt_bbox: + annotations = self.denormalize_gt_bboxes(annotations) + + # Reset test_image_metas, temp_image_metas and test_img_shapes + # to avoid potential error + self.temp_img_metas = [] + self.test_img_shapes = [] + self.test_img_metas = [] + if self.get_supercategory: + annotations = self.add_supercategory_ann(annotations) + + results = self.process_results(results, annotations, + image_level_annotations) + if use_group_of: + assert ioa_thr is not None, \ + 'ioa_thr must have value when using group_of in evaluation.' + + eval_results = OrderedDict() + iou_thrs = [iou_thr] if isinstance(iou_thr, float) else iou_thr + ioa_thrs = [ioa_thr] if isinstance(ioa_thr, float) or ioa_thr is None \ + else ioa_thr + + # get dataset type + if len(self.CLASSES) == 500: + ds_name = 'oid_challenge' + elif len(self.CLASSES) == 601: + ds_name = 'oid_v6' + else: + ds_name = self.CLASSES + warnings.warn('Cannot infer dataset type from the length of the ' + 'classes. Set `oid_v6` as dataset type.') + + if metric == 'mAP': + assert isinstance(iou_thrs, list) and isinstance(ioa_thrs, list) + assert len(ioa_thrs) == len(iou_thrs) + mean_aps = [] + for iou_thr, ioa_thr in zip(iou_thrs, ioa_thrs): + print_log(f'\n{"-" * 15}iou_thr, ioa_thr: {iou_thr}, {ioa_thr}' + f'{"-" * 15}') + mean_ap, _ = eval_map( + results, + annotations, + scale_ranges=scale_ranges, + iou_thr=iou_thr, + ioa_thr=ioa_thr, + dataset=ds_name, + logger=logger, + use_group_of=use_group_of) + mean_aps.append(mean_ap) + eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3) + eval_results['mAP'] = sum(mean_aps) / len(mean_aps) + return eval_results + + +@DATASETS.register_module() +class OpenImagesChallengeDataset(OpenImagesDataset): + """Open Images Challenge dataset for detection.""" + + def __init__(self, **kwargs): + super(OpenImagesChallengeDataset, self).__init__(**kwargs) + + def get_classes_from_csv(self, label_file): + """Get classes name from file. + + Args: + label_file (str): File path of the label description file that + maps the classes names in MID format to their short + descriptions. + + Returns: + list: Class name of OpenImages. 
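        Example (editorial sketch, not part of the original patch; rows
        are hypothetical):

            Each row of the challenge label file is
            ``MID,short description,1-based label id``, e.g.::

                /m/000000,Sports equipment,1
                /m/000001,Ball,2

            The returned class names are sorted by label id, and
            ``self.index_dict`` maps each MID to ``label id - 1``.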
+ """ + + label_list = [] + id_list = [] + with open(label_file, 'r') as f: + reader = csv.reader(f) + for line in reader: + label_name = line[0] + label_id = int(line[2]) + + label_list.append(line[1]) + id_list.append(label_id) + self.index_dict[label_name] = label_id - 1 + + indexes = np.argsort(id_list) + classes_names = [] + for index in indexes: + classes_names.append(label_list[index]) + return classes_names + + def load_annotations(self, ann_file): + """Load annotation from annotation file.""" + assert ann_file.endswith('txt') + with open(ann_file) as f: + lines = f.readlines() + i = 0 + ann_infos = [] + while i < len(lines): + bboxes = [] + labels = [] + is_group_ofs = [] + filename = lines[i].rstrip() + i += 2 + img_gt_size = int(lines[i]) + i += 1 + for j in range(img_gt_size): + sp = lines[i + j].split() + bboxes.append( + [float(sp[1]), + float(sp[2]), + float(sp[3]), + float(sp[4])]) + labels.append(int(sp[0]) - 1) # labels begin from 1 + is_group_ofs.append(True if int(sp[5]) == 1 else False) + i += img_gt_size + + gt_bboxes = np.array(bboxes, dtype=np.float32) + gt_labels = np.array(labels, dtype=np.int64) + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + gt_is_group_ofs = np.array(is_group_ofs, dtype=np.bool) + + img_info = dict(filename=filename) + ann_info = dict( + bboxes=gt_bboxes, + labels=gt_labels, + bboxes_ignore=gt_bboxes_ignore, + gt_is_group_ofs=gt_is_group_ofs) + ann_infos.append(dict(img_info=img_info, ann_info=ann_info)) + + return ann_infos + + def prepare_train_img(self, idx): + """Get training data and annotations after pipeline.""" + ann_info = self.data_infos[idx] + results = dict( + img_info=ann_info['img_info'], + ann_info=ann_info['ann_info'], + ) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + def prepare_test_img(self, idx): + """Get testing data after pipeline.""" + ann_info = self.data_infos[idx] + results = dict(img_info=ann_info['img_info']) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + + results = self.pipeline(results) + if self.get_metas and self.load_from_pipeline: + self.get_meta_from_pipeline(results) + return results + + def get_relation_matrix(self, hierarchy_file): + """Get hierarchy for classes. + + Args: + hierarchy_file (str): File path to the hierarchy for classes. + + Returns: + ndarray: The matrix of the corresponding + relationship between the parent class and the child class, + of shape (class_num, class_num). + """ + + assert hierarchy_file.endswith('np') + class_label_tree = np.load(hierarchy_file, allow_pickle=True) + return class_label_tree[1:, 1:] + + def get_ann_info(self, idx): + """Get OpenImages annotation by index. + + Args: + idx (int): Index of data. + + Returns: + dict: Annotation info of specified index. + """ + # avoid some potential error + data_infos = copy.deepcopy(self.data_infos[idx]['ann_info']) + return data_infos + + def load_image_label_from_csv(self, image_level_ann_file): + """Load image level annotations from csv style ann_file. + + Args: + image_level_ann_file (str): CSV style image level annotation + file path. + + Returns: + defaultdict[list[dict]]: Annotations where item of the defaultdict + indicates an image, each of which has (n) dicts. + Keys of dicts are: + + - `image_level_label` (int): of shape 1. + - `confidence` (float): of shape 1. 
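        Example (editorial sketch, not part of the original patch; rows
        are hypothetical):

            The expected layout is ``ImageID,LabelName,Confidence`` plus a
            header row, e.g.::

                color,/m/000001,1
                color,/m/000000,0

            Each row becomes one entry of ``item_lists['color']`` with the
            MID converted to its integer label via ``self.index_dict``.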
+ """ + + item_lists = defaultdict(list) + with open(image_level_ann_file, 'r') as f: + reader = csv.reader(f) + i = -1 + for line in reader: + i += 1 + if i == 0: + continue + else: + img_id = line[0] + label_id = line[1] + assert label_id in self.index_dict + image_level_label = int(self.index_dict[label_id]) + confidence = float(line[2]) + item_lists[img_id].append( + dict( + image_level_label=image_level_label, + confidence=confidence)) + return item_lists diff --git a/mmdet/datasets/pipelines/loading.py b/mmdet/datasets/pipelines/loading.py index 284fe28444c..fc68fc3d22f 100644 --- a/mmdet/datasets/pipelines/loading.py +++ b/mmdet/datasets/pipelines/loading.py @@ -213,6 +213,9 @@ class LoadAnnotations: annotation. Default: False. poly2mask (bool): Whether to convert the instance masks from polygons to bitmaps. Default: True. + denorm_bbox (bool): Whether to convert bbox from relative value to + absolute value. Only used in OpenImage Dataset. + Default: False. file_client_args (dict): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Defaults to ``dict(backend='disk')``. @@ -224,12 +227,14 @@ def __init__(self, with_mask=False, with_seg=False, poly2mask=True, + denorm_bbox=False, file_client_args=dict(backend='disk')): self.with_bbox = with_bbox self.with_label = with_label self.with_mask = with_mask self.with_seg = with_seg self.poly2mask = poly2mask + self.denorm_bbox = denorm_bbox self.file_client_args = file_client_args.copy() self.file_client = None @@ -246,11 +251,24 @@ def _load_bboxes(self, results): ann_info = results['ann_info'] results['gt_bboxes'] = ann_info['bboxes'].copy() + if self.denorm_bbox: + h, w = results['img_shape'][:2] + bbox_num = results['gt_bboxes'].shape[0] + if bbox_num != 0: + results['gt_bboxes'][:, 0::2] *= w + results['gt_bboxes'][:, 1::2] *= h + results['gt_bboxes'] = results['gt_bboxes'].astype(np.float32) + gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) if gt_bboxes_ignore is not None: results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy() results['bbox_fields'].append('gt_bboxes_ignore') results['bbox_fields'].append('gt_bboxes') + + gt_is_group_ofs = ann_info.get('gt_is_group_ofs', None) + if gt_is_group_ofs is not None: + results['gt_is_group_ofs'] = gt_is_group_ofs.copy() + return results def _load_labels(self, results): diff --git a/tests/test_data/test_datasets/test_openimages_dataset.py b/tests/test_data/test_datasets/test_openimages_dataset.py new file mode 100644 index 00000000000..12b2e47d593 --- /dev/null +++ b/tests/test_data/test_datasets/test_openimages_dataset.py @@ -0,0 +1,348 @@ +import csv +import os.path as osp +import tempfile + +import mmcv +import numpy as np +import pytest + +from mmdet.datasets import OpenImagesChallengeDataset, OpenImagesDataset + + +def _create_ids_error_oid_csv( + label_file, + fake_csv_file, +): + label_description = ['/m/000002', 'Football'] + with open(label_file, 'w') as f: + f_csv = csv.writer(f) + f_csv.writerow(label_description) + + header = [ + 'ImageID', 'Source', 'LabelName', 'Confidence', 'XMin', 'XMax', 'YMin', + 'YMax', 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', + 'IsInside' + ] + annotations = [[ + 'color', 'xclick', '/m/000002', '1', '0.022673031', '0.9642005', + '0.07103825', '0.80054647', '0', '0', '0', '0', '0' + ], + [ + '000595fe6fee6369', 'xclick', '/m/000000', '1', '0', + '1', '0', '1', '0', '0', '1', '0', '0' + ]] + with open(fake_csv_file, 'w') as f: + f_csv = csv.writer(f) + f_csv.writerow(header) + 
f_csv.writerows(annotations) + + +def _create_oid_style_ann(label_file, csv_file, label_level_file): + label_description = [['/m/000000', 'Sports equipment'], + ['/m/000001', 'Ball'], ['/m/000002', 'Football'], + ['/m/000004', 'Bicycle']] + with open(label_file, 'w') as f: + f_csv = csv.writer(f) + f_csv.writerows(label_description) + + header = [ + 'ImageID', 'Source', 'LabelName', 'Confidence', 'XMin', 'XMax', 'YMin', + 'YMax', 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', + 'IsInside' + ] + annotations = [ + [ + 'color', 'xclick', '/m/000002', 1, 0.0333333, 0.1, 0.0333333, 0.1, + 0, 0, 1, 0, 0 + ], + [ + 'color', 'xclick', '/m/000002', 1, 0.1, 0.166667, 0.1, 0.166667, 0, + 0, 0, 0, 0 + ], + ] + with open(csv_file, 'w') as f: + f_csv = csv.writer(f) + f_csv.writerow(header) + f_csv.writerows(annotations) + + header = ['ImageID', 'Source', 'LabelName', 'Confidence'] + annotations = [['color', 'xclick', '/m/000002', '1'], + ['color', 'xclick', '/m/000004', '0']] + with open(label_level_file, 'w') as f: + f_csv = csv.writer(f) + f_csv.writerow(header) + f_csv.writerows(annotations) + + +def _create_hierarchy_json(hierarchy_name): + fake_hierarchy = \ + {'LabelName': '/m/0bl9f', # entity label + 'Subcategory': [ + { + 'LabelName': '/m/000000', + 'Subcategory': + [ + {'LabelName': '/m/000001', + 'Subcategory': + [ + { + 'LabelName': '/m/000002' + } + ] + }, + { + 'LabelName': '/m/000004' + } + ] + } + ] + } + + mmcv.dump(fake_hierarchy, hierarchy_name) + + +def _create_hierarchy_np(hierarchy_name): + fake_hierarchy = np.array([[0, 1, 0, 0, 0], [0, 1, 1, 0, + 0], [0, 1, 1, 1, 0], + [0, 1, 0, 0, 1], [0, 0, 0, 0, 0]]) + with open(hierarchy_name, 'wb') as f: + np.save(f, fake_hierarchy) + + +def _create_dummy_results(): + boxes = [ + np.zeros((0, 5)), + np.zeros((0, 5)), + np.array([[10, 10, 15, 15, 1.0], [15, 15, 30, 30, 0.98], + [10, 10, 25, 25, 0.98], [28, 28, 35, 35, 0.97], + [30, 30, 51, 51, 0.96], [100, 110, 120, 130, 0.15]]), + np.array([[30, 30, 50, 50, 0.51]]), + ] + return [boxes] + + +def _creat_oid_challenge_style_ann(txt_file, label_file, label_level_file): + bboxes = [ + 'validation/color.jpg\n', + '4 29\n', + '2\n', + '1 0.0333333 0.1 0.0333333 0.1 1\n', + '1 0.1 0.166667 0.1 0.166667 0\n', + ] + + with open(txt_file, 'w') as f: + f.writelines(bboxes) + f.close() + + label_description = [['/m/000000', 'Sports equipment', 1], + ['/m/000001', 'Ball', 2], + ['/m/000002', 'Football', 3], + ['/m/000004', 'Bicycle', 4]] + with open(label_file, 'w') as f: + f_csv = csv.writer(f) + f_csv.writerows(label_description) + + header = ['ImageID', 'LabelName', 'Confidence'] + annotations = [['color', '/m/000001', '1'], ['color', '/m/000000', '0']] + with open(label_level_file, 'w') as f: + f_csv = csv.writer(f) + f_csv.writerow(header) + f_csv.writerows(annotations) + + +def _create_metas(meta_file): + + fake_meta = [{ + 'filename': 'data/OpenImages/OpenImages/validation/color.jpg', + 'ori_shape': (300, 300, 3) + }] + mmcv.dump(fake_meta, meta_file) + + +def test_oid_annotation_ids_unique(): + # create fake ann files + tmp_dir = tempfile.TemporaryDirectory() + fake_label_file = osp.join(tmp_dir.name, 'fake_label.csv') + fake_ann_file = osp.join(tmp_dir.name, 'fake_ann.csv') + _create_ids_error_oid_csv(fake_label_file, fake_ann_file) + + # test annotation ids not unique error + with pytest.raises(AssertionError): + OpenImagesDataset( + ann_file=fake_ann_file, label_file=fake_label_file, pipeline=[]) + tmp_dir.cleanup() + + +def test_openimages_dataset(): + # create fake ann files + 
tmp_dir = tempfile.TemporaryDirectory() + label_file = osp.join(tmp_dir.name, 'label_file.csv') + ann_file = osp.join(tmp_dir.name, 'ann_file.csv') + label_level_file = osp.join(tmp_dir.name, 'label_level_file.csv') + _create_oid_style_ann(label_file, ann_file, label_level_file) + + hierarchy_json = osp.join(tmp_dir.name, 'hierarchy.json') + _create_hierarchy_json(hierarchy_json) + + # test whether hierarchy_file is not None when set + # get_parent_classes is True + with pytest.raises(AssertionError): + OpenImagesDataset( + ann_file=ann_file, + label_file=label_file, + image_level_ann_file=label_level_file, + pipeline=[]) + + dataset = OpenImagesDataset( + ann_file=ann_file, + label_file=label_file, + image_level_ann_file=label_level_file, + hierarchy_file=hierarchy_json, + pipeline=[]) + ann = dataset.get_ann_info(0) + # two legal detection bboxes with `group_of` parameter + assert ann['bboxes'].shape[0] == ann['labels'].shape[0] == \ + ann['gt_is_group_ofs'].shape[0] == 2 + + # test load metas from pipeline + img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) + test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(128, 128), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) + ] + dataset = OpenImagesDataset( + ann_file=ann_file, + img_prefix='tests/data', + label_file=label_file, + image_level_ann_file=label_level_file, + load_from_file=False, + hierarchy_file=hierarchy_json, + pipeline=test_pipeline) + dataset.prepare_test_img(0) + assert len(dataset.test_img_metas) == 1 + result = _create_dummy_results() + dataset.evaluate(result) + + # test get hierarchy for classes + hierarchy_json = osp.join(tmp_dir.name, 'hierarchy.json') + _create_hierarchy_json(hierarchy_json) + + # test with hierarchy file wrong suffix + with pytest.raises(AssertionError): + fake_path = osp.join(tmp_dir.name, 'hierarchy.csv') + dataset.get_relation_matrix(fake_path) + + # test load hierarchy file succseefully + hierarchy = dataset.get_relation_matrix(hierarchy_json) + hierarchy_gt = np.array([[1, 0, 0, 0], [1, 1, 0, 0], [1, 1, 1, 0], + [1, 0, 0, 1]]) + assert np.equal(hierarchy, hierarchy_gt).all() + + # test evaluation + # create fake metas + meta_file = osp.join(tmp_dir.name, 'meta.pkl') + _create_metas(meta_file) + + dataset = OpenImagesDataset( + ann_file=ann_file, + label_file=label_file, + image_level_ann_file=label_level_file, + hierarchy_file=hierarchy_json, + meta_file=meta_file, + pipeline=[]) + # test evaluation with using group_of, adding father classes to + # GT and annotations, and considering image_level_image, + # In the first label (Sports equipment): tp = [0, 1, 0, 0, 1], + # fp = [1, 0, 1, 1, 0] + # In the second label (Ball), tp = [0, 1, 0, 1], fp = [1, 0, 1, 0]. + # In the third label (Football), tp = [0, 1, 0, 1], fp = [1, 0, 1, 0]. + # In the forth label (Bicycle), tp = [0], fp = [1]. 
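    # (Editorial note, not part of the original patch.) The tp/fp vectors
    # above are shorter than the raw number of detections because, with
    # `use_group_of=True`, every detection whose IoA with a `group_of`
    # ground-truth box exceeds `ioa_thr` is merged into a single detection
    # for that box; `test_tpfp_openimages` below exercises the same
    # behaviour at the tp/fp level.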
+ result = _create_dummy_results() + parsed_results = dataset.evaluate(result) + assert np.isclose(parsed_results['mAP'], 0.8333, 1e-4) + + dataset = OpenImagesDataset( + ann_file=ann_file, + label_file=label_file, + load_image_level_labels=False, + image_level_ann_file=label_level_file, + hierarchy_file=hierarchy_json, + meta_file=meta_file, + pipeline=[]) + + # test evaluation with using group_of, adding father classes to + # GT and annotations, and not considering image_level_image, + # In the first label (Sports equipment): tp = [0, 1, 0, 0, 1], + # fp = [1, 0, 1, 1, 0] + # In the second label (Ball), tp = [0, 1, 0, 1], fp = [1, 0, 1, 0]. + # In the third label (Football), tp = [0, 1, 0, 1], fp = [1, 0, 1, 0]. + # In the forth label (Bicycle), tp = [], fp = []. + result = _create_dummy_results() + parsed_results = dataset.evaluate(result) + assert np.isclose(parsed_results['mAP'], 0.8333, 1e-4) + tmp_dir.cleanup() + + +def test_openimages_challenge_dataset(): + # create fake ann files + tmp_dir = tempfile.TemporaryDirectory() + ann_file = osp.join(tmp_dir.name, 'ann_file.txt') + label_file = osp.join(tmp_dir.name, 'label_file.csv') + label_level_file = osp.join(tmp_dir.name, 'label_level_file.csv') + _creat_oid_challenge_style_ann(ann_file, label_file, label_level_file) + + dataset = OpenImagesChallengeDataset( + ann_file=ann_file, + label_file=label_file, + load_image_level_labels=False, + get_supercategory=False, + pipeline=[]) + ann = dataset.get_ann_info(0) + + # two legal detection bboxes with `group_of` parameter + assert ann['bboxes'].shape[0] == ann['labels'].shape[0] == \ + ann['gt_is_group_ofs'].shape[0] == 2 + + dataset.prepare_train_img(0) + dataset.prepare_test_img(0) + + meta_file = osp.join(tmp_dir.name, 'meta.pkl') + _create_metas(meta_file) + + result = _create_dummy_results() + with pytest.raises(AssertionError): + fake_json = osp.join(tmp_dir.name, 'hierarchy.json') + dataset = OpenImagesChallengeDataset( + ann_file=ann_file, + label_file=label_file, + image_level_ann_file=label_level_file, + hierarchy_file=fake_json, + meta_file=meta_file, + pipeline=[]) + dataset.evaluate(result) + + hierarchy_file = osp.join(tmp_dir.name, 'hierarchy.np') + _create_hierarchy_np(hierarchy_file) + dataset = OpenImagesChallengeDataset( + ann_file=ann_file, + label_file=label_file, + image_level_ann_file=label_level_file, + hierarchy_file=hierarchy_file, + meta_file=meta_file, + pipeline=[]) + dataset.evaluate(result) + tmp_dir.cleanup() diff --git a/tests/test_metrics/test_mean_ap.py b/tests/test_metrics/test_mean_ap.py index e5fe3cdf2f4..5136a92766a 100644 --- a/tests/test_metrics/test_mean_ap.py +++ b/tests/test_metrics/test_mean_ap.py @@ -1,6 +1,7 @@ import numpy as np -from mmdet.core.evaluation.mean_ap import eval_map, tpfp_default, tpfp_imagenet +from mmdet.core.evaluation.mean_ap import (eval_map, tpfp_default, + tpfp_imagenet, tpfp_openimages) det_bboxes = np.array([ [0, 0, 10, 10], @@ -85,3 +86,82 @@ def test_eval_map(): assert 0.291 < mean_ap < 0.293 eval_map(det_results, annotations, use_legacy_coordinate=False) assert 0.291 < mean_ap < 0.293 + + +def test_tpfp_openimages(): + + det_bboxes = np.array([[10, 10, 15, 15, 1.0], [15, 15, 30, 30, 0.98], + [10, 10, 25, 25, 0.98], [28, 28, 35, 35, 0.97], + [30, 30, 51, 51, 0.96], [100, 110, 120, 130, 0.15]]) + gt_bboxes = np.array([[10., 10., 30., 30.], [30., 30., 50., 50.]]) + gt_groups_of = np.array([True, False], dtype=np.bool) + gt_ignore = np.zeros((0, 4)) + + # Open Images evaluation using group of. 
+ result = tpfp_openimages( + det_bboxes, + gt_bboxes, + gt_bboxes_ignore=gt_ignore, + gt_bboxes_group_of=gt_groups_of, + use_group_of=True, + ioa_thr=0.5) + + tp = result[0] + fp = result[1] + cls_dets = result[2] + + assert tp.shape == (1, 4) + assert fp.shape == (1, 4) + assert cls_dets.shape == (4, 5) + + assert (tp == np.array([[0, 1, 0, 1]])).all() + assert (fp == np.array([[1, 0, 1, 0]])).all() + cls_dets_gt = np.array([[28., 28., 35., 35., 0.97], + [30., 30., 51., 51., 0.96], + [100., 110., 120., 130., 0.15], + [10., 10., 15., 15., 1.]]) + assert (cls_dets == cls_dets_gt).all() + + # Open Images evaluation not using group of. + result = tpfp_openimages( + det_bboxes, + gt_bboxes, + gt_bboxes_ignore=gt_ignore, + gt_bboxes_group_of=gt_groups_of, + use_group_of=False, + ioa_thr=0.5) + tp = result[0] + fp = result[1] + cls_dets = result[2] + assert tp.shape == (1, 6) + assert fp.shape == (1, 6) + assert cls_dets.shape == (6, 5) + + # Open Images evaluation using group of, and gt is all group of bboxes. + gt_groups_of = np.array([True, True], dtype=np.bool) + result = tpfp_openimages( + det_bboxes, + gt_bboxes, + gt_bboxes_ignore=gt_ignore, + gt_bboxes_group_of=gt_groups_of, + use_group_of=True, + ioa_thr=0.5) + tp = result[0] + fp = result[1] + cls_dets = result[2] + assert tp.shape == (1, 3) + assert fp.shape == (1, 3) + assert cls_dets.shape == (3, 5) + + # Open Images evaluation with empty gt. + gt_bboxes = np.zeros((0, 4)) + gt_groups_of = np.empty((0)) + result = tpfp_openimages( + det_bboxes, + gt_bboxes, + gt_bboxes_ignore=gt_ignore, + gt_bboxes_group_of=gt_groups_of, + use_group_of=True, + ioa_thr=0.5) + fp = result[1] + assert (fp == np.array([[1, 1, 1, 1, 1, 1]])).all() diff --git a/tools/misc/get_image_metas.py b/tools/misc/get_image_metas.py new file mode 100644 index 00000000000..a9957d9d856 --- /dev/null +++ b/tools/misc/get_image_metas.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Get test image metas on a specific dataset. + +Here is an example to run this script. + +Example: + python tools/misc/get_image_metas.py ${CONFIG} \ + --out ${OUTPUT FILE NAME} +""" +import argparse +import csv +import os.path as osp +from multiprocessing import Pool + +import mmcv +from mmcv import Config + + +def parse_args(): + parser = argparse.ArgumentParser(description='Collect image metas') + parser.add_argument('config', help='Config file path') + parser.add_argument( + '--out', + default='validation-image-metas.pkl', + help='The output image metas file name. 
The save dir is in the ' + 'same directory as `dataset.ann_file` path') + parser.add_argument( + '--nproc', + default=4, + type=int, + help='Processes used for get image metas') + args = parser.parse_args() + return args + + +def get_metas_from_csv_style_ann_file(ann_file): + data_infos = [] + cp_filename = None + with open(ann_file, 'r') as f: + reader = csv.reader(f) + for i, line in enumerate(reader): + if i == 0: + continue + img_id = line[0] + filename = f'{img_id}.jpg' + if filename != cp_filename: + data_infos.append(dict(filename=filename)) + cp_filename = filename + return data_infos + + +def get_metas_from_txt_style_ann_file(ann_file): + with open(ann_file) as f: + lines = f.readlines() + i = 0 + data_infos = [] + while i < len(lines): + filename = lines[i].rstrip() + data_infos.append(dict(filename=filename)) + skip_lines = int(lines[i + 2]) + 3 + i += skip_lines + return data_infos + + +def get_image_metas(data_info, img_prefix): + file_client = mmcv.FileClient(backend='disk') + filename = data_info.get('filename', None) + if filename is not None: + if img_prefix is not None: + filename = osp.join(img_prefix, filename) + img_bytes = file_client.get(filename) + img = mmcv.imfrombytes(img_bytes, flag='color') + meta = dict(filename=filename, ori_shape=img.shape) + else: + raise NotImplementedError('Missing `filename` in data_info') + return meta + + +def main(): + args = parse_args() + assert args.out.endswith('pkl'), 'The output file name must be pkl suffix' + + # load config files + cfg = Config.fromfile(args.config) + ann_file = cfg.data.test.ann_file + img_prefix = cfg.data.test.img_prefix + + print(f'{"-" * 5} Start Processing {"-" * 5}') + if ann_file.endswith('csv'): + data_infos = get_metas_from_csv_style_ann_file(ann_file) + elif ann_file.endswith('txt'): + data_infos = get_metas_from_txt_style_ann_file(ann_file) + else: + shuffix = ann_file.split('.')[-1] + raise NotImplementedError('File name must be csv or txt suffix but ' + f'get {shuffix}') + + print(f'Successfully load annotation file from {ann_file}') + print(f'Processing {len(data_infos)} images...') + pool = Pool(args.nproc) + # get image metas with multiple processes + image_metas = pool.starmap( + get_image_metas, + zip(data_infos, [img_prefix for _ in range(len(data_infos))]), + ) + pool.close() + + # save image metas + root_path = cfg.data.test.ann_file.rsplit('/', 1)[0] + save_path = osp.join(root_path, args.out) + mmcv.dump(image_metas, save_path) + print(f'Image meta file save to: {save_path}') + + +if __name__ == '__main__': + main() From 85c37e0c2534ec84518c9c22074d9fc8dcd61a86 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Tue, 22 Feb 2022 19:26:04 +0800 Subject: [PATCH 17/27] [Enhance] Speed up SimOTA matching. 
(#7098) --- mmdet/core/bbox/assigners/sim_ota_assigner.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mmdet/core/bbox/assigners/sim_ota_assigner.py b/mmdet/core/bbox/assigners/sim_ota_assigner.py index 5a5902970af..263abfcd8ff 100644 --- a/mmdet/core/bbox/assigners/sim_ota_assigner.py +++ b/mmdet/core/bbox/assigners/sim_ota_assigner.py @@ -225,7 +225,7 @@ def get_in_gt_and_in_center_info(self, priors, gt_bboxes): return is_in_gts_or_centers, is_in_boxes_and_centers def dynamic_k_matching(self, cost, pairwise_ious, num_gt, valid_mask): - matching_matrix = torch.zeros_like(cost) + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) # select candidate topk ious for dynamic-k calculation candidate_topk = min(self.candidate_topk, pairwise_ious.size(0)) topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) @@ -233,8 +233,8 @@ def dynamic_k_matching(self, cost, pairwise_ious, num_gt, valid_mask): dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) for gt_idx in range(num_gt): _, pos_idx = torch.topk( - cost[:, gt_idx], k=dynamic_ks[gt_idx].item(), largest=False) - matching_matrix[:, gt_idx][pos_idx] = 1.0 + cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) + matching_matrix[:, gt_idx][pos_idx] = 1 del topk_ious, dynamic_ks, pos_idx @@ -242,10 +242,10 @@ def dynamic_k_matching(self, cost, pairwise_ious, num_gt, valid_mask): if prior_match_gt_mask.sum() > 0: cost_min, cost_argmin = torch.min( cost[prior_match_gt_mask, :], dim=1) - matching_matrix[prior_match_gt_mask, :] *= 0.0 - matching_matrix[prior_match_gt_mask, cost_argmin] = 1.0 + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 # get foreground mask inside box and center prior - fg_mask_inboxes = matching_matrix.sum(1) > 0.0 + fg_mask_inboxes = matching_matrix.sum(1) > 0 valid_mask[valid_mask.clone()] = fg_mask_inboxes matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) From cac356380d505bf15587f07c0529218cc36b9652 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Tue, 22 Feb 2022 21:35:54 +0800 Subject: [PATCH 18/27] [Feature] Add Maskformer to mmdet (#7212) * first commit * add README * move model description from config to readme add description for binary_input add description for dice loss add a independent panoptic gt processing function add a independent panoptic gt processing function remove compatibility of pretrain in maskformer * update comments in maskformer_head * update docs format --- configs/maskformer/README.md | 60 ++ .../maskformer_r50_mstrain_16x1_75e_coco.py | 220 ++++++ mmdet/core/bbox/assigners/__init__.py | 3 +- .../bbox/assigners/mask_hungarian_assigner.py | 130 ++++ mmdet/core/bbox/match_costs/__init__.py | 5 +- mmdet/core/bbox/match_costs/match_cost.py | 114 ++- mmdet/core/bbox/samplers/__init__.py | 5 +- .../core/bbox/samplers/mask_pseudo_sampler.py | 44 ++ .../bbox/samplers/mask_sampling_result.py | 60 ++ mmdet/models/dense_heads/__init__.py | 3 +- mmdet/models/dense_heads/maskformer_head.py | 666 ++++++++++++++++++ mmdet/models/detectors/__init__.py | 4 +- mmdet/models/detectors/maskformer.py | 106 +++ mmdet/models/losses/dice_loss.py | 41 +- mmdet/models/plugins/__init__.py | 3 +- mmdet/models/plugins/pixel_decoder.py | 245 +++++++ mmdet/models/utils/__init__.py | 4 +- mmdet/models/utils/panoptic_gt_processing.py | 62 ++ .../test_dense_heads/test_maskformer_head.py | 203 ++++++ tests/test_models/test_forward.py | 111 +++ tests/test_models/test_loss.py | 24 +- 
tests/test_models/test_plugins.py | 82 +++ tests/test_utils/test_assigner.py | 71 +- 23 files changed, 2229 insertions(+), 37 deletions(-) create mode 100644 configs/maskformer/README.md create mode 100644 configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py create mode 100644 mmdet/core/bbox/assigners/mask_hungarian_assigner.py create mode 100644 mmdet/core/bbox/samplers/mask_pseudo_sampler.py create mode 100644 mmdet/core/bbox/samplers/mask_sampling_result.py create mode 100644 mmdet/models/dense_heads/maskformer_head.py create mode 100644 mmdet/models/detectors/maskformer.py create mode 100644 mmdet/models/plugins/pixel_decoder.py create mode 100644 mmdet/models/utils/panoptic_gt_processing.py create mode 100644 tests/test_models/test_dense_heads/test_maskformer_head.py diff --git a/configs/maskformer/README.md b/configs/maskformer/README.md new file mode 100644 index 00000000000..ce1384ae77e --- /dev/null +++ b/configs/maskformer/README.md @@ -0,0 +1,60 @@ +# Per-Pixel Classification is Not All You Need for Semantic Segmentation + +## Abstract + +Modern approaches typically formulate semantic segmentation as a per-pixel classification +task, while instance-level segmentation is handled with an alternative mask +classification. Our key insight: mask classification is sufficiently general to solve +both semantic- and instance-level segmentation tasks in a unified manner using +the exact same model, loss, and training procedure. Following this observation, +we propose MaskFormer, a simple mask classification model which predicts a +set of binary masks, each associated with a single global class label prediction. +Overall, the proposed mask classification-based method simplifies the landscape +of effective approaches to semantic and panoptic segmentation tasks and shows +excellent empirical results. In particular, we observe that MaskFormer outperforms +per-pixel classification baselines when the number of classes is large. Our mask +classification-based method outperforms both current state-of-the-art semantic +(55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models. + +
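The snippet below is an editorial sketch (not part of the original patch): one way to instantiate the model from the config added in this commit, assuming an MMDetection installation that includes this MaskFormer support.

```python
from mmcv import Config

from mmdet.models import build_detector

cfg = Config.fromfile(
    'configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py')
# train_cfg and test_cfg are embedded in cfg.model for this config
model = build_detector(cfg.model)
```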
    + +## Citation + +``` +@inproceedings{cheng2021maskformer, + title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, + author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov}, + journal={NeurIPS}, + year={2021} +} +``` + +## Dataset + +MaskFormer requires COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) dataset for training and evaluation. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── panoptic_train2017.json +│ │ │ ├── panoptic_train2017 +│ │ │ ├── panoptic_val2017.json +│ │ │ ├── panoptic_val2017 +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +``` + +## Results and Models + +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | detail | +| :------: | :-----: | :-----: | :------: | :------------: | :-: | :-: | :-: | :---: | :---: | :---: | :---: | :---: | :---: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------: | :---: | +| R-50 | pytorch | 75e | | | | | | | | | | | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py) | | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | diff --git a/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py b/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py new file mode 100644 index 00000000000..c9d92450570 --- /dev/null +++ b/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py @@ -0,0 +1,220 @@ +_base_ = [ + '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' +] + +model = dict( + type='MaskFormer', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type='MaskFormerHead', + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + feat_channels=256, + out_channels=256, + num_things_classes=80, + num_stuff_classes=53, + num_queries=100, + pixel_decoder=dict( + type='TransformerEncoderPixelDecoder', + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.1, + proj_drop=0.1, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.1, + dropout_layer=None, + add_identity=True), + operation_order=('self_attn', 'norm', 'ffn', 'norm'), + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=False), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True)), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + transformer_decoder=dict( + type='DetrTransformerDecoder', + 
return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.1, + proj_drop=0.1, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.1, + dropout_layer=None, + add_identity=True), + # the following parameter was not used, + # just make current api happy + feedforward_channels=2048, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + reduction='mean', + class_weight=1.0), + loss_mask=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=20.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=1.0)), + train_cfg=dict( + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=1.0), + mask_cost=dict( + type='FocalLossCost', weight=20.0, binary_input=True), + dice_cost=dict( + type='DiceCost', weight=1.0, pred_act=True, eps=1.0)), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict(object_mask_thr=0.8, iou_thr=0.8), + # pretrained=None, + init_cfg=None) + +# dataset settings +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='AutoAugment', + policies=[[ + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + multiscale_mode='value', + keep_ratio=True) + ], + [ + dict( + type='Resize', + img_scale=[(400, 1333), (500, 1333), (600, 1333)], + multiscale_mode='value', + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + override=True, + keep_ratio=True) + ]]), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=1, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) + +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': 
dict(lr_mult=1.0, decay_mult=0.0) + }, + norm_decay_mult=0.0)) +optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + gamma=0.1, + by_epoch=True, + step=[50], + warmup='linear', + warmup_by_epoch=False, + warmup_ratio=1.0, # no warmup + warmup_iters=10) +runner = dict(type='EpochBasedRunner', max_epochs=75) diff --git a/mmdet/core/bbox/assigners/__init__.py b/mmdet/core/bbox/assigners/__init__.py index a182686491d..5eaf7fa3af6 100644 --- a/mmdet/core/bbox/assigners/__init__.py +++ b/mmdet/core/bbox/assigners/__init__.py @@ -6,6 +6,7 @@ from .center_region_assigner import CenterRegionAssigner from .grid_assigner import GridAssigner from .hungarian_assigner import HungarianAssigner +from .mask_hungarian_assigner import MaskHungarianAssigner from .max_iou_assigner import MaxIoUAssigner from .point_assigner import PointAssigner from .region_assigner import RegionAssigner @@ -17,5 +18,5 @@ 'BaseAssigner', 'MaxIoUAssigner', 'ApproxMaxIoUAssigner', 'AssignResult', 'PointAssigner', 'ATSSAssigner', 'CenterRegionAssigner', 'GridAssigner', 'HungarianAssigner', 'RegionAssigner', 'UniformAssigner', 'SimOTAAssigner', - 'TaskAlignedAssigner' + 'TaskAlignedAssigner', 'MaskHungarianAssigner' ] diff --git a/mmdet/core/bbox/assigners/mask_hungarian_assigner.py b/mmdet/core/bbox/assigners/mask_hungarian_assigner.py new file mode 100644 index 00000000000..ef0f35831d6 --- /dev/null +++ b/mmdet/core/bbox/assigners/mask_hungarian_assigner.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet.core.bbox.builder import BBOX_ASSIGNERS +from mmdet.core.bbox.match_costs.builder import build_match_cost +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None + + +@BBOX_ASSIGNERS.register_module() +class MaskHungarianAssigner(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth for + mask. + + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of three components: + classification cost, mask focal cost and mask dice cost. The + targets don't include the no_object, so generally there are more + predictions than targets. After the one-to-one matching, the un-matched + are treated as backgrounds. Thus each query prediction will be assigned + with `0` or a positive integer indicating the ground truth index: + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + cls_cost (obj:`mmcv.ConfigDict` | dict): Classification cost config. + mask_cost (obj:`mmcv.ConfigDict` | dict): Mask cost config. + dice_cost (obj:`mmcv.ConfigDict` | dict): Dice cost config. + """ + + def __init__(self, + cls_cost=dict(type='ClassificationCost', weight=1.0), + mask_cost=dict( + type='FocalLossCost', weight=1.0, binary_input=True), + dice_cost=dict(type='DiceCost', weight=1.0)): + self.cls_cost = build_match_cost(cls_cost) + self.mask_cost = build_match_cost(mask_cost) + self.dice_cost = build_match_cost(dice_cost) + + def assign(self, + cls_pred, + mask_pred, + gt_labels, + gt_mask, + img_meta, + gt_bboxes_ignore=None, + eps=1e-7): + """Computes one-to-one matching based on the weighted costs. + + Args: + cls_pred (Tensor): Class prediction in shape + (num_query, cls_out_channels). 
+ mask_pred (Tensor): Mask prediction in shape (num_query, H, W). + gt_labels (Tensor): Label of 'gt_mask'in shape = (num_gt, ). + gt_mask (Tensor): Ground truth mask in shape = (num_gt, H, W). + img_meta (dict): Meta information for current image. + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + num_gt, num_query = gt_labels.shape[0], cls_pred.shape[0] + + # 1. assign -1 by default + assigned_gt_inds = cls_pred.new_full((num_query, ), + -1, + dtype=torch.long) + assigned_labels = cls_pred.new_full((num_query, ), + -1, + dtype=torch.long) + if num_gt == 0 or num_query == 0: + # No ground truth or boxes, return empty assignment + if num_gt == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gt, assigned_gt_inds, None, labels=assigned_labels) + + # 2. compute the weighted costs + # classification and maskcost. + if self.cls_cost.weight != 0 and cls_pred is not None: + cls_cost = self.cls_cost(cls_pred, gt_labels) + else: + cls_cost = 0 + + if self.mask_cost.weight != 0: + # mask_pred shape = [num_query, h, w] + # gt_mask shape = [num_gt, h, w] + # mask_cost shape = [num_query, num_gt] + mask_cost = self.mask_cost(mask_pred, gt_mask) + else: + mask_cost = 0 + + if self.dice_cost.weight != 0: + dice_cost = self.dice_cost(mask_pred, gt_mask) + else: + dice_cost = 0 + cost = cls_cost + mask_cost + dice_cost + + # 3. do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + cls_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + cls_pred.device) + + # 4. assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gt, assigned_gt_inds, None, labels=assigned_labels) diff --git a/mmdet/core/bbox/match_costs/__init__.py b/mmdet/core/bbox/match_costs/__init__.py index 3f79a1ce36a..81ee588571e 100644 --- a/mmdet/core/bbox/match_costs/__init__.py +++ b/mmdet/core/bbox/match_costs/__init__.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from .builder import build_match_cost -from .match_cost import BBoxL1Cost, ClassificationCost, FocalLossCost, IoUCost +from .match_cost import (BBoxL1Cost, ClassificationCost, DiceCost, + FocalLossCost, IoUCost) __all__ = [ 'build_match_cost', 'ClassificationCost', 'BBoxL1Cost', 'IoUCost', - 'FocalLossCost' + 'FocalLossCost', 'DiceCost' ] diff --git a/mmdet/core/bbox/match_costs/match_cost.py b/mmdet/core/bbox/match_costs/match_cost.py index d5ce4ca9f59..3c0a164b3c8 100644 --- a/mmdet/core/bbox/match_costs/match_cost.py +++ b/mmdet/core/bbox/match_costs/match_cost.py @@ -35,9 +35,9 @@ def __call__(self, bbox_pred, gt_bboxes): Args: bbox_pred (Tensor): Predicted boxes with normalized coordinates (cx, cy, w, h), which are all in range [0, 1]. 
Shape - [num_query, 4]. + (num_query, 4). gt_bboxes (Tensor): Ground truth boxes with normalized - coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + coordinates (x1, y1, x2, y2). Shape (num_gt, 4). Returns: torch.Tensor: bbox_cost value with weight @@ -59,6 +59,8 @@ class FocalLossCost: alpha (int | float, optional): focal_loss alpha gamma (int | float, optional): focal_loss gamma eps (float, optional): default 1e-12 + binary_input (bool, optional): Whether the input is binary, + default False. Examples: >>> from mmdet.core.bbox.match_costs.match_cost import FocalLossCost @@ -74,17 +76,23 @@ class FocalLossCost: [-0.1950, -0.1207, -0.2626]]) """ - def __init__(self, weight=1., alpha=0.25, gamma=2, eps=1e-12): + def __init__(self, + weight=1., + alpha=0.25, + gamma=2, + eps=1e-12, + binary_input=False): self.weight = weight self.alpha = alpha self.gamma = gamma self.eps = eps + self.binary_input = binary_input - def __call__(self, cls_pred, gt_labels): + def _focal_loss_cost(self, cls_pred, gt_labels): """ Args: cls_pred (Tensor): Predicted classification logits, shape - [num_query, num_class]. + (num_query, num_class). gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). Returns: @@ -95,9 +103,50 @@ def __call__(self, cls_pred, gt_labels): 1 - self.alpha) * cls_pred.pow(self.gamma) pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( 1 - cls_pred).pow(self.gamma) + cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels] return cls_cost * self.weight + def _mask_focal_loss_cost(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classfication logits + in shape (num_query, d1, ..., dn), dtype=torch.float32. + gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn), + dtype=torch.long. Labels should be binary. + + Returns: + Tensor: Focal cost matrix with weight in shape\ + (num_query, num_gt). + """ + cls_pred = cls_pred.flatten(1) + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + + cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \ + torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels)) + return cls_cost / n * self.weight + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classfication logits. + gt_labels (Tensor)): Labels. + + Returns: + Tensor: Focal cost matrix with weight in shape\ + (num_query, num_gt). + """ + if self.binary_input: + return self._mask_focal_loss_cost(cls_pred, gt_labels) + else: + return self._focal_loss_cost(cls_pred, gt_labels) + @MATCH_COST.register_module() class ClassificationCost: @@ -128,7 +177,7 @@ def __call__(self, cls_pred, gt_labels): """ Args: cls_pred (Tensor): Predicted classification logits, shape - [num_query, num_class]. + (num_query, num_class). gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). Returns: @@ -170,9 +219,9 @@ def __call__(self, bboxes, gt_bboxes): """ Args: bboxes (Tensor): Predicted boxes with unnormalized coordinates - (x1, y1, x2, y2). Shape [num_query, 4]. + (x1, y1, x2, y2). Shape (num_query, 4). gt_bboxes (Tensor): Ground truth boxes with unnormalized - coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + coordinates (x1, y1, x2, y2). Shape (num_gt, 4). 
Returns: torch.Tensor: iou_cost value with weight @@ -183,3 +232,52 @@ def __call__(self, bboxes, gt_bboxes): # The 1 is a constant that doesn't change the matching, so omitted. iou_cost = -overlaps return iou_cost * self.weight + + +@MATCH_COST.register_module() +class DiceCost: + """Cost of mask assignments based on dice losses. + + Args: + weight (int | float, optional): loss_weight. Defaults to 1. + pred_act (bool, optional): Whether to apply sigmoid to mask_pred. + Defaults to False. + eps (float, optional): default 1e-12. + """ + + def __init__(self, weight=1., pred_act=False, eps=1e-3): + self.weight = weight + self.pred_act = pred_act + self.eps = eps + + def binary_mask_dice_loss(self, mask_preds, gt_masks): + """ + Args: + mask_preds (Tensor): Mask prediction in shape (num_query, *). + gt_masks (Tensor): Ground truth in shape (num_gt, *) + store 0 or 1, 0 for negative class and 1 for + positive class. + + Returns: + Tensor: Dice cost matrix in shape (num_query, num_gt). + """ + mask_preds = mask_preds.flatten(1) + gt_masks = gt_masks.flatten(1).float() + numerator = 2 * torch.einsum('nc,mc->nm', mask_preds, gt_masks) + denominator = mask_preds.sum(-1)[:, None] + gt_masks.sum(-1)[None, :] + loss = 1 - (numerator + self.eps) / (denominator + self.eps) + return loss + + def __call__(self, mask_preds, gt_masks): + """ + Args: + mask_preds (Tensor): Mask prediction logits in shape (num_query, *) + gt_masks (Tensor): Ground truth in shape (num_gt, *) + + Returns: + Tensor: Dice cost matrix with weight in shape (num_query, num_gt). + """ + if self.pred_act: + mask_preds = mask_preds.sigmoid() + dice_cost = self.binary_mask_dice_loss(mask_preds, gt_masks) + return dice_cost * self.weight diff --git a/mmdet/core/bbox/samplers/__init__.py b/mmdet/core/bbox/samplers/__init__.py index b9e83913eaa..f58505b59dc 100644 --- a/mmdet/core/bbox/samplers/__init__.py +++ b/mmdet/core/bbox/samplers/__init__.py @@ -3,6 +3,8 @@ from .combined_sampler import CombinedSampler from .instance_balanced_pos_sampler import InstanceBalancedPosSampler from .iou_balanced_neg_sampler import IoUBalancedNegSampler +from .mask_pseudo_sampler import MaskPseudoSampler +from .mask_sampling_result import MaskSamplingResult from .ohem_sampler import OHEMSampler from .pseudo_sampler import PseudoSampler from .random_sampler import RandomSampler @@ -12,5 +14,6 @@ __all__ = [ 'BaseSampler', 'PseudoSampler', 'RandomSampler', 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', - 'OHEMSampler', 'SamplingResult', 'ScoreHLRSampler' + 'OHEMSampler', 'SamplingResult', 'ScoreHLRSampler', 'MaskPseudoSampler', + 'MaskSamplingResult' ] diff --git a/mmdet/core/bbox/samplers/mask_pseudo_sampler.py b/mmdet/core/bbox/samplers/mask_pseudo_sampler.py new file mode 100644 index 00000000000..b5f69658d02 --- /dev/null +++ b/mmdet/core/bbox/samplers/mask_pseudo_sampler.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
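# (Editorial sketch, not part of the original patch.) Minimal illustration of
# the mask match costs introduced above; the tensors are random toys, so this
# only demonstrates the interfaces and the (num_query, num_gt) output shape.
import torch

from mmdet.core.bbox.match_costs import build_match_cost

mask_pred = torch.rand(3, 8, 8)  # (num_query, H, W) mask logits
gt_masks = torch.randint(0, 2, (2, 8, 8)).float()  # (num_gt, H, W) binary

focal_cost = build_match_cost(
    dict(type='FocalLossCost', weight=20.0, binary_input=True))
dice_cost = build_match_cost(
    dict(type='DiceCost', weight=1.0, pred_act=True, eps=1.0))

cost = focal_cost(mask_pred, gt_masks) + dice_cost(mask_pred, gt_masks)
assert cost.shape == (3, 2)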
+"""copy from +https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py.""" + +import torch + +from mmdet.core.bbox.builder import BBOX_SAMPLERS +from .base_sampler import BaseSampler +from .mask_sampling_result import MaskSamplingResult + + +@BBOX_SAMPLERS.register_module() +class MaskPseudoSampler(BaseSampler): + """A pseudo sampler that does not do sampling actually.""" + + def __init__(self, **kwargs): + pass + + def _sample_pos(self, **kwargs): + """Sample positive samples.""" + raise NotImplementedError + + def _sample_neg(self, **kwargs): + """Sample negative samples.""" + raise NotImplementedError + + def sample(self, assign_result, masks, gt_masks, **kwargs): + """Directly returns the positive and negative indices of samples. + + Args: + assign_result (:obj:`AssignResult`): Assigned results + masks (torch.Tensor): Bounding boxes + gt_masks (torch.Tensor): Ground truth boxes + Returns: + :obj:`SamplingResult`: sampler results + """ + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + gt_flags = masks.new_zeros(masks.shape[0], dtype=torch.uint8) + sampling_result = MaskSamplingResult(pos_inds, neg_inds, masks, + gt_masks, assign_result, gt_flags) + return sampling_result diff --git a/mmdet/core/bbox/samplers/mask_sampling_result.py b/mmdet/core/bbox/samplers/mask_sampling_result.py new file mode 100644 index 00000000000..3d109432260 --- /dev/null +++ b/mmdet/core/bbox/samplers/mask_sampling_result.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""copy from +https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py.""" + +import torch + +from .sampling_result import SamplingResult + + +class MaskSamplingResult(SamplingResult): + """Mask sampling result.""" + + def __init__(self, pos_inds, neg_inds, masks, gt_masks, assign_result, + gt_flags): + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.pos_masks = masks[pos_inds] + self.neg_masks = masks[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_masks.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + + if gt_masks.numel() == 0: + # hack for index error case + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_masks = torch.empty_like(gt_masks) + else: + self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :] + + if assign_result.labels is not None: + self.pos_gt_labels = assign_result.labels[pos_inds] + else: + self.pos_gt_labels = None + + @property + def masks(self): + """torch.Tensor: concatenated positive and negative boxes""" + return torch.cat([self.pos_masks, self.neg_masks]) + + def __nice__(self): + data = self.info.copy() + data['pos_masks'] = data.pop('pos_masks').shape + data['neg_masks'] = data.pop('neg_masks').shape + parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] + body = ' ' + ',\n '.join(parts) + return '{\n' + body + '\n}' + + @property + def info(self): + """Returns a dictionary of info about the object.""" + return { + 'pos_inds': self.pos_inds, + 'neg_inds': self.neg_inds, + 'pos_masks': self.pos_masks, + 'neg_masks': self.neg_masks, + 'pos_is_gt': self.pos_is_gt, + 'num_gts': self.num_gts, + 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, + } diff --git a/mmdet/models/dense_heads/__init__.py b/mmdet/models/dense_heads/__init__.py index 81d6ec2f74d..e931e608028 100644 --- a/mmdet/models/dense_heads/__init__.py +++ 
b/mmdet/models/dense_heads/__init__.py @@ -20,6 +20,7 @@ from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead from .lad_head import LADHead from .ld_head import LDHead +from .maskformer_head import MaskFormerHead from .nasfcos_head import NASFCOSHead from .paa_head import PAAHead from .pisa_retinanet_head import PISARetinaHead @@ -49,5 +50,5 @@ 'CascadeRPNHead', 'EmbeddingRPNHead', 'LDHead', 'CascadeRPNHead', 'AutoAssignHead', 'DETRHead', 'YOLOFHead', 'DeformableDETRHead', 'SOLOHead', 'DecoupledSOLOHead', 'CenterNetHead', 'YOLOXHead', - 'DecoupledSOLOLightHead', 'LADHead', 'TOODHead' + 'DecoupledSOLOLightHead', 'LADHead', 'TOODHead', 'MaskFormerHead' ] diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py new file mode 100644 index 00000000000..3cd060e53b6 --- /dev/null +++ b/mmdet/models/dense_heads/maskformer_head.py @@ -0,0 +1,666 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, build_plugin_layer, caffe2_xavier_init +from mmcv.cnn.bricks.transformer import (build_positional_encoding, + build_transformer_layer_sequence) +from mmcv.runner import force_fp32 + +from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean +from mmdet.core.evaluation import INSTANCE_OFFSET +from mmdet.models.utils import preprocess_panoptic_gt +from ..builder import HEADS, build_loss +from .anchor_free_head import AnchorFreeHead + + +@HEADS.register_module() +class MaskFormerHead(AnchorFreeHead): + """Implements the MaskFormer head. + + See `Per-Pixel Classification is Not All You Need for Semantic + Segmentation `_ for details. + + Args: + in_channels (list[int]): Number of channels in the input feature map. + feat_channels (int): Number of channels for feature. + out_channels (int): Number of channels for output. + num_things_classes (int): Number of things. + num_stuff_classes (int): Number of stuff. + num_queries (int): Number of query in Transformer. + pixel_decoder (obj:`mmcv.ConfigDict`|dict): Config for pixel decoder. + Defaults to None. + enforce_decoder_input_project (bool, optional): Whether to add a layer + to change the embed_dim of tranformer encoder in pixel decoder to + the embed_dim of transformer decoder. Defaults to False. + transformer_decoder (obj:`mmcv.ConfigDict`|dict): Config for + transformer decoder. Defaults to None. + positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for + transformer decoder position encoding. Defaults to None. + loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the classification + loss. Defaults to `CrossEntropyLoss`. + loss_mask (obj:`mmcv.ConfigDict`|dict): Config of the mask loss. + Defaults to `FocalLoss`. + loss_dice (obj:`mmcv.ConfigDict`|dict): Config of the dice loss. + Defaults to `DiceLoss`. + train_cfg (obj:`mmcv.ConfigDict`|dict): Training config of Maskformer + head. + test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of Maskformer + head. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels, + feat_channels, + out_channels, + num_things_classes=80, + num_stuff_classes=53, + num_queries=100, + pixel_decoder=None, + enforce_decoder_input_project=False, + transformer_decoder=None, + positional_encoding=None, + loss_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_mask=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=20.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + naive_dice=True, + loss_weight=1.0), + train_cfg=None, + test_cfg=None, + init_cfg=None, + **kwargs): + super(AnchorFreeHead, self).__init__(init_cfg) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.num_classes = self.num_things_classes + self.num_stuff_classes + self.num_queries = num_queries + + pixel_decoder.update( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels) + self.pixel_decoder = build_plugin_layer(pixel_decoder)[1] + self.transformer_decoder = build_transformer_layer_sequence( + transformer_decoder) + self.decoder_embed_dims = self.transformer_decoder.embed_dims + pixel_decoder_type = pixel_decoder.get('type') + if pixel_decoder_type == 'PixelDecoder' and ( + self.decoder_embed_dims != in_channels[-1] + or enforce_decoder_input_project): + self.decoder_input_proj = Conv2d( + in_channels[-1], self.decoder_embed_dims, kernel_size=1) + else: + self.decoder_input_proj = nn.Identity() + self.decoder_pe = build_positional_encoding(positional_encoding) + self.query_embed = nn.Embedding(self.num_queries, out_channels) + + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + self.mask_embed = nn.Sequential( + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, out_channels)) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + if train_cfg: + assert 'assigner' in train_cfg, 'assigner should be provided '\ + 'when train_cfg is set.' + assigner = train_cfg['assigner'] + self.assigner = build_assigner(assigner) + sampler_cfg = dict(type='MaskPseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + + self.bg_cls_weight = 0 + class_weight = loss_cls.get('class_weight', None) + if class_weight is not None and (self.__class__ is MaskFormerHead): + assert isinstance(class_weight, float), 'Expected ' \ + 'class_weight to have type float. Found ' \ + f'{type(class_weight)}.' + # NOTE following the official MaskFormerHead repo, bg_cls_weight + # means relative classification weight of the VOID class. + bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) + assert isinstance(bg_cls_weight, float), 'Expected ' \ + 'bg_cls_weight to have type float. Found ' \ + f'{type(bg_cls_weight)}.' 
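+            # Expand the scalar class_weight into a per-logit weight vector of
+            # length num_classes + 1: cls_embed predicts one extra 'no object'
+            # (VOID) logit, and that last entry is set to bg_cls_weight so
+            # that the many unmatched queries contribute less to the
+            # classification loss (e.g. ones(134) with the last entry 0.1 for
+            # the default 80 thing + 53 stuff classes).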
+ class_weight = torch.ones(self.num_classes + 1) * class_weight + # set VOID class as the last indice + class_weight[self.num_classes] = bg_cls_weight + loss_cls.update({'class_weight': class_weight}) + if 'bg_cls_weight' in loss_cls: + loss_cls.pop('bg_cls_weight') + self.bg_cls_weight = bg_cls_weight + self.loss_cls = build_loss(loss_cls) + self.loss_mask = build_loss(loss_mask) + self.loss_dice = build_loss(loss_dice) + + def init_weights(self): + if isinstance(self.decoder_input_proj, Conv2d): + caffe2_xavier_init(self.decoder_input_proj, bias=0) + + self.pixel_decoder.init_weights() + + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs): + """Preprocess the ground truth for all images. + + Args: + gt_labels_list (list[Tensor]): Each is ground truth + labels of each bbox, with shape (num_gts, ). + gt_masks_list (list[BitmapMasks]): Each is ground truth + masks of each instances of a image, shape + (num_gts, h, w). + gt_semantic_seg (Tensor): Ground truth of semantic + segmentation with the shape (batch_size, n, h, w). + [0, num_thing_class - 1] means things, + [num_thing_class, num_class-1] means stuff, + 255 means VOID. + target_shape (tuple[int]): Shape of output mask_preds. + Resize the masks to shape of mask_preds. + + Returns: + tuple: a tuple containing the following targets. + + - labels (list[Tensor]): Ground truth class indices for all\ + images. Each with shape (n, ), n is the sum of number\ + of stuff type and number of instance in a image. + - masks (list[Tensor]): Ground truth mask for each image, each\ + with shape (n, h, w). + """ + num_things_list = [self.num_things_classes] * len(gt_labels_list) + num_stuff_list = [self.num_stuff_classes] * len(gt_labels_list) + + targets = multi_apply(preprocess_panoptic_gt, gt_labels_list, + gt_masks_list, gt_semantic_segs, num_things_list, + num_stuff_list) + labels, masks = targets + return labels, masks + + def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list, + gt_masks_list, img_metas): + """Compute classification and mask targets for all images for a decoder + layer. + + Args: + cls_scores_list (list[Tensor]): Mask score logits from a single + decoder layer for all images. Each with shape (num_queries, + cls_out_channels). + mask_preds_list (list[Tensor]): Mask logits from a single decoder + layer for all images. Each with shape (num_queries, h, w). + gt_labels_list (list[Tensor]): Ground truth class indices for all + images. Each with shape (n, ), n is the sum of number of stuff + type and number of instance in a image. + gt_masks_list (list[Tensor]): Ground truth mask for each image, + each with shape (n, h, w). + img_metas (list[dict]): List of image meta information. + + Returns: + tuple[list[Tensor]]: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels of all images.\ + Each with shape (num_queries, ). + - label_weights_list (list[Tensor]): Label weights of all\ + images. Each with shape (num_queries, ). + - mask_targets_list (list[Tensor]): Mask targets of all\ + images. Each with shape (num_queries, h, w). + - mask_weights_list (list[Tensor]): Mask weights of all\ + images. Each with shape (num_queries, ). + - num_total_pos (int): Number of positive samples in all\ + images. + - num_total_neg (int): Number of negative samples in all\ + images. 
+ """ + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + pos_inds_list, + neg_inds_list) = multi_apply(self._get_target_single, cls_scores_list, + mask_preds_list, gt_labels_list, + gt_masks_list, img_metas) + + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, mask_targets_list, + mask_weights_list, num_total_pos, num_total_neg) + + def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks, + img_metas): + """Compute classification and mask targets for one image. + + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, h, w). + gt_labels (Tensor): Ground truth class indices for one image with + shape (n, ). n is the sum of number of stuff type and number + of instance in a image. + gt_masks (Tensor): Ground truth mask for each image, each with + shape (n, h, w). + img_metas (dict): Image informtation. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + shape (num_queries, ). + - label_weights (Tensor): Label weights of each image. + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. + shape (num_queries, h, w). + - mask_weights (Tensor): Mask weights of each image. + shape (num_queries, ). + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + target_shape = mask_pred.shape[-2:] + if gt_masks.shape[0] > 0: + gt_masks_downsampled = F.interpolate( + gt_masks.unsqueeze(1).float(), target_shape, + mode='nearest').squeeze(1).long() + else: + gt_masks_downsampled = gt_masks + + # assign and sample + assign_result = self.assigner.assign(cls_score, mask_pred, gt_labels, + gt_masks_downsampled, img_metas) + sampling_result = self.sampler.sample(assign_result, mask_pred, + gt_masks) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label target + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_labels.new_ones(self.num_queries) + + # mask target + mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] + mask_weights = mask_pred.new_zeros((self.num_queries, )) + mask_weights[pos_inds] = 1.0 + + return (labels, label_weights, mask_targets, mask_weights, pos_inds, + neg_inds) + + @force_fp32(apply_to=('all_cls_scores', 'all_mask_preds')) + def loss(self, all_cls_scores, all_mask_preds, gt_labels_list, + gt_masks_list, img_metas): + """Loss function. + + Args: + all_cls_scores (Tensor): Classification scores for all decoder + layers with shape (num_decoder, batch_size, num_queries, + cls_out_channels). + all_mask_preds (Tensor): Mask scores for all decoder layers with + shape (num_decoder, batch_size, num_queries, h, w). + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (n, ). n is the sum of number of stuff type + and number of instance in a image. + gt_masks_list (list[Tensor]): Ground truth mask for each image with + shape (n, h, w). + img_metas (list[dict]): List of image meta information. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + num_dec_layers = len(all_cls_scores) + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_masks_list = [gt_masks_list for _ in range(num_dec_layers)] + img_metas_list = [img_metas for _ in range(num_dec_layers)] + losses_cls, losses_mask, losses_dice = multi_apply( + self.loss_single, all_cls_scores, all_mask_preds, + all_gt_labels_list, all_gt_masks_list, img_metas_list) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_mask'] = losses_mask[-1] + loss_dict['loss_dice'] = losses_dice[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_mask_i, loss_dice_i in zip( + losses_cls[:-1], losses_mask[:-1], losses_dice[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_mask'] = loss_mask_i + loss_dict[f'd{num_dec_layer}.loss_dice'] = loss_dice_i + num_dec_layer += 1 + return loss_dict + + def loss_single(self, cls_scores, mask_preds, gt_labels_list, + gt_masks_list, img_metas): + """Loss function for outputs from a single decoder layer. + + Args: + cls_scores (Tensor): Mask score logits from a single decoder layer + for all images. Shape (batch_size, num_queries, + cls_out_channels). + mask_preds (Tensor): Mask logits for a pixel decoder for all + images. Shape (batch_size, num_queries, h, w). + gt_labels_list (list[Tensor]): Ground truth class indices for each + image, each with shape (n, ). n is the sum of number of stuff + types and number of instances in a image. + gt_masks_list (list[Tensor]): Ground truth mask for each image, + each with shape (n, h, w). + img_metas (list[dict]): List of image meta information. + + Returns: + tuple[Tensor]: Loss components for outputs from a single decoder\ + layer. 
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + mask_preds_list = [mask_preds[i] for i in range(num_imgs)] + + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + num_total_pos, + num_total_neg) = self.get_targets(cls_scores_list, mask_preds_list, + gt_labels_list, gt_masks_list, + img_metas) + # shape (batch_size, num_queries) + labels = torch.stack(labels_list, dim=0) + # shape (batch_size, num_queries) + label_weights = torch.stack(label_weights_list, dim=0) + # shape (num_total_gts, h, w) + mask_targets = torch.cat(mask_targets_list, dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(mask_weights_list, dim=0) + + # classfication loss + # shape (batch_size * num_queries, ) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + label_weights = label_weights.flatten(0, 1) + + class_weight = cls_scores.new_ones(self.num_classes + 1) + class_weight[-1] = self.bg_cls_weight + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=class_weight[labels].sum()) + + num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos])) + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w) + mask_preds = mask_preds[mask_weights > 0] + target_shape = mask_targets.shape[-2:] + + if mask_targets.shape[0] == 0: + # zero match + loss_dice = mask_preds.sum() + loss_mask = mask_preds.sum() + return loss_cls, loss_mask, loss_dice + + # upsample to shape of target + # shape (num_total_gts, h, w) + mask_preds = F.interpolate( + mask_preds.unsqueeze(1), + target_shape, + mode='bilinear', + align_corners=False).squeeze(1) + + # dice loss + loss_dice = self.loss_dice( + mask_preds, mask_targets, avg_factor=num_total_masks) + + # mask loss + # FocalLoss support input of shape (n, num_class) + h, w = mask_preds.shape[-2:] + # shape (num_total_gts, h, w) -> (num_total_gts * h * w, 1) + mask_preds = mask_preds.reshape(-1, 1) + # shape (num_total_gts, h, w) -> (num_total_gts * h * w) + mask_targets = mask_targets.reshape(-1) + # target is (1 - mask_targets) !!! + loss_mask = self.loss_mask( + mask_preds, 1 - mask_targets, avg_factor=num_total_masks * h * w) + + return loss_cls, loss_mask, loss_dice + + def forward(self, feats, img_metas): + """Forward function. + + Args: + feats (list[Tensor]): Features from the upstream network, each + is a 4D-tensor. + img_metas (list[dict]): List of image information. + + Returns: + all_cls_scores (Tensor): Classification scores for each\ + scale level. Each is a 4D-tensor with shape\ + (num_decoder, batch_size, num_queries, cls_out_channels).\ + Note `cls_out_channels` should includes background. + all_mask_preds (Tensor): Mask scores for each decoder\ + layer. Each with shape (num_decoder, batch_size,\ + num_queries, h, w). + """ + batch_size = len(img_metas) + input_img_h, input_img_w = img_metas[0]['batch_input_shape'] + padding_mask = feats[-1].new_ones( + (batch_size, input_img_h, input_img_w), dtype=torch.float32) + for i in range(batch_size): + img_h, img_w, _ = img_metas[i]['img_shape'] + padding_mask[i, :img_h, :img_w] = 0 + padding_mask = F.interpolate( + padding_mask.unsqueeze(1), + size=feats[-1].shape[-2:], + mode='nearest').to(torch.bool).squeeze(1) + # when backbone is swin, memory is output of last stage of swin. + # when backbone is r50, memory is output of tranformer encoder. 
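+        # The pixel decoder returns two tensors: `mask_features`, a per-pixel
+        # embedding map of shape (batch_size, c, h, w) that is combined with
+        # the per-query mask embeddings below (via einsum) to produce the mask
+        # logits, and `memory`, the coarsest feature map that is flattened and
+        # used as the key/value input of the transformer decoder.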
+ mask_features, memory = self.pixel_decoder(feats, img_metas) + pos_embed = self.decoder_pe(padding_mask) + memory = self.decoder_input_proj(memory) + # shape (batch_size, c, h, w) -> (h*w, batch_size, c) + memory = memory.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + # shape (batch_size, h * w) + padding_mask = padding_mask.flatten(1) + # shape = (num_queries, embed_dims) + query_embed = self.query_embed.weight + # shape = (num_queries, batch_size, embed_dims) + query_embed = query_embed.unsqueeze(1).repeat(1, batch_size, 1) + target = torch.zeros_like(query_embed) + # shape (num_decoder, num_queries, batch_size, embed_dims) + out_dec = self.transformer_decoder( + query=target, + key=memory, + value=memory, + key_pos=pos_embed, + query_pos=query_embed, + key_padding_mask=padding_mask) + # shape (num_decoder, batch_size, num_queries, embed_dims) + out_dec = out_dec.transpose(1, 2) + + # cls_scores + all_cls_scores = self.cls_embed(out_dec) + + # mask_preds + mask_embed = self.mask_embed(out_dec) + all_mask_preds = torch.einsum('lbqc,bchw->lbqhw', mask_embed, + mask_features) + + return all_cls_scores, all_mask_preds + + def forward_train(self, + feats, + img_metas, + gt_bboxes, + gt_labels, + gt_masks, + gt_semantic_seg, + gt_bboxes_ignore=None): + """Forward function for training mode. + + Args: + feats (list[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + img_metas (list[Dict]): List of image information. + gt_bboxes (list[Tensor]): Each element is ground truth bboxes of + the image, shape (num_gts, 4). Not used here. + gt_labels (list[Tensor]): Each element is ground truth labels of + each box, shape (num_gts,). + gt_masks (list[BitmapMasks]): Each element is masks of instances + of a image, shape (num_gts, h, w). + gt_semantic_seg (list[tensor]):Each element is the ground truth + of semantic segmentation with the shape (N, H, W). + [0, num_thing_class - 1] means things, + [num_thing_class, num_class-1] means stuff, + 255 means VOID. + gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be + ignored. Defaults to None. + + Returns: + losses (dict[str, Tensor]): a dictionary of loss components + """ + # not consider ignoring bboxes + assert gt_bboxes_ignore is None + + # forward + all_cls_scores, all_mask_preds = self(feats, img_metas) + + # preprocess ground truth + gt_labels, gt_masks = self.preprocess_gt(gt_labels, gt_masks, + gt_semantic_seg) + + # loss + losses = self.loss(all_cls_scores, all_mask_preds, gt_labels, gt_masks, + img_metas) + + return losses + + def simple_test(self, feats, img_metas, rescale=False): + """Test segment without test-time aumengtation. + + Only the output of last decoder layers was used. + + Args: + feats (list[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + rescale (bool, optional): If True, return boxes in + original image space. Default False. + + Returns: + list[dict[str, np.array]]: semantic segmentation results\ + and panoptic segmentation results for each image. + + .. code-block:: none + + [ + { + 'pan_results': , # shape = [h, w] + }, + ... 
+ ] + """ + all_cls_scores, all_mask_preds = self(feats, img_metas) + mask_cls_results = all_cls_scores[-1] + mask_pred_results = all_mask_preds[-1] + + # upsample masks + img_shape = img_metas[0]['batch_input_shape'] + mask_pred_results = F.interpolate( + mask_pred_results, + size=(img_shape[0], img_shape[1]), + mode='bilinear', + align_corners=False) + + results = [] + for mask_cls_result, mask_pred_result, meta in zip( + mask_cls_results, mask_pred_results, img_metas): + # remove padding + img_height, img_width = meta['img_shape'][:2] + mask_pred_result = mask_pred_result[:, :img_height, :img_width] + + if rescale: + # return result in original resolution + ori_height, ori_width = meta['ori_shape'][:2] + mask_pred_result = F.interpolate(mask_pred_result.unsqueeze(1), + size=(ori_height, ori_width), + mode='bilinear', + align_corners=False)\ + .squeeze(1) + + mask = self.post_process(mask_cls_result, mask_pred_result) + results.append(mask) + + return results + + def post_process(self, mask_cls, mask_pred): + """Panoptic segmengation inference. + + This implementation is modified from\ + https://github.com/facebookresearch/MaskFormer + + Args: + mask_cls (Tensor): Classfication outputs for a image. + shape = (num_queries, cls_out_channels). + mask_pred (Tensor): Mask outputs for a image. + shape = (num_queries, h, w). + + Returns: + panoptic_seg (Tensor): panoptic segment result of shape (h, w),\ + each element in Tensor means: + segment_id = _cls + instance_id * INSTANCE_OFFSET. + """ + object_mask_thr = self.test_cfg.get('object_mask_thr', 0.8) + iou_thr = self.test_cfg.get('iou_thr', 0.8) + + scores, labels = F.softmax(mask_cls, dim=-1).max(-1) + mask_pred = mask_pred.sigmoid() + + keep = labels.ne(self.num_classes) & (scores > object_mask_thr) + cur_scores = scores[keep] + cur_classes = labels[keep] + cur_masks = mask_pred[keep] + + cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks + + h, w = cur_masks.shape[-2:] + panoptic_seg = torch.full((h, w), + self.num_classes, + dtype=torch.int32, + device=cur_masks.device) + if cur_masks.shape[0] == 0: + # We didn't detect any mask :( + pass + else: + cur_mask_ids = cur_prob_masks.argmax(0) + instance_id = 1 + for k in range(cur_classes.shape[0]): + pred_class = int(cur_classes[k].item()) + isthing = pred_class < self.num_things_classes + mask = cur_mask_ids == k + mask_area = mask.sum().item() + original_area = (cur_masks[k] >= 0.5).sum().item() + if mask_area > 0 and original_area > 0: + if mask_area / original_area < iou_thr: + continue + + if not isthing: + # different stuff regions of same class will be + # merged here, and stuff share the instance_id 0. 
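+                        # Encoding: a stuff pixel stores just its class index,
+                        # while a thing pixel (else branch below) stores
+                        # pred_class + instance_id * INSTANCE_OFFSET, so the
+                        # class can later be recovered as id % INSTANCE_OFFSET
+                        # and the instance as id // INSTANCE_OFFSET.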
+ panoptic_seg[mask] = pred_class + else: + panoptic_seg[mask] = ( + pred_class + instance_id * INSTANCE_OFFSET) + instance_id += 1 + return panoptic_seg diff --git a/mmdet/models/detectors/__init__.py b/mmdet/models/detectors/__init__.py index 456b8d424fb..9f05a282c18 100644 --- a/mmdet/models/detectors/__init__.py +++ b/mmdet/models/detectors/__init__.py @@ -19,6 +19,7 @@ from .lad import LAD from .mask_rcnn import MaskRCNN from .mask_scoring_rcnn import MaskScoringRCNN +from .maskformer import MaskFormer from .nasfcos import NASFCOS from .paa import PAA from .panoptic_fpn import PanopticFPN @@ -49,5 +50,6 @@ 'NASFCOS', 'PointRend', 'GFL', 'CornerNet', 'PAA', 'YOLOV3', 'YOLACT', 'VFNet', 'DETR', 'TridentFasterRCNN', 'SparseRCNN', 'SCNet', 'SOLO', 'DeformableDETR', 'AutoAssign', 'YOLOF', 'CenterNet', 'YOLOX', - 'TwoStagePanopticSegmentor', 'PanopticFPN', 'QueryInst', 'LAD', 'TOOD' + 'TwoStagePanopticSegmentor', 'PanopticFPN', 'QueryInst', 'LAD', 'TOOD', + 'MaskFormer' ] diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py new file mode 100644 index 00000000000..17c5d6c895c --- /dev/null +++ b/mmdet/models/detectors/maskformer.py @@ -0,0 +1,106 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ..builder import DETECTORS, build_backbone, build_head, build_neck +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module() +class MaskFormer(SingleStageDetector): + r"""Implementation of `Per-Pixel Classification is + NOT All You Need for Semantic Segmentation + `_""" + + def __init__(self, + backbone, + neck=None, + panoptic_head=None, + train_cfg=None, + test_cfg=None, + init_cfg=None): + super(SingleStageDetector, self).__init__(init_cfg=init_cfg) + self.backbone = build_backbone(backbone) + if neck is not None: + self.neck = build_neck(neck) + panoptic_head.update(train_cfg=train_cfg) + panoptic_head.update(test_cfg=test_cfg) + self.panoptic_head = build_head(panoptic_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def forward_dummy(self, img, img_metas): + """Used for computing network flops. See + `mmdetection/tools/analysis_tools/get_flops.py` + + Args: + img (Tensor): of shape (N, C, H, W) encoding input images. + Typically these should be mean centered and std scaled. + img_metas (list[Dict]): list of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + """ + super(SingleStageDetector, self).forward_train(img, img_metas) + x = self.extract_feat(img) + outs = self.panoptic_head(x, img_metas) + return outs + + def forward_train(self, + img, + img_metas, + gt_bboxes, + gt_labels, + gt_masks, + gt_semantic_seg, + gt_bboxes_ignore=None, + **kargs): + """ + Args: + img (Tensor): of shape (N, C, H, W) encoding input images. + Typically these should be mean centered and std scaled. + img_metas (list[Dict]): list of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box. 
+ gt_masks (list[BitmapMasks]): true segmentation masks for each box + used if the architecture supports a segmentation task. + gt_semantic_seg (list[tensor]): semantic segmentation mask for + images. + gt_bboxes_ignore (list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + Defaults to None. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + # add batch_input_shape in img_metas + super(SingleStageDetector, self).forward_train(img, img_metas) + x = self.extract_feat(img) + losses = self.panoptic_head.forward_train(x, img_metas, gt_bboxes, + gt_labels, gt_masks, + gt_semantic_seg, + gt_bboxes_ignore) + + return losses + + def simple_test(self, img, img_metas, **kwargs): + """Test without augmentation.""" + feat = self.extract_feat(img) + mask_results = self.panoptic_head.simple_test(feat, img_metas, + **kwargs) + + results = [] + for mask in mask_results: + result = {'pan_results': mask.detach().cpu().numpy()} + results.append(result) + + return results + + def aug_test(self, imgs, img_metas, **kwargs): + raise NotImplementedError + + def onnx_export(self, img, img_metas): + raise NotImplementedError diff --git a/mmdet/models/losses/dice_loss.py b/mmdet/models/losses/dice_loss.py index 121367a36ec..585beeaf1c6 100644 --- a/mmdet/models/losses/dice_loss.py +++ b/mmdet/models/losses/dice_loss.py @@ -11,10 +11,16 @@ def dice_loss(pred, weight=None, eps=1e-3, reduction='mean', + naive_dice=False, avg_factor=None): - """Calculate dice loss, which is proposed in - `V-Net: Fully Convolutional Neural Networks for Volumetric - Medical Image Segmentation `_. + """Calculate dice loss, there are two forms of dice loss is supported: + + - the one proposed in `V-Net: Fully Convolutional Neural + Networks for Volumetric Medical Image Segmentation + `_. + - the dice loss in which the power of the number in the + denominator is the first power instead of the second + power. Args: pred (torch.Tensor): The prediction, has a shape (n, *) @@ -26,6 +32,11 @@ def dice_loss(pred, reduction (str, optional): The method used to reduce the loss into a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum". + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power.Defaults to False. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. """ @@ -34,9 +45,15 @@ def dice_loss(pred, target = target.flatten(1).float() a = torch.sum(input * target, 1) - b = torch.sum(input * input, 1) + eps - c = torch.sum(target * target, 1) + eps - d = (2 * a) / (b + c) + if naive_dice: + b = torch.sum(input, 1) + c = torch.sum(target, 1) + d = (2 * a + eps) / (b + c + eps) + else: + b = torch.sum(input * input, 1) + eps + c = torch.sum(target * target, 1) + eps + d = (2 * a) / (b + c) + loss = 1 - d if weight is not None: assert weight.ndim == loss.ndim @@ -52,11 +69,10 @@ def __init__(self, use_sigmoid=True, activate=True, reduction='mean', + naive_dice=False, loss_weight=1.0, eps=1e-3): - """`Dice Loss, which is proposed in - `V-Net: Fully Convolutional Neural Networks for Volumetric - Medical Image Segmentation `_. + """Compute dice loss. Args: use_sigmoid (bool, optional): Whether to the prediction is @@ -67,6 +83,11 @@ def __init__(self, reduction (str, optional): The method used to reduce the loss. Options are "none", "mean" and "sum". Defaults to 'mean'. 
+ naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power. Defaults to False. loss_weight (float, optional): Weight of loss. Defaults to 1.0. eps (float): Avoid dividing by zero. Defaults to 1e-3. """ @@ -74,6 +95,7 @@ def __init__(self, super(DiceLoss, self).__init__() self.use_sigmoid = use_sigmoid self.reduction = reduction + self.naive_dice = naive_dice self.loss_weight = loss_weight self.eps = eps self.activate = activate @@ -118,6 +140,7 @@ def forward(self, weight, eps=self.eps, reduction=reduction, + naive_dice=self.naive_dice, avg_factor=avg_factor) return loss diff --git a/mmdet/models/plugins/__init__.py b/mmdet/models/plugins/__init__.py index a4368551ddc..940d94e884a 100644 --- a/mmdet/models/plugins/__init__.py +++ b/mmdet/models/plugins/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from .dropblock import DropBlock +from .pixel_decoder import PixelDecoder, TransformerEncoderPixelDecoder -__all__ = ['DropBlock'] +__all__ = ['DropBlock', 'PixelDecoder', 'TransformerEncoderPixelDecoder'] diff --git a/mmdet/models/plugins/pixel_decoder.py b/mmdet/models/plugins/pixel_decoder.py new file mode 100644 index 00000000000..f69daf46f9a --- /dev/null +++ b/mmdet/models/plugins/pixel_decoder.py @@ -0,0 +1,245 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import PLUGIN_LAYERS, Conv2d, ConvModule, caffe2_xavier_init +from mmcv.cnn.bricks.transformer import (build_positional_encoding, + build_transformer_layer_sequence) +from mmcv.runner import BaseModule, ModuleList + + +@PLUGIN_LAYERS.register_module() +class PixelDecoder(BaseModule): + """Pixel decoder with a structure like fpn. + + Args: + in_channels (list[int] | tuple[int]): Number of channels in the + input feature maps. + feat_channels (int): Number channels for feature. + out_channels (int): Number channels for output. + norm_cfg (obj:`mmcv.ConfigDict`|dict): Config for normalization. + Defaults to dict(type='GN', num_groups=32). + act_cfg (obj:`mmcv.ConfigDict`|dict): Config for activation. + Defaults to dict(type='ReLU'). + encoder (obj:`mmcv.ConfigDict`|dict): Config for transorformer + encoder.Defaults to None. + positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for + transformer encoder position encoding. Defaults to + dict(type='SinePositionalEncoding', num_feats=128, + normalize=True). + init_cfg (obj:`mmcv.ConfigDict`|dict): Initialization config dict. 
+ Default: None + """ + + def __init__(self, + in_channels, + feat_channels, + out_channels, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_inputs = len(in_channels) + self.lateral_convs = ModuleList() + self.output_convs = ModuleList() + self.use_bias = norm_cfg is None + for i in range(0, self.num_inputs - 1): + l_conv = ConvModule( + in_channels[i], + feat_channels, + kernel_size=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=None) + o_conv = ConvModule( + feat_channels, + feat_channels, + kernel_size=3, + stride=1, + padding=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.lateral_convs.append(l_conv) + self.output_convs.append(o_conv) + + self.last_feat_conv = ConvModule( + in_channels[-1], + feat_channels, + kernel_size=3, + padding=1, + stride=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.mask_feature = Conv2d( + feat_channels, out_channels, kernel_size=3, stride=1, padding=1) + + def init_weights(self): + """Initialize weights.""" + for i in range(0, self.num_inputs - 2): + caffe2_xavier_init(self.lateral_convs[i].conv, bias=0) + caffe2_xavier_init(self.output_convs[i].conv, bias=0) + + caffe2_xavier_init(self.mask_feature, bias=0) + caffe2_xavier_init(self.last_feat_conv, bias=0) + + def forward(self, feats, img_metas): + """ + Args: + feats (list[Tensor]): Feature maps of each level. Each has + shape of (batch_size, c, h, w). + img_metas (list[dict]): List of image information. Pass in + for creating more accurate padding mask. Not used here. + + Returns: + tuple: a tuple containing the following: + + - mask_feature (Tensor): Shape (batch_size, c, h, w). + - memory (Tensor): Output of last stage of backbone.\ + Shape (batch_size, c, h, w). + """ + y = self.last_feat_conv(feats[-1]) + for i in range(self.num_inputs - 2, -1, -1): + x = feats[i] + cur_fpn = self.lateral_convs[i](x) + y = cur_fpn + \ + F.interpolate(y, size=cur_fpn.shape[-2:], mode='nearest') + y = self.output_convs[i](y) + + mask_feature = self.mask_feature(y) + memory = feats[-1] + return mask_feature, memory + + +@PLUGIN_LAYERS.register_module() +class TransformerEncoderPixelDecoder(PixelDecoder): + """Pixel decoder with transormer encoder inside. + + Args: + in_channels (list[int] | tuple[int]): Number of channels in the + input feature maps. + feat_channels (int): Number channels for feature. + out_channels (int): Number channels for output. + norm_cfg (obj:`mmcv.ConfigDict`|dict): Config for normalization. + Defaults to dict(type='GN', num_groups=32). + act_cfg (obj:`mmcv.ConfigDict`|dict): Config for activation. + Defaults to dict(type='ReLU'). + encoder (obj:`mmcv.ConfigDict`|dict): Config for transorformer + encoder.Defaults to None. + positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for + transformer encoder position encoding. Defaults to + dict(type='SinePositionalEncoding', num_feats=128, + normalize=True). + init_cfg (obj:`mmcv.ConfigDict`|dict): Initialization config dict. 
+ Default: None + """ + + def __init__(self, + in_channels, + feat_channels, + out_channels, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=None, + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=128, + normalize=True), + init_cfg=None): + super(TransformerEncoderPixelDecoder, self).__init__( + in_channels, + feat_channels, + out_channels, + norm_cfg, + act_cfg, + init_cfg=init_cfg) + self.last_feat_conv = None + + self.encoder = build_transformer_layer_sequence(encoder) + self.encoder_embed_dims = self.encoder.embed_dims + assert self.encoder_embed_dims == feat_channels, 'embed_dims({}) of ' \ + 'tranformer encoder must equal to feat_channels({})'.format( + feat_channels, self.encoder_embed_dims) + self.positional_encoding = build_positional_encoding( + positional_encoding) + self.encoder_in_proj = Conv2d( + in_channels[-1], feat_channels, kernel_size=1) + self.encoder_out_proj = ConvModule( + feat_channels, + feat_channels, + kernel_size=3, + stride=1, + padding=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def init_weights(self): + """Initialize weights.""" + for i in range(0, self.num_inputs - 2): + caffe2_xavier_init(self.lateral_convs[i].conv, bias=0) + caffe2_xavier_init(self.output_convs[i].conv, bias=0) + + caffe2_xavier_init(self.mask_feature, bias=0) + caffe2_xavier_init(self.encoder_in_proj, bias=0) + caffe2_xavier_init(self.encoder_out_proj.conv, bias=0) + + for p in self.encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, feats, img_metas): + """ + Args: + feats (list[Tensor]): Feature maps of each level. Each has + shape of (batch_size, c, h, w). + img_metas (list[dict]): List of image information. Pass in + for creating more accurate padding mask. + + Returns: + tuple: a tuple containing the following: + + - mask_feature (Tensor): shape (batch_size, c, h, w). + - memory (Tensor): shape (batch_size, c, h, w). 
+ """ + feat_last = feats[-1] + bs, c, h, w = feat_last.shape + input_img_h, input_img_w = img_metas[0]['batch_input_shape'] + padding_mask = feat_last.new_ones((bs, input_img_h, input_img_w), + dtype=torch.float32) + for i in range(bs): + img_h, img_w, _ = img_metas[i]['img_shape'] + padding_mask[i, :img_h, :img_w] = 0 + padding_mask = F.interpolate( + padding_mask.unsqueeze(1), + size=feat_last.shape[-2:], + mode='nearest').to(torch.bool).squeeze(1) + + pos_embed = self.positional_encoding(padding_mask) + feat_last = self.encoder_in_proj(feat_last) + # (batch_size, c, h, w) -> (num_queries, batch_size, c) + feat_last = feat_last.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + # (batch_size, h, w) -> (batch_size, h*w) + padding_mask = padding_mask.flatten(1) + memory = self.encoder( + query=feat_last, + key=None, + value=None, + query_pos=pos_embed, + query_key_padding_mask=padding_mask) + # (num_queries, batch_size, c) -> (batch_size, c, h, w) + memory = memory.permute(1, 2, 0).view(bs, self.encoder_embed_dims, h, + w) + y = self.encoder_out_proj(memory) + for i in range(self.num_inputs - 2, -1, -1): + x = feats[i] + cur_fpn = self.lateral_convs[i](x) + y = cur_fpn + \ + F.interpolate(y, size=cur_fpn.shape[-2:], mode='nearest') + y = self.output_convs[i](y) + + mask_feature = self.mask_feature(y) + return mask_feature, memory diff --git a/mmdet/models/utils/__init__.py b/mmdet/models/utils/__init__.py index 84dc141e850..add5693b60c 100644 --- a/mmdet/models/utils/__init__.py +++ b/mmdet/models/utils/__init__.py @@ -9,6 +9,7 @@ from .make_divisible import make_divisible from .misc import interpolate_as, sigmoid_geometric_mean from .normed_predictor import NormedConv2d, NormedLinear +from .panoptic_gt_processing import preprocess_panoptic_gt from .positional_encoding import (LearnedPositionalEncoding, SinePositionalEncoding) from .res_layer import ResLayer, SimplifiedBasicBlock @@ -25,5 +26,6 @@ 'NormedLinear', 'NormedConv2d', 'make_divisible', 'InvertedResidual', 'SELayer', 'interpolate_as', 'ConvUpsample', 'CSPLayer', 'adaptive_avg_pool2d', 'AdaptiveAvgPool2d', 'PatchEmbed', 'nchw_to_nlc', - 'nlc_to_nchw', 'pvt_convert', 'sigmoid_geometric_mean', 'DyReLU' + 'nlc_to_nchw', 'pvt_convert', 'sigmoid_geometric_mean', + 'preprocess_panoptic_gt', 'DyReLU' ] diff --git a/mmdet/models/utils/panoptic_gt_processing.py b/mmdet/models/utils/panoptic_gt_processing.py new file mode 100644 index 00000000000..513f644945c --- /dev/null +++ b/mmdet/models/utils/panoptic_gt_processing.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things, + num_stuff): + """Preprocess the ground truth for a image. + + Args: + gt_labels (Tensor): Ground truth labels of each bbox, + with shape (num_gts, ). + gt_masks (BitmapMasks): Ground truth masks of each instances + of a image, shape (num_gts, h, w). + gt_semantic_seg (Tensor): Ground truth of semantic + segmentation with the shape (1, h, w). + [0, num_thing_class - 1] means things, + [num_thing_class, num_class-1] means stuff, + 255 means VOID. + target_shape (tuple[int]): Shape of output mask_preds. + Resize the masks to shape of mask_preds. + + Returns: + tuple: a tuple containing the following targets. + + - labels (Tensor): Ground truth class indices for a + image, with shape (n, ), n is the sum of number + of stuff type and number of instance in a image. 
+ - masks (Tensor): Ground truth mask for a image, with + shape (n, h, w). + """ + num_classes = num_things + num_stuff + things_labels = gt_labels + gt_semantic_seg = gt_semantic_seg.squeeze(0) + + things_masks = gt_masks.pad(gt_semantic_seg.shape[-2:], pad_val=0)\ + .to_tensor(dtype=torch.bool, device=gt_labels.device) + + semantic_labels = torch.unique( + gt_semantic_seg, + sorted=False, + return_inverse=False, + return_counts=False) + stuff_masks_list = [] + stuff_labels_list = [] + for label in semantic_labels: + if label < num_things or label >= num_classes: + continue + stuff_mask = gt_semantic_seg == label + stuff_masks_list.append(stuff_mask) + stuff_labels_list.append(label) + + if len(stuff_masks_list) > 0: + stuff_masks = torch.stack(stuff_masks_list, dim=0) + stuff_labels = torch.stack(stuff_labels_list, dim=0) + labels = torch.cat([things_labels, stuff_labels], dim=0) + masks = torch.cat([things_masks, stuff_masks], dim=0) + else: + labels = things_labels + masks = things_masks + + masks = masks.long() + return labels, masks diff --git a/tests/test_models/test_dense_heads/test_maskformer_head.py b/tests/test_models/test_dense_heads/test_maskformer_head.py new file mode 100644 index 00000000000..e70f09afe3f --- /dev/null +++ b/tests/test_models/test_dense_heads/test_maskformer_head.py @@ -0,0 +1,203 @@ +import numpy as np +import torch +from mmcv import ConfigDict + +from mmdet.core.mask import BitmapMasks +from mmdet.models.dense_heads import MaskFormerHead + + +def test_maskformer_head_loss(): + """Tests head loss when truth is empty and non-empty.""" + base_channels = 64 + # batch_input_shape = (128, 160) + img_metas = [{ + 'batch_input_shape': (128, 160), + 'img_shape': (126, 160, 3), + 'ori_shape': (63, 80, 3) + }, { + 'batch_input_shape': (128, 160), + 'img_shape': (120, 160, 3), + 'ori_shape': (60, 80, 3) + }] + feats = [ + torch.rand((2, 64 * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i))) + for i in range(4) + ] + + config = ConfigDict( + dict( + type='MaskFormerHead', + in_channels=[base_channels * 2**i for i in range(4)], + feat_channels=base_channels, + out_channels=base_channels, + num_things_classes=80, + num_stuff_classes=53, + num_queries=100, + pixel_decoder=dict( + type='TransformerEncoderPixelDecoder', + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=base_channels, + num_heads=8, + attn_drop=0.1, + proj_drop=0.1, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=base_channels, + feedforward_channels=base_channels * 8, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.1, + dropout_layer=None, + add_identity=True), + operation_order=('self_attn', 'norm', 'ffn', 'norm'), + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=False), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=base_channels // 2, + normalize=True)), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=base_channels // 2, + normalize=True), + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=base_channels, + num_heads=8, + attn_drop=0.1, + proj_drop=0.1, + dropout_layer=None, + 
batch_first=False), + ffn_cfgs=dict( + embed_dims=base_channels, + feedforward_channels=base_channels * 8, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.1, + dropout_layer=None, + add_identity=True), + # the following parameter was not used, + # just make current api happy + feedforward_channels=base_channels * 8, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + reduction='mean', + class_weight=1.0), + loss_mask=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=20.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=1.0), + train_cfg=dict( + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=1.0), + mask_cost=dict( + type='FocalLossCost', weight=20.0, binary_input=True), + dice_cost=dict( + type='DiceCost', weight=1.0, pred_act=True, eps=1.0)), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict(object_mask_thr=0.8, iou_thr=0.8))) + self = MaskFormerHead(**config) + self.init_weights() + all_cls_scores, all_mask_preds = self.forward(feats, img_metas) + # Test that empty ground truth encourages the network to predict background + gt_labels_list = [torch.LongTensor([]), torch.LongTensor([])] + gt_masks_list = [ + torch.zeros((0, 128, 160)).long(), + torch.zeros((0, 128, 160)).long() + ] + + empty_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list, + gt_masks_list, img_metas) + # When there is no truth, the cls loss should be nonzero but there should + # be no mask loss. 
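+    # With no ground truth, every query is assigned to the 'no object' class,
+    # so only the classification term is non-zero; the mask and dice terms
+    # hit the zero-match branch in loss_single and reduce to 0.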
+ for key, loss in empty_gt_losses.items(): + if 'cls' in key: + assert loss.item() > 0, 'cls loss should be non-zero' + elif 'mask' in key: + assert loss.item( + ) == 0, 'there should be no mask loss when there are no true mask' + elif 'dice' in key: + assert loss.item( + ) == 0, 'there should be no dice loss when there are no true mask' + + # when truth is non-empty then both cls, mask, dice loss should be nonzero + # random inputs + gt_labels_list = [ + torch.tensor([10, 100]).long(), + torch.tensor([100, 10]).long() + ] + mask1 = torch.zeros((2, 128, 160)).long() + mask1[0, :50] = 1 + mask1[1, 50:] = 1 + mask2 = torch.zeros((2, 128, 160)).long() + mask2[0, :, :50] = 1 + mask2[1, :, 50:] = 1 + gt_masks_list = [mask1, mask2] + two_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list, + gt_masks_list, img_metas) + for loss in two_gt_losses.values(): + assert loss.item() > 0, 'all loss should be non-zero' + + # test forward_train + gt_bboxes = None + gt_labels = [ + torch.tensor([10]).long(), + torch.tensor([10]).long(), + ] + thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32) + thing_mask1[0, :50] = 1 + thing_mask2 = np.zeros((1, 128, 160), dtype=np.int32) + thing_mask2[0, :, 50:] = 1 + gt_masks = [ + BitmapMasks(thing_mask1, 128, 160), + BitmapMasks(thing_mask2, 128, 160), + ] + stuff_mask1 = torch.zeros((1, 128, 160)).long() + stuff_mask1[0, :50] = 10 + stuff_mask1[0, 50:] = 100 + stuff_mask2 = torch.zeros((1, 128, 160)).long() + stuff_mask2[0, :, 50:] = 10 + stuff_mask2[0, :, :50] = 100 + gt_semantic_seg = [stuff_mask1, stuff_mask2] + + self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks, + gt_semantic_seg) + + # test inference mode + self.simple_test(feats, img_metas) diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py index db75b2fd418..6b28ba61514 100644 --- a/tests/test_models/test_forward.py +++ b/tests/test_models/test_forward.py @@ -700,3 +700,114 @@ def test_yolox_random_size(): gt_labels=gt_labels, return_loss=True) assert detector._input_size == (64, 96) + + +def test_maskformer_forward(): + model_cfg = _get_detector_cfg( + 'maskformer/maskformer_r50_mstrain_16x1_75e_coco.py') + base_channels = 32 + model_cfg.backbone.depth = 18 + model_cfg.backbone.init_cfg = None + model_cfg.backbone.base_channels = base_channels + model_cfg.panoptic_head.in_channels = [ + base_channels * 2**i for i in range(4) + ] + model_cfg.panoptic_head.feat_channels = base_channels + model_cfg.panoptic_head.out_channels = base_channels + model_cfg.panoptic_head.pixel_decoder.encoder.\ + transformerlayers.attn_cfgs.embed_dims = base_channels + model_cfg.panoptic_head.pixel_decoder.encoder.\ + transformerlayers.ffn_cfgs.embed_dims = base_channels + model_cfg.panoptic_head.pixel_decoder.encoder.\ + transformerlayers.ffn_cfgs.feedforward_channels = base_channels * 8 + model_cfg.panoptic_head.pixel_decoder.\ + positional_encoding.num_feats = base_channels // 2 + model_cfg.panoptic_head.positional_encoding.\ + num_feats = base_channels // 2 + model_cfg.panoptic_head.transformer_decoder.\ + transformerlayers.attn_cfgs.embed_dims = base_channels + model_cfg.panoptic_head.transformer_decoder.\ + transformerlayers.ffn_cfgs.embed_dims = base_channels + model_cfg.panoptic_head.transformer_decoder.\ + transformerlayers.ffn_cfgs.feedforward_channels = base_channels * 8 + model_cfg.panoptic_head.transformer_decoder.\ + transformerlayers.feedforward_channels = base_channels * 8 + + from mmdet.core import BitmapMasks + from mmdet.models import 
build_detector + detector = build_detector(model_cfg) + + # Test forward train with non-empty truth batch + detector.train() + img_metas = [ + { + 'batch_input_shape': (128, 160), + 'img_shape': (126, 160, 3), + 'ori_shape': (63, 80, 3), + 'pad_shape': (128, 160, 3) + }, + ] + img = torch.rand((1, 3, 128, 160)) + gt_bboxes = None + gt_labels = [ + torch.tensor([10]).long(), + ] + thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32) + thing_mask1[0, :50] = 1 + gt_masks = [ + BitmapMasks(thing_mask1, 128, 160), + ] + stuff_mask1 = torch.zeros((1, 128, 160)).long() + stuff_mask1[0, :50] = 10 + stuff_mask1[0, 50:] = 100 + gt_semantic_seg = [ + stuff_mask1, + ] + losses = detector.forward( + img=img, + img_metas=img_metas, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_masks=gt_masks, + gt_semantic_seg=gt_semantic_seg, + return_loss=True) + assert isinstance(losses, dict) + loss, _ = detector._parse_losses(losses) + assert float(loss.item()) > 0 + + # Test forward train with an empty truth batch + gt_bboxes = [ + torch.empty((0, 4)).float(), + ] + gt_labels = [ + torch.empty((0, )).long(), + ] + mask = np.zeros((0, 128, 160), dtype=np.uint8) + gt_masks = [ + BitmapMasks(mask, 128, 160), + ] + gt_semantic_seg = [ + torch.randint(0, 133, (0, 128, 160)), + ] + losses = detector.forward( + img, + img_metas, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_masks=gt_masks, + gt_semantic_seg=gt_semantic_seg, + return_loss=True) + assert isinstance(losses, dict) + loss, _ = detector._parse_losses(losses) + assert float(loss.item()) > 0 + + # Test forward test + detector.eval() + with torch.no_grad(): + img_list = [g[None, :] for g in img] + batch_results = [] + for one_img, one_meta in zip(img_list, img_metas): + result = detector.forward([one_img], [[one_meta]], + rescale=True, + return_loss=False) + batch_results.append(result) diff --git a/tests/test_models/test_loss.py b/tests/test_models/test_loss.py index 101e5efef5d..380bc3263f7 100644 --- a/tests/test_models/test_loss.py +++ b/tests/test_models/test_loss.py @@ -165,51 +165,55 @@ def test_loss_with_ignore_index(use_sigmoid): assert torch.allclose(loss, loss_with_forward_ignore) -def test_dice_loss(): +@pytest.mark.parametrize('naive_dice', [True, False]) +def test_dice_loss(naive_dice): loss_class = DiceLoss pred = torch.rand((10, 4, 4)) target = torch.rand((10, 4, 4)) weight = torch.rand((10)) # Test loss forward - loss = loss_class()(pred, target) + loss = loss_class(naive_dice=naive_dice)(pred, target) assert isinstance(loss, torch.Tensor) # Test loss forward with weight - loss = loss_class()(pred, target, weight) + loss = loss_class(naive_dice=naive_dice)(pred, target, weight) assert isinstance(loss, torch.Tensor) # Test loss forward with reduction_override - loss = loss_class()(pred, target, reduction_override='mean') + loss = loss_class(naive_dice=naive_dice)( + pred, target, reduction_override='mean') assert isinstance(loss, torch.Tensor) # Test loss forward with avg_factor - loss = loss_class()(pred, target, avg_factor=10) + loss = loss_class(naive_dice=naive_dice)(pred, target, avg_factor=10) assert isinstance(loss, torch.Tensor) with pytest.raises(ValueError): # loss can evaluate with avg_factor only if # reduction is None, 'none' or 'mean'. 
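+        # (the ValueError comes from mmdet's weight_reduce_loss helper)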
reduction_override = 'sum' - loss_class()( + loss_class(naive_dice=naive_dice)( pred, target, avg_factor=10, reduction_override=reduction_override) # Test loss forward with avg_factor and reduction for reduction_override in [None, 'none', 'mean']: - loss_class()( + loss_class(naive_dice=naive_dice)( pred, target, avg_factor=10, reduction_override=reduction_override) assert isinstance(loss, torch.Tensor) # Test loss forward with has_acted=False and use_sigmoid=False with pytest.raises(NotImplementedError): - loss_class(use_sigmoid=False, activate=True)(pred, target) + loss_class( + use_sigmoid=False, activate=True, naive_dice=naive_dice)(pred, + target) # Test loss forward with weight.ndim != loss.ndim with pytest.raises(AssertionError): weight = torch.rand((2, 8)) - loss_class()(pred, target, weight) + loss_class(naive_dice=naive_dice)(pred, target, weight) # Test loss forward with len(weight) != len(pred) with pytest.raises(AssertionError): weight = torch.rand((8)) - loss_class()(pred, target, weight) + loss_class(naive_dice=naive_dice)(pred, target, weight) diff --git a/tests/test_models/test_plugins.py b/tests/test_models/test_plugins.py index 59416b20de2..b115fbd73f2 100644 --- a/tests/test_models/test_plugins.py +++ b/tests/test_models/test_plugins.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import pytest import torch +from mmcv import ConfigDict +from mmcv.cnn import build_plugin_layer from mmdet.models.plugins import DropBlock @@ -27,3 +29,83 @@ def test_dropblock(): # warmup_iters cannot be less than 0 with pytest.raises(AssertionError): DropBlock(0.5, 3, -1) + + +def test_pixeldecoder(): + base_channels = 64 + pixel_decoder_cfg = ConfigDict( + dict( + type='PixelDecoder', + in_channels=[base_channels * 2**i for i in range(4)], + feat_channels=base_channels, + out_channels=base_channels, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'))) + self = build_plugin_layer(pixel_decoder_cfg)[1] + img_metas = [{}, {}] + feats = [ + torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i))) + for i in range(4) + ] + mask_feature, memory = self(feats, img_metas) + + assert (memory == feats[-1]).all() + assert mask_feature.shape == feats[0].shape + + +def test_transformerencoderpixeldecoer(): + base_channels = 64 + pixel_decoder_cfg = ConfigDict( + dict( + type='TransformerEncoderPixelDecoder', + in_channels=[base_channels * 2**i for i in range(4)], + feat_channels=base_channels, + out_channels=base_channels, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=base_channels, + num_heads=8, + attn_drop=0.1, + proj_drop=0.1, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=base_channels, + feedforward_channels=base_channels * 8, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.1, + dropout_layer=None, + add_identity=True), + operation_order=('self_attn', 'norm', 'ffn', 'norm'), + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=False), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=base_channels // 2, + normalize=True))) + self = build_plugin_layer(pixel_decoder_cfg)[1] + img_metas = [{ + 'batch_input_shape': (128, 160), + 'img_shape': (120, 160, 3), + }, { + 'batch_input_shape': (128, 160), + 'img_shape': (125, 160, 3), + }] + feats = [ + 
torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i))) + for i in range(4) + ] + mask_feature, memory = self(feats, img_metas) + + assert memory.shape[-2:] == feats[-1].shape[-2:] + assert mask_feature.shape == feats[0].shape diff --git a/tests/test_utils/test_assigner.py b/tests/test_utils/test_assigner.py index ca82aeda127..7728510b166 100644 --- a/tests/test_utils/test_assigner.py +++ b/tests/test_utils/test_assigner.py @@ -10,8 +10,9 @@ from mmdet.core.bbox.assigners import (ApproxMaxIoUAssigner, CenterRegionAssigner, HungarianAssigner, - MaxIoUAssigner, PointAssigner, - TaskAlignedAssigner, UniformAssigner) + MaskHungarianAssigner, MaxIoUAssigner, + PointAssigner, TaskAlignedAssigner, + UniformAssigner) def test_max_iou_assigner(): @@ -539,3 +540,69 @@ def test_task_aligned_assigner(): pred_score, pred_bbox, anchor, gt_bboxes=gt_bboxes) expected_gt_inds = torch.LongTensor([0, 0, 0, 0]) assert torch.all(assign_result.gt_inds == expected_gt_inds) + + +def test_mask_hungarian_match_assigner(): + # test no gt masks + assigner_cfg = dict( + cls_cost=dict(type='ClassificationCost', weight=1.0), + mask_cost=dict(type='FocalLossCost', weight=20.0, binary_input=True), + dice_cost=dict(type='DiceCost', weight=1.0, pred_act=True, eps=1.0)) + self = MaskHungarianAssigner(**assigner_cfg) + cls_pred = torch.rand((10, 133)) + mask_pred = torch.rand((10, 50, 50)) + + gt_labels = torch.empty((0, )).long() + gt_masks = torch.empty((0, 50, 50)).float() + img_meta = None + assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks, + img_meta) + assert torch.all(assign_result.gt_inds == 0) + assert torch.all(assign_result.labels == -1) + + # test with gt masks + gt_labels = torch.LongTensor([10, 100]) + gt_masks = torch.zeros((2, 50, 50)).long() + gt_masks[0, :25] = 1 + gt_masks[0, 25:] = 1 + assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks, + img_meta) + assert torch.all(assign_result.gt_inds > -1) + assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0) + assert (assign_result.labels > -1).sum() == gt_labels.size(0) + + # test with cls mode + assigner_cfg = dict( + cls_cost=dict(type='ClassificationCost', weight=1.0), + mask_cost=dict(type='FocalLossCost', weight=0.0, binary_input=True), + dice_cost=dict(type='DiceCost', weight=0.0, pred_act=True, eps=1.0)) + self = MaskHungarianAssigner(**assigner_cfg) + assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks, + img_meta) + assert torch.all(assign_result.gt_inds > -1) + assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0) + assert (assign_result.labels > -1).sum() == gt_labels.size(0) + + # test with mask focal mode + assigner_cfg = dict( + cls_cost=dict(type='ClassificationCost', weight=0.0), + mask_cost=dict(type='FocalLossCost', weight=1.0, binary_input=True), + dice_cost=dict(type='DiceCost', weight=0.0, pred_act=True, eps=1.0)) + self = MaskHungarianAssigner(**assigner_cfg) + assign_result = self.assign(cls_pred, mask_pred, gt_labels, gt_masks, + img_meta) + assert torch.all(assign_result.gt_inds > -1) + assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0) + assert (assign_result.labels > -1).sum() == gt_labels.size(0) + + # test with mask dice mode + assigner_cfg = dict( + cls_cost=dict(type='ClassificationCost', weight=0.0), + mask_cost=dict(type='FocalLossCost', weight=0.0, binary_input=True), + dice_cost=dict(type='DiceCost', weight=1.0, pred_act=True, eps=1.0)) + self = MaskHungarianAssigner(**assigner_cfg) + assign_result = self.assign(cls_pred, mask_pred, 
gt_labels, gt_masks, + img_meta) + assert torch.all(assign_result.gt_inds > -1) + assert (assign_result.gt_inds > 0).sum() == gt_labels.size(0) + assert (assign_result.labels > -1).sum() == gt_labels.size(0) From 5675dacdde67a8ee85221f7b6f76f80a47323cd9 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Thu, 24 Feb 2022 14:05:07 +0800 Subject: [PATCH 19/27] Add deprecation message for deploy tools (#7242) --- tools/deployment/onnx2tensorrt.py | 12 ++++++++++++ tools/deployment/pytorch2onnx.py | 12 ++++++++++++ tools/deployment/test.py | 15 ++++++++++++++- 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tools/deployment/onnx2tensorrt.py b/tools/deployment/onnx2tensorrt.py index e3e9b57d2b4..b59e52ae199 100644 --- a/tools/deployment/onnx2tensorrt.py +++ b/tools/deployment/onnx2tensorrt.py @@ -252,3 +252,15 @@ def parse_shape(shape): show=args.show, workspace_size=args.workspace_size, verbose=args.verbose) + + # Following strings of text style are from colorama package + bright_style, reset_style = '\x1b[1m', '\x1b[0m' + red_text, blue_text = '\x1b[31m', '\x1b[34m' + white_background = '\x1b[107m' + + msg = white_background + bright_style + red_text + msg += 'DeprecationWarning: This tool will be deprecated in future. ' + msg += blue_text + 'Welcome to use the unified model deployment toolbox ' + msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' + msg += reset_style + warnings.warn(msg) diff --git a/tools/deployment/pytorch2onnx.py b/tools/deployment/pytorch2onnx.py index c1789b442a7..5c786f8540e 100644 --- a/tools/deployment/pytorch2onnx.py +++ b/tools/deployment/pytorch2onnx.py @@ -343,3 +343,15 @@ def parse_args(): do_simplify=args.simplify, dynamic_export=args.dynamic_export, skip_postprocess=args.skip_postprocess) + + # Following strings of text style are from colorama package + bright_style, reset_style = '\x1b[1m', '\x1b[0m' + red_text, blue_text = '\x1b[31m', '\x1b[34m' + white_background = '\x1b[107m' + + msg = white_background + bright_style + red_text + msg += 'DeprecationWarning: This tool will be deprecated in future. ' + msg += blue_text + 'Welcome to use the unified model deployment toolbox ' + msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' + msg += reset_style + warnings.warn(msg) diff --git a/tools/deployment/test.py b/tools/deployment/test.py index b32b77332e5..afbad176841 100644 --- a/tools/deployment/test.py +++ b/tools/deployment/test.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse +import warnings import mmcv from mmcv import Config, DictAction @@ -140,4 +141,16 @@ def main(): if __name__ == '__main__': - main() + # main() + + # Following strings of text style are from colorama package + bright_style, reset_style = '\x1b[1m', '\x1b[0m' + red_text, blue_text = '\x1b[31m', '\x1b[34m' + white_background = '\x1b[107m' + + msg = white_background + bright_style + red_text + msg += 'DeprecationWarning: This tool will be deprecated in future. 
' + msg += blue_text + 'Welcome to use the unified model deployment toolbox ' + msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' + msg += reset_style + warnings.warn(msg) From 97287795e0dd45cf407ee6193124d1f9adc8d3ce Mon Sep 17 00:00:00 2001 From: Yue Zhou <592267829@qq.com> Date: Thu, 24 Feb 2022 14:13:01 +0800 Subject: [PATCH 20/27] Add CI for windows (#7228) * [Fix] Fix wrong img name in onnx2tensorrt.py (#7157) * [Docs] fix albumentations installed way (#7143) * Add mmrotate (#7216) * fix description for args in CSPDarknet (#7187) * Update build.yml * Update build.yml * Update build.yml * Update build.yml * Update build.yml * Update build.yml * Update build.yml * Update build.yml * Update build.yml * Update build.yml * Update build.yml * Update build.yml * fix test_find_latest_checkpoint * fix data_infos__default_db_directories * fix test_custom_classes_override_default * Update test_custom_dataset.py * Update test_common.py * Update assign_result.py * use os.path.join * fix bug * Update test_common.py * Update assign_result.py * Update sampling_result.py * os.path -> osp * os.path -> osp Co-authored-by: Jamie Co-authored-by: BigDong Co-authored-by: Hyeokjoon Kwon --- .github/workflows/build.yml | 47 +++++++++++++++++++ README.md | 1 + README_zh-CN.md | 1 + mmdet/core/bbox/assigners/assign_result.py | 2 +- mmdet/core/bbox/samplers/sampling_result.py | 2 +- mmdet/models/backbones/csp_darknet.py | 4 +- tests/test_data/test_datasets/test_common.py | 11 ++++- .../test_datasets/test_custom_dataset.py | 31 ++++++------ tests/test_utils/test_misc.py | 19 ++++---- 9 files changed, 87 insertions(+), 31 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 85ad6be0a69..09efe6197ee 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -231,3 +231,50 @@ jobs: env_vars: OS,PYTHON name: codecov-umbrella fail_ci_if_error: false + + build_windows: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [windows-2022] + python: [3.8] + platform: [cpu, cu111] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python }} + - name: Upgrade pip + run: pip install pip --upgrade --user + - name: Install PyTorch + # As a complement to Linux CI, we test on PyTorch LTS version + run: pip install torch==1.8.2+${{ matrix.platform }} torchvision==0.9.2+${{ matrix.platform }} -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html + - name: Install MMCV + run: pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8/index.html --only-binary mmcv-full + - name: Install unittest dependencies + run: | + python -V + python -m pip install pycocotools + python -m pip install -r requirements/tests.txt -r requirements/optional.txt + python -m pip install albumentations>=0.3.2 --no-binary imgaug,albumentations + python -m pip install git+https://github.com/cocodataset/panopticapi.git + python -c 'import mmcv; print(mmcv.__version__)' + - name: Show pip list + run: pip list + - name: Build and install + run: pip install -e . 
+ - name: Run unittests + run: coverage run --branch --source mmdet -m pytest tests -sv + - name: Generate coverage report + run: | + coverage xml + coverage report -m + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v2 + with: + file: ./coverage.xml + flags: unittests + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: false diff --git a/README.md b/README.md index 48b28118c6e..d935ae2750a 100644 --- a/README.md +++ b/README.md @@ -321,3 +321,4 @@ If you use this toolbox or benchmark in your research, please cite this project. - [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. - [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. - [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab Model Compression Toolbox and Benchmark. +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. diff --git a/README_zh-CN.md b/README_zh-CN.md index 441605a8ab3..7f258b30a82 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -324,6 +324,7 @@ MMDetection 是一款由来自不同高校和企业的研发人员共同参与 - [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 - [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 - [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 ## 欢迎加入 OpenMMLab 社区 diff --git a/mmdet/core/bbox/assigners/assign_result.py b/mmdet/core/bbox/assigners/assign_result.py index c1a2d5f371f..488010b5d90 100644 --- a/mmdet/core/bbox/assigners/assign_result.py +++ b/mmdet/core/bbox/assigners/assign_result.py @@ -165,7 +165,7 @@ def random(cls, **kwargs): true_idxs = np.arange(num_gts) rng.shuffle(true_idxs) true_idxs = torch.from_numpy(true_idxs) - gt_inds[is_assigned] = true_idxs[:n_assigned] + gt_inds[is_assigned] = true_idxs[:n_assigned].long() gt_inds = torch.from_numpy( rng.randint(1, num_gts + 1, size=num_preds)) diff --git a/mmdet/core/bbox/samplers/sampling_result.py b/mmdet/core/bbox/samplers/sampling_result.py index 7d106cbeffd..50676d04191 100644 --- a/mmdet/core/bbox/samplers/sampling_result.py +++ b/mmdet/core/bbox/samplers/sampling_result.py @@ -42,7 +42,7 @@ def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, if len(gt_bboxes.shape) < 2: gt_bboxes = gt_bboxes.view(-1, 4) - self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds, :] + self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds.long(), :] if assign_result.labels is not None: self.pos_gt_labels = assign_result.labels[pos_inds] diff --git a/mmdet/models/backbones/csp_darknet.py b/mmdet/models/backbones/csp_darknet.py index db0b8de3286..2bbf3968a81 100644 --- a/mmdet/models/backbones/csp_darknet.py +++ b/mmdet/models/backbones/csp_darknet.py @@ -128,9 +128,9 @@ class CSPDarknet(BaseModule): arch (str): Architecture of CSP-Darknet, from {P5, P6}. Default: P5. deepen_factor (float): Depth multiplier, multiply number of - channels in each layer by this amount. Default: 1.0. - widen_factor (float): Width multiplier, multiply number of blocks in CSP layer by this amount. Default: 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. out_indices (Sequence[int]): Output from which stages. Default: (2, 3, 4). 
frozen_stages (int): Stages to be frozen (stop grad and set eval diff --git a/tests/test_data/test_datasets/test_common.py b/tests/test_data/test_datasets/test_common.py index 911d8e2b3c9..84cac80434d 100644 --- a/tests/test_data/test_datasets/test_common.py +++ b/tests/test_data/test_datasets/test_common.py @@ -3,6 +3,8 @@ import logging import os import os.path as osp +import platform +import shutil import tempfile from unittest.mock import MagicMock, patch @@ -107,8 +109,11 @@ def _create_dummy_results(): def test_dataset_init(config_path): use_symlink = False if not os.path.exists('./data'): - os.symlink('./tests/data', './data') - use_symlink = True + if platform.system() != 'Windows': + os.symlink('./tests/data', './data') + use_symlink = True + else: + shutil.copytree('./tests/data', './data') data_config = mmcv.Config.fromfile(config_path) if 'data' not in data_config: return @@ -119,6 +124,8 @@ def test_dataset_init(config_path): dataset[0] if use_symlink: os.unlink('./data') + else: + shutil.rmtree('./data') def test_dataset_evaluation(): diff --git a/tests/test_data/test_datasets/test_custom_dataset.py b/tests/test_data/test_datasets/test_custom_dataset.py index b9207be3880..4dae46480fa 100644 --- a/tests/test_data/test_datasets/test_custom_dataset.py +++ b/tests/test_data/test_datasets/test_custom_dataset.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -import os +import os.path as osp import unittest from unittest.mock import MagicMock, patch @@ -75,16 +75,16 @@ def test_custom_classes_override_default(dataset): # Test sending file path import tempfile - tmp_file = tempfile.NamedTemporaryFile() - with open(tmp_file.name, 'w') as f: - f.write('bus\ncar\n') + with tempfile.TemporaryDirectory() as tmpdir: + path = tmpdir + 'classes.txt' + with open(path, 'w') as f: + f.write('bus\ncar\n') custom_dataset = dataset_class( ann_file=MagicMock(), pipeline=[], - classes=tmp_file.name, + classes=path, test_mode=True, img_prefix='VOC2007' if dataset == 'VOCDataset' else '') - tmp_file.close() assert custom_dataset.CLASSES != original_classes assert custom_dataset.CLASSES == ['bus', 'car'] @@ -95,35 +95,34 @@ class CustomDatasetTests(unittest.TestCase): def setUp(self): super().setUp() - self.data_dir = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(__file__))), - 'data') + self.data_dir = osp.join( + osp.dirname(osp.dirname(osp.dirname(__file__))), 'data') self.dataset_class = DATASETS.get('XMLDataset') def test_data_infos__default_db_directories(self): """Test correct data read having a Pacal-VOC directory structure.""" - test_dataset_root = os.path.join(self.data_dir, 'VOCdevkit', 'VOC2007') + test_dataset_root = osp.join(self.data_dir, 'VOCdevkit', 'VOC2007') custom_ds = self.dataset_class( data_root=test_dataset_root, - ann_file=os.path.join(test_dataset_root, 'ImageSets', 'Main', - 'trainval.txt'), + ann_file=osp.join(test_dataset_root, 'ImageSets', 'Main', + 'trainval.txt'), pipeline=[], classes=('person', 'dog'), test_mode=True) self.assertListEqual([{ 'id': '000001', - 'filename': 'JPEGImages/000001.jpg', + 'filename': osp.join('JPEGImages', '000001.jpg'), 'width': 353, 'height': 500 }], custom_ds.data_infos) def test_data_infos__overridden_db_subdirectories(self): """Test correct data read having a customized directory structure.""" - test_dataset_root = os.path.join(self.data_dir, 'custom_dataset') + test_dataset_root = osp.join(self.data_dir, 'custom_dataset') custom_ds = self.dataset_class( data_root=test_dataset_root, - 
ann_file=os.path.join(test_dataset_root, 'trainval.txt'), + ann_file=osp.join(test_dataset_root, 'trainval.txt'), pipeline=[], classes=('person', 'dog'), test_mode=True, @@ -133,7 +132,7 @@ def test_data_infos__overridden_db_subdirectories(self): self.assertListEqual([{ 'id': '000001', - 'filename': 'images/000001.jpg', + 'filename': osp.join('images', '000001.jpg'), 'width': 353, 'height': 500 }], custom_ds.data_infos) diff --git a/tests/test_utils/test_misc.py b/tests/test_utils/test_misc.py index de22ad6cb32..80d91143ba1 100644 --- a/tests/test_utils/test_misc.py +++ b/tests/test_utils/test_misc.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp import tempfile import numpy as np @@ -172,32 +173,32 @@ def test_find_latest_checkpoint(): # There are no checkpoints in the path. assert latest is None - path = tmpdir + '/none' + path = osp.join(tmpdir, 'none') latest = find_latest_checkpoint(path) # The path does not exist. assert latest is None with tempfile.TemporaryDirectory() as tmpdir: - with open(tmpdir + '/latest.pth', 'w') as f: + with open(osp.join(tmpdir, 'latest.pth'), 'w') as f: f.write('latest') path = tmpdir latest = find_latest_checkpoint(path) - assert latest == tmpdir + '/latest.pth' + assert latest == osp.join(tmpdir, 'latest.pth') with tempfile.TemporaryDirectory() as tmpdir: - with open(tmpdir + '/iter_4000.pth', 'w') as f: + with open(osp.join(tmpdir, 'iter_4000.pth'), 'w') as f: f.write('iter_4000') - with open(tmpdir + '/iter_8000.pth', 'w') as f: + with open(osp.join(tmpdir, 'iter_8000.pth'), 'w') as f: f.write('iter_8000') path = tmpdir latest = find_latest_checkpoint(path) - assert latest == tmpdir + '/iter_8000.pth' + assert latest == osp.join(tmpdir, 'iter_8000.pth') with tempfile.TemporaryDirectory() as tmpdir: - with open(tmpdir + '/epoch_1.pth', 'w') as f: + with open(osp.join(tmpdir, 'epoch_1.pth'), 'w') as f: f.write('epoch_1') - with open(tmpdir + '/epoch_2.pth', 'w') as f: + with open(osp.join(tmpdir, 'epoch_2.pth'), 'w') as f: f.write('epoch_2') path = tmpdir latest = find_latest_checkpoint(path) - assert latest == tmpdir + '/epoch_2.pth' + assert latest == osp.join(tmpdir, 'epoch_2.pth') From 5aca4fa3d41f1363ce7ddcbbe5955b051641515f Mon Sep 17 00:00:00 2001 From: Kevin Ye <1752391457@qq.com> Date: Thu, 24 Feb 2022 14:19:49 +0800 Subject: [PATCH 21/27] add Chinese version of init_cfg (#7188) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [Fix] Fix wrong img name in onnx2tensorrt.py (#7157) * [Docs] fix albumentations installed way (#7143) * Create init_cfg.md * Update docs/zh_cn/tutorials/init_cfg.md Co-authored-by: Haian Huang(深度眸) <1286304229@qq.com> * update init_cfg.md * update init_cfg.md * update init_cfg.md * update init_cfg.md Co-authored-by: Jamie Co-authored-by: BigDong Co-authored-by: Haian Huang(深度眸) <1286304229@qq.com> --- docs/en/tutorials/init_cfg.md | 4 +- docs/zh_cn/tutorials/init_cfg.md | 162 +++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 2 deletions(-) create mode 100644 docs/zh_cn/tutorials/init_cfg.md diff --git a/docs/en/tutorials/init_cfg.md b/docs/en/tutorials/init_cfg.md index 6fc837b638e..69300e66559 100644 --- a/docs/en/tutorials/init_cfg.md +++ b/docs/en/tutorials/init_cfg.md @@ -142,11 +142,11 @@ class FooModel(BaseModule) ```python # It is invalid that override don't have name key - init_cfg = dict(type='Constant', layer ['Conv1d','Conv2d'], val=1, bias=2, + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, 
bias=2, override=dict(type='Constant', val=3, bias=4)) # It is also invalid that override has name and other args except type - init_cfg = dict(type='Constant', layer ['Conv1d','Conv2d'], val=1, bias=2, + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, override=dict(name='reg', val=3, bias=4)) ``` diff --git a/docs/zh_cn/tutorials/init_cfg.md b/docs/zh_cn/tutorials/init_cfg.md new file mode 100644 index 00000000000..bb9d8ba9c49 --- /dev/null +++ b/docs/zh_cn/tutorials/init_cfg.md @@ -0,0 +1,162 @@ +# 教程 10: 权重初始化 + +在训练过程中,适当的初始化策略有利于加快训练速度或获得更⾼的性能。 [MMCV](https://github.com/open-mmlab/mmcv/blob/master/mmcv/cnn/utils/weight_init.py) 提供了一些常⽤的初始化模块的⽅法,如 `nn.Conv2d`。 MMdetection 中的模型初始化主要使⽤ `init_cfg`。⽤⼾可以通过以下两个步骤来初始化模型: + +1. 在 `model_cfg` 中为模型或其组件定义 `init_cfg`,但⼦组件的 `init_cfg` 优先级更⾼,会覆盖⽗模块的 `init_cfg` 。 +2. 像往常一样构建模型,然后显式调⽤ `model.init_weights()` ⽅法,此时模型参数将会被按照配置文件写法进行初始化。 + + +MMdetection 初始化工作流的高层 API 调用流程是: + +model_cfg(init_cfg) -> build_from_cfg -> model -> init_weight() -> initialize(self, self.init_cfg) -> children's init_weight() + +### 描述 + +它的数据类型是 dict 或者 list[dict],包含了下列键值: + +- `type` (str),包含 `INTIALIZERS` 中的初始化器名称,后面跟着初始化器的参数。 +- `layer`(str 或 list[str]),包含 Pytorch 或 MMCV 中基本层的名称,以及将被初始化的可学习参数,例如 `'Conv2d'`,`'DeformConv2d'`。 +- `override` (dict 或 list[dict]),包含不继承⾃ `BaseModule` 且其初始化配置与 `layer` 键中的其他层不同的⼦模块。 `type` 中定义的初始化器将适⽤于 `layer` 中定义的所有层,因此如果⼦模块不是 `BaseModule` 的派⽣类但可以与 `layer` 中的层相同的⽅式初始化,则不需要使⽤ `override`。`override` 包含了: + - `type` 后跟初始化器的参数; + - `name` 用以指⽰将被初始化的⼦模块。 + +### 初始化参数 + +从 `mmcv.runner.BaseModule` 或 `mmdet.models` 继承一个新模型。这里我们用 FooModel 来举个例子。 + +```python +import torch.nn as nn +from mmcv.runner import BaseModule + +class FooModel(BaseModule) + def __init__(self, + arg1, + arg2, + init_cfg=None): + super(FooModel, self).__init__(init_cfg) + ... +``` + +- 直接在代码中使⽤ `init_cfg` 初始化模型 + + ```python + import torch.nn as nn + from mmcv.runner import BaseModule + # or directly inherit mmdet models + + class FooModel(BaseModule) + def __init__(self, + arg1, + arg2, + init_cfg=XXX): + super(FooModel, self).__init__(init_cfg) + ... + ``` + +- 在 `mmcv.Sequential` 或 `mmcv.ModuleList` 代码中直接使⽤ `init_cfg` 初始化模型 + + ```python + from mmcv.runner import BaseModule, ModuleList + + class FooModel(BaseModule) + def __init__(self, + arg1, + arg2, + init_cfg=None): + super(FooModel, self).__init__(init_cfg) + ... + self.conv1 = ModuleList(init_cfg=XXX) + ``` + +- 使⽤配置⽂件中的 `init_cfg` 初始化模型 + + ```python + model = dict( + ... + model = dict( + type='FooModel', + arg1=XXX, + arg2=XXX, + init_cfg=XXX), + ... + ``` + +### init_cfg 的使用 + +1. 用 `layer` 键初始化模型 + + 如果我们只定义了 `layer`, 它只会在 `layer` 键中初始化网络层。 + + 注意: `layer` 键对应的值是 Pytorch 的带有 weights 和 bias 属性的类名(因此不⽀持 `MultiheadAttention` 层)。 + +- 定义⽤于初始化具有相同配置的模块的 `layer` 键。 + + ```python + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) + # ⽤相同的配置初始化整个模块 + ``` + +- 定义⽤于初始化具有不同配置的层的 `layer` 键。 + + ```python + init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Conv2d', val=2), + dict(type='Constant', layer='Linear', val=3)] + # nn.Conv1d 将被初始化为 dict(type='Constant', val=1) + # nn.Conv2d 将被初始化为 dict(type='Constant', val=2) + # nn.Linear 将被初始化为 dict(type='Constant', val=3) + ``` + +2. 
使⽤ `override` 键初始化模型 + +- 当使⽤属性名初始化某些特定部分时,我们可以使⽤ `override` 键, `override` 中的值将忽略 init_cfg 中的值。 + + ```python + # layers: + # self.feat = nn.Conv1d(3, 1, 3) + # self.reg = nn.Conv2d(3, 3, 3) + # self.cls = nn.Linear(1,2) + + init_cfg = dict(type='Constant', + layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', name='reg', val=3, bias=4)) + # self.feat and self.cls 将被初始化为 dict(type='Constant', val=1, bias=2) + # 叫 'reg' 的模块将被初始化为 dict(type='Constant', val=3, bias=4) + ``` + +- 如果 init_cfg 中的 `layer` 为 None,则只会初始化 override 中有 name 的⼦模块,⽽ override 中的 type 和其他参数可以省略。 + + ```python + # layers: + # self.feat = nn.Conv1d(3, 1, 3) + # self.reg = nn.Conv2d(3, 3, 3) + # self.cls = nn.Linear(1,2) + + init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) + + # self.feat and self.cls 将被 Pytorch 初始化 + # 叫 'reg' 的模块将被 dict(type='Constant', val=1, bias=2) 初始化 + ``` + +- 如果我们不定义 `layer` 或 `override` 键,它不会初始化任何东西。 + +- 无效的使用 + + ```python + # override 没有 name 键的话是无效的 + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', val=3, bias=4)) + + # override 有 name 键和其他参数但是没有 type 键也是无效的 + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(name='reg', val=3, bias=4)) + ``` + +3. 使⽤预训练模型初始化模型 + + ```python + init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + ``` + +更多细节可以参考 [MMCV](https://mmcv.readthedocs.io/en/latest/cnn.html#weight-initialization) 的文档和 MMCV [PR #780](https://github.com/open-mmlab/mmcv/pull/780) From bb027f1aff91b14b07adc571fc3c287da6aed40a Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Thu, 24 Feb 2022 16:24:08 +0800 Subject: [PATCH 22/27] update MaskFormer readme and docs (#7241) * update docs for maskformer * update readme * update readme format * update link * update json link * update format of ConfigDict * update format of function returns * uncomment main in deployment/test.py --- configs/maskformer/README.md | 50 +++++------- .../bbox/assigners/mask_hungarian_assigner.py | 6 +- mmdet/models/dense_heads/maskformer_head.py | 76 +++++++++---------- mmdet/models/detectors/maskformer.py | 2 +- mmdet/models/plugins/pixel_decoder.py | 24 +++--- tools/deployment/test.py | 2 +- 6 files changed, 74 insertions(+), 86 deletions(-) diff --git a/configs/maskformer/README.md b/configs/maskformer/README.md index ce1384ae77e..54110004c94 100644 --- a/configs/maskformer/README.md +++ b/configs/maskformer/README.md @@ -1,37 +1,18 @@ -# Per-Pixel Classification is Not All You Need for Semantic Segmentation +# MaskFormer + +> [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) + + ## Abstract -Modern approaches typically formulate semantic segmentation as a per-pixel classification -task, while instance-level segmentation is handled with an alternative mask -classification. Our key insight: mask classification is sufficiently general to solve -both semantic- and instance-level segmentation tasks in a unified manner using -the exact same model, loss, and training procedure. Following this observation, -we propose MaskFormer, a simple mask classification model which predicts a -set of binary masks, each associated with a single global class label prediction. -Overall, the proposed mask classification-based method simplifies the landscape -of effective approaches to semantic and panoptic segmentation tasks and shows -excellent empirical results. 
In particular, we observe that MaskFormer outperforms -per-pixel classification baselines when the number of classes is large. Our mask -classification-based method outperforms both current state-of-the-art semantic -(55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models. +Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models.
    -## Citation - -``` -@inproceedings{cheng2021maskformer, - title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, - author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov}, - journal={NeurIPS}, - year={2021} -} -``` - -## Dataset +## Introduction MaskFormer requires COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) dataset for training and evaluation. You need to download and extract it in the COCO dataset path. The directory should be like this. @@ -55,6 +36,17 @@ mmdetection ## Results and Models -| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | detail | -| :------: | :-----: | :-----: | :------: | :------------: | :-: | :-: | :-: | :---: | :---: | :---: | :---: | :---: | :---: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------: | :---: | -| R-50 | pytorch | 75e | | | | | | | | | | | | [config](https://github.com/open-mmlab/mmdetection/tree/master/configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py) | | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | detail | +|:--------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:--------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------:| +| R-50 | pytorch | 75e | 16.6 | - | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 | [config](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956.log.json) | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | + +## Citation + +```latex +@inproceedings{cheng2021maskformer, + title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, + author={Bowen Cheng and Alexander G. 
Schwing and Alexander Kirillov}, + journal={NeurIPS}, + year={2021} +} +``` diff --git a/mmdet/core/bbox/assigners/mask_hungarian_assigner.py b/mmdet/core/bbox/assigners/mask_hungarian_assigner.py index ef0f35831d6..d10e62edb84 100644 --- a/mmdet/core/bbox/assigners/mask_hungarian_assigner.py +++ b/mmdet/core/bbox/assigners/mask_hungarian_assigner.py @@ -29,9 +29,9 @@ class MaskHungarianAssigner(BaseAssigner): - positive integer: positive sample, index (1-based) of assigned gt Args: - cls_cost (obj:`mmcv.ConfigDict` | dict): Classification cost config. - mask_cost (obj:`mmcv.ConfigDict` | dict): Mask cost config. - dice_cost (obj:`mmcv.ConfigDict` | dict): Dice cost config. + cls_cost (:obj:`mmcv.ConfigDict` | dict): Classification cost config. + mask_cost (:obj:`mmcv.ConfigDict` | dict): Mask cost config. + dice_cost (:obj:`mmcv.ConfigDict` | dict): Dice cost config. """ def __init__(self, diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py index 3cd060e53b6..7d7a644c7e1 100644 --- a/mmdet/models/dense_heads/maskformer_head.py +++ b/mmdet/models/dense_heads/maskformer_head.py @@ -28,24 +28,24 @@ class MaskFormerHead(AnchorFreeHead): num_things_classes (int): Number of things. num_stuff_classes (int): Number of stuff. num_queries (int): Number of query in Transformer. - pixel_decoder (obj:`mmcv.ConfigDict`|dict): Config for pixel decoder. - Defaults to None. + pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel + decoder. Defaults to None. enforce_decoder_input_project (bool, optional): Whether to add a layer to change the embed_dim of tranformer encoder in pixel decoder to the embed_dim of transformer decoder. Defaults to False. - transformer_decoder (obj:`mmcv.ConfigDict`|dict): Config for + transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for transformer decoder. Defaults to None. - positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for + positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for transformer decoder position encoding. Defaults to None. - loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the classification + loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification loss. Defaults to `CrossEntropyLoss`. - loss_mask (obj:`mmcv.ConfigDict`|dict): Config of the mask loss. + loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss. Defaults to `FocalLoss`. - loss_dice (obj:`mmcv.ConfigDict`|dict): Config of the dice loss. + loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss. Defaults to `DiceLoss`. - train_cfg (obj:`mmcv.ConfigDict`|dict): Training config of Maskformer - head. - test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of Maskformer + train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of + Maskformer head. + test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of Maskformer head. init_cfg (dict or list[dict], optional): Initialization config dict. Defaults to None. @@ -177,12 +177,11 @@ def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs): Returns: tuple: a tuple containing the following targets. - - - labels (list[Tensor]): Ground truth class indices for all\ - images. Each with shape (n, ), n is the sum of number\ - of stuff type and number of instance in a image. - - masks (list[Tensor]): Ground truth mask for each image, each\ - with shape (n, h, w). + - labels (list[Tensor]): Ground truth class indices\ + for all images. 
Each with shape (n, ), n is the sum of\ + number of stuff type and number of instance in a image. + - masks (list[Tensor]): Ground truth mask for each\ + image, each with shape (n, h, w). """ num_things_list = [self.num_things_classes] * len(gt_labels_list) num_stuff_list = [self.num_stuff_classes] * len(gt_labels_list) @@ -213,19 +212,18 @@ def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list, Returns: tuple[list[Tensor]]: a tuple containing the following targets. - - labels_list (list[Tensor]): Labels of all images.\ Each with shape (num_queries, ). - - label_weights_list (list[Tensor]): Label weights of all\ - images. Each with shape (num_queries, ). - - mask_targets_list (list[Tensor]): Mask targets of all\ - images. Each with shape (num_queries, h, w). - - mask_weights_list (list[Tensor]): Mask weights of all\ - images. Each with shape (num_queries, ). - - num_total_pos (int): Number of positive samples in all\ - images. - - num_total_neg (int): Number of negative samples in all\ - images. + - label_weights_list (list[Tensor]): Label weights\ + of all images. Each with shape (num_queries, ). + - mask_targets_list (list[Tensor]): Mask targets of\ + all images. Each with shape (num_queries, h, w). + - mask_weights_list (list[Tensor]): Mask weights of\ + all images. Each with shape (num_queries, ). + - num_total_pos (int): Number of positive samples in\ + all images. + - num_total_neg (int): Number of negative samples in\ + all images. """ (labels_list, label_weights_list, mask_targets_list, mask_weights_list, pos_inds_list, @@ -256,7 +254,6 @@ def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks, Returns: tuple[Tensor]: a tuple containing the following for one image. - - labels (Tensor): Labels of each image. shape (num_queries, ). - label_weights (Tensor): Label weights of each image. @@ -444,13 +441,14 @@ def forward(self, feats, img_metas): img_metas (list[dict]): List of image information. Returns: - all_cls_scores (Tensor): Classification scores for each\ - scale level. Each is a 4D-tensor with shape\ - (num_decoder, batch_size, num_queries, cls_out_channels).\ - Note `cls_out_channels` should includes background. - all_mask_preds (Tensor): Mask scores for each decoder\ - layer. Each with shape (num_decoder, batch_size,\ - num_queries, h, w). + tuple: a tuple contains two elements. + - all_cls_scores (Tensor): Classification scores for each\ + scale level. Each is a 4D-tensor with shape\ + (num_decoder, batch_size, num_queries, cls_out_channels).\ + Note `cls_out_channels` should includes background. + - all_mask_preds (Tensor): Mask scores for each decoder\ + layer. Each with shape (num_decoder, batch_size,\ + num_queries, h, w). """ batch_size = len(img_metas) input_img_h, input_img_w = img_metas[0]['batch_input_shape'] @@ -528,7 +526,7 @@ def forward_train(self, ignored. Defaults to None. Returns: - losses (dict[str, Tensor]): a dictionary of loss components + dict[str, Tensor]: a dictionary of loss components """ # not consider ignoring bboxes assert gt_bboxes_ignore is None @@ -607,8 +605,8 @@ def simple_test(self, feats, img_metas, rescale=False): def post_process(self, mask_cls, mask_pred): """Panoptic segmengation inference. - This implementation is modified from\ - https://github.com/facebookresearch/MaskFormer + This implementation is modified from `MaskFormer + `_. Args: mask_cls (Tensor): Classfication outputs for a image. @@ -617,7 +615,7 @@ def post_process(self, mask_cls, mask_pred): shape = (num_queries, h, w). 
Returns: - panoptic_seg (Tensor): panoptic segment result of shape (h, w),\ + Tensor: panoptic segment result of shape (h, w),\ each element in Tensor means: segment_id = _cls + instance_id * INSTANCE_OFFSET. """ diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py index 17c5d6c895c..73676bcdf50 100644 --- a/mmdet/models/detectors/maskformer.py +++ b/mmdet/models/detectors/maskformer.py @@ -7,7 +7,7 @@ class MaskFormer(SingleStageDetector): r"""Implementation of `Per-Pixel Classification is NOT All You Need for Semantic Segmentation - `_""" + `_.""" def __init__(self, backbone, diff --git a/mmdet/models/plugins/pixel_decoder.py b/mmdet/models/plugins/pixel_decoder.py index f69daf46f9a..d1193551ddd 100644 --- a/mmdet/models/plugins/pixel_decoder.py +++ b/mmdet/models/plugins/pixel_decoder.py @@ -17,17 +17,17 @@ class PixelDecoder(BaseModule): input feature maps. feat_channels (int): Number channels for feature. out_channels (int): Number channels for output. - norm_cfg (obj:`mmcv.ConfigDict`|dict): Config for normalization. + norm_cfg (:obj:`mmcv.ConfigDict` | dict): Config for normalization. Defaults to dict(type='GN', num_groups=32). - act_cfg (obj:`mmcv.ConfigDict`|dict): Config for activation. + act_cfg (:obj:`mmcv.ConfigDict` | dict): Config for activation. Defaults to dict(type='ReLU'). - encoder (obj:`mmcv.ConfigDict`|dict): Config for transorformer + encoder (:obj:`mmcv.ConfigDict` | dict): Config for transorformer encoder.Defaults to None. - positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for + positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for transformer encoder position encoding. Defaults to dict(type='SinePositionalEncoding', num_feats=128, normalize=True). - init_cfg (obj:`mmcv.ConfigDict`|dict): Initialization config dict. + init_cfg (:obj:`mmcv.ConfigDict` | dict): Initialization config dict. Default: None """ @@ -95,10 +95,9 @@ def forward(self, feats, img_metas): Returns: tuple: a tuple containing the following: - - mask_feature (Tensor): Shape (batch_size, c, h, w). - memory (Tensor): Output of last stage of backbone.\ - Shape (batch_size, c, h, w). + Shape (batch_size, c, h, w). """ y = self.last_feat_conv(feats[-1]) for i in range(self.num_inputs - 2, -1, -1): @@ -122,17 +121,17 @@ class TransformerEncoderPixelDecoder(PixelDecoder): input feature maps. feat_channels (int): Number channels for feature. out_channels (int): Number channels for output. - norm_cfg (obj:`mmcv.ConfigDict`|dict): Config for normalization. + norm_cfg (:obj:`mmcv.ConfigDict` | dict): Config for normalization. Defaults to dict(type='GN', num_groups=32). - act_cfg (obj:`mmcv.ConfigDict`|dict): Config for activation. + act_cfg (:obj:`mmcv.ConfigDict` | dict): Config for activation. Defaults to dict(type='ReLU'). - encoder (obj:`mmcv.ConfigDict`|dict): Config for transorformer + encoder (:obj:`mmcv.ConfigDict` | dict): Config for transorformer encoder.Defaults to None. - positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for + positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for transformer encoder position encoding. Defaults to dict(type='SinePositionalEncoding', num_feats=128, normalize=True). - init_cfg (obj:`mmcv.ConfigDict`|dict): Initialization config dict. + init_cfg (:obj:`mmcv.ConfigDict` | dict): Initialization config dict. Default: None """ @@ -200,7 +199,6 @@ def forward(self, feats, img_metas): Returns: tuple: a tuple containing the following: - - mask_feature (Tensor): shape (batch_size, c, h, w). 
- memory (Tensor): shape (batch_size, c, h, w). """ diff --git a/tools/deployment/test.py b/tools/deployment/test.py index afbad176841..2daf8866e58 100644 --- a/tools/deployment/test.py +++ b/tools/deployment/test.py @@ -141,7 +141,7 @@ def main(): if __name__ == '__main__': - # main() + main() # Following strings of text style are from colorama package bright_style, reset_style = '\x1b[1m', '\x1b[0m' From cf342d8ed7b0f9da7ce8dcbac2e13b710d787e0a Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Thu, 24 Feb 2022 21:58:50 +0800 Subject: [PATCH 23/27] [Feature] ResNet Strikes Back. (#7001) * [Feature] ResNet Strikes Back. * add more cfg * add readme * update * update * update * update * update * update --- README.md | 1 + README_zh-CN.md | 1 + configs/resnet_strikes_back/README.md | 37 ++++++ ..._mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py | 18 +++ ...aster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py | 18 +++ .../mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py | 18 +++ configs/resnet_strikes_back/metafile.yml | 116 ++++++++++++++++++ .../retinanet_r50_fpn_rsb-pretrain_1x_coco.py | 18 +++ 8 files changed, 227 insertions(+) create mode 100644 configs/resnet_strikes_back/README.md create mode 100644 configs/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py create mode 100644 configs/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py create mode 100644 configs/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py create mode 100644 configs/resnet_strikes_back/metafile.yml create mode 100644 configs/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py diff --git a/README.md b/README.md index d935ae2750a..9d65d62a57f 100644 --- a/README.md +++ b/README.md @@ -253,6 +253,7 @@ Results and models are available in the [model zoo](docs/en/model_zoo.md).
  • Weight Standardization (ArXiv'2019)
  • Prime Sample Attention (CVPR'2020)
  • Strong Baselines (CVPR'2021)
  • + Resnet strikes back (ArXiv'2021)
  • diff --git a/README_zh-CN.md b/README_zh-CN.md index 7f258b30a82..5863911117f 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -252,6 +252,7 @@ MMDetection 是一个基于 PyTorch 的目标检测开源工具箱。它是 [Ope
  • Weight Standardization (ArXiv'2019)
  • Prime Sample Attention (CVPR'2020)
  • Strong Baselines (CVPR'2021)
  • + Resnet strikes back (ArXiv'2021)
  • diff --git a/configs/resnet_strikes_back/README.md b/configs/resnet_strikes_back/README.md new file mode 100644 index 00000000000..4d7501129e7 --- /dev/null +++ b/configs/resnet_strikes_back/README.md @@ -0,0 +1,37 @@ +# ResNet strikes back + + + +## Abstract + +The influential Residual Networks designed by He et al. remain the gold-standard architecture in numerous scientific publications. They typically serve as the default architecture in studies, or as baselines when new architectures are proposed. Yet there has been significant progress on best practices for training neural networks since the inception of the ResNet architecture in 2015. Novel optimization & dataaugmentation have increased the effectiveness of the training recipes. + +In this paper, we re-evaluate the performance of the vanilla ResNet-50 when trained with a procedure that integrates such advances. We share competitive training settings and pre-trained models in the timm open-source library, with the hope that they will serve as better baselines for future work. For instance, with our more demanding training setting, a vanilla ResNet-50 reaches 80.4% top-1 accuracy at resolution 224×224 on ImageNet-val without extra data or distillation. We also report the performance achieved with popular models with our training procedure. + +
    + +
    + +## Results and Models + +| Method | Backbone | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :----------: | :-------------: | :-----: | :------: | :------------: | :----: | :------:| :------: | :--------: | +| Faster R-CNN | R-50 rsb | 1x | 3.9 | - | 40.8 (+3.4) | - | [Config](./faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229-32ae82a9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229.log.json)| +| Mask R-CNN | R-50 rsb | 1x | 4.5 | - | 41.2 (+3.0) | 38.2 (+3.0) | [Config](./mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054-06ce8ba0.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054.log.json)| +| Cascade Mask R-CNN | R-50 rsb | 1x | 6.2 | - | 44.8 (+3.6) | 39.9 (+3.6) | [Config](./cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636-8b9ad50f.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636.log.json)| +| RetinaNet | R-50 rsb | 1x | 3.8 | - | 39.0 (+2.5) | - | [Config](./retinanet_r50_fpn_rsb-pretrain_1x_coco.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432-bd24aae9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432.log.json)| + +**Notes:** +- 'rsb' is short for 'resnet strikes back' +- We have done some grid searches on learning rate and weight decay and get these optimal hyper-parameters. 
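For readers who want to try the recipe summarized above, the sketch below loads one of the new configs and inspects the two pieces the RSB recipe changes (the AdamW optimizer and the `Pretrained` backbone `init_cfg`). It assumes it is run from the root of an MMDetection checkout that contains `configs/resnet_strikes_back/`; the printed values are illustrative only.

```python
# Minimal sketch: inspect an RSB config and build a detector from it.
# Assumes an MMDetection checkout with configs/resnet_strikes_back/ present.
from mmcv import Config

from mmdet.models import build_detector

cfg = Config.fromfile(
    'configs/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py')

# The recipe replaces SGD with AdamW and skips weight decay on norm layers.
print(cfg.optimizer['type'], cfg.optimizer['lr'])  # e.g. AdamW 0.0002

# The backbone init_cfg points at the RSB-pretrained classification checkpoint;
# prefix='backbone.' keeps only the backbone weights from that checkpoint.
print(cfg.model.backbone.init_cfg['type'])  # Pretrained

detector = build_detector(cfg.model)  # weights load when init_weights() runs
```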
+ +## Citation + +```latex +@article{wightman2021resnet, +title={Resnet strikes back: An improved training procedure in timm}, +author={Ross Wightman, Hugo Touvron, Hervé Jégou}, +journal={arXiv preprint arXiv:2110.00476}, +year={2021} +} +``` diff --git a/configs/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py b/configs/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py new file mode 100644 index 00000000000..8b601f05718 --- /dev/null +++ b/configs/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/cascade_mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0002, + weight_decay=0.05, + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/configs/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py b/configs/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py new file mode 100644 index 00000000000..fe866843483 --- /dev/null +++ b/configs/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/faster_rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0002, + weight_decay=0.05, + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/configs/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py b/configs/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py new file mode 100644 index 00000000000..321d98ebe12 --- /dev/null +++ b/configs/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0002, + weight_decay=0.05, + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/configs/resnet_strikes_back/metafile.yml b/configs/resnet_strikes_back/metafile.yml new file mode 100644 index 00000000000..4c85a16d4fd --- /dev/null +++ b/configs/resnet_strikes_back/metafile.yml @@ -0,0 +1,116 @@ +Models: + - Name: faster_rcnn_r50_fpn_rsb-pretrain_1x_coco + In Collection: Faster R-CNN + Config: configs/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco.py + Metadata: + Training Memory (GB): 3.9 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet 
+ Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229-32ae82a9.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 + + - Name: cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco + In Collection: Cascade R-CNN + Config: configs/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py + Metadata: + Training Memory (GB): 6.2 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636-8b9ad50f.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 + + - Name: retinanet_r50_fpn_rsb-pretrain_1x_coco + In Collection: RetinaNet + Config: configs/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432-bd24aae9.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 + + - Name: mask_rcnn_r50_fpn_rsb-pretrain_1x_coco + In Collection: Mask R-CNN + Config: configs/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054-06ce8ba0.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 diff --git 
a/configs/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py b/configs/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py new file mode 100644 index 00000000000..480697a0610 --- /dev/null +++ b/configs/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.0001, + weight_decay=0.05, + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) From 8dbda8e4ff22d4126d713a068a79b9d70cd2d2d5 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Thu, 24 Feb 2022 23:02:50 +0800 Subject: [PATCH 24/27] Maskformer Visualization (#7247) * maskformer visualization * compatible with models that do not contain arg of pretrained * compatible with models that do not contain arg of pretrained --- mmdet/apis/inference.py | 5 +- mmdet/models/detectors/maskformer.py | 83 ++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/mmdet/apis/inference.py b/mmdet/apis/inference.py index 7c1ddd0c4e7..d0ab834c4a0 100644 --- a/mmdet/apis/inference.py +++ b/mmdet/apis/inference.py @@ -35,7 +35,10 @@ def init_detector(config, checkpoint=None, device='cuda:0', cfg_options=None): f'but got {type(config)}') if cfg_options is not None: config.merge_from_dict(cfg_options) - config.model.pretrained = None + if 'pretrained' in config.model: + config.model.pretrained = None + elif 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None config.model.train_cfg = None model = build_detector(config.model, test_cfg=config.get('test_cfg')) if checkpoint is not None: diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py index 73676bcdf50..f7257d2547d 100644 --- a/mmdet/models/detectors/maskformer.py +++ b/mmdet/models/detectors/maskformer.py @@ -1,4 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np + +from mmdet.core import INSTANCE_OFFSET +from mmdet.core.visualization import imshow_det_bboxes from ..builder import DETECTORS, build_backbone, build_head, build_neck from .single_stage import SingleStageDetector @@ -23,6 +28,11 @@ def __init__(self, panoptic_head.update(train_cfg=train_cfg) panoptic_head.update(test_cfg=test_cfg) self.panoptic_head = build_head(panoptic_head) + + self.num_things_classes = self.panoptic_head.num_things_classes + self.num_stuff_classes = self.panoptic_head.num_stuff_classes + self.num_classes = self.panoptic_head.num_classes + self.train_cfg = train_cfg self.test_cfg = test_cfg @@ -104,3 +114,76 @@ def aug_test(self, imgs, img_metas, **kwargs): def onnx_export(self, img, img_metas): raise NotImplementedError + + def show_result(self, + img, + result, + score_thr=0.3, + bbox_color=(72, 101, 241), + text_color=(72, 101, 241), + mask_color=None, + thickness=2, + font_size=13, + win_name='', + show=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (dict): The results. + + score_thr (float, optional): Minimum score of bboxes to be shown. + Default: 0.3. 
+ bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines. + The tuple of color should be in BGR order. Default: 'green'. + text_color (str or tuple(int) or :obj:`Color`):Color of texts. + The tuple of color should be in BGR order. Default: 'green'. + mask_color (None or str or tuple(int) or :obj:`Color`): + Color of masks. The tuple of color should be in BGR order. + Default: None. + thickness (int): Thickness of lines. Default: 2. + font_size (int): Font size of texts. Default: 13. + win_name (str): The window name. Default: ''. + wait_time (float): Value of waitKey param. + Default: 0. + show (bool): Whether to show the image. + Default: False. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + img (Tensor): Only if not `show` or `out_file`. + """ + img = mmcv.imread(img) + img = img.copy() + pan_results = result['pan_results'] + # keep objects ahead + ids = np.unique(pan_results)[::-1] + legal_indices = ids != self.num_classes # for VOID label + ids = ids[legal_indices] + labels = np.array([id % INSTANCE_OFFSET for id in ids], dtype=np.int64) + segms = (pan_results[None] == ids[:, None, None]) + + # if out_file specified, do not show image in window + if out_file is not None: + show = False + # draw bounding boxes + img = imshow_det_bboxes( + img, + segms=segms, + labels=labels, + class_names=self.CLASSES, + bbox_color=bbox_color, + text_color=text_color, + mask_color=mask_color, + thickness=thickness, + font_size=font_size, + win_name=win_name, + show=show, + wait_time=wait_time, + out_file=out_file) + + if not (show or out_file): + return img From 3e204f7fee5f05f55ddddbcd750c85187275d8ed Mon Sep 17 00:00:00 2001 From: Guangchen Lin <347630870@qq.com> Date: Thu, 24 Feb 2022 23:11:52 +0800 Subject: [PATCH 25/27] Bump versions to v2.22.0 (#7240) * Bump versions to v2.22.0 * Fix comments and add the latest PRs * fix the id of contributor * relax the version of mmcv * Add ResNet Strikes Back * Update README_zh-CN.md * Update README.md * fix typo * Update README_zh-CN.md Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> --- README.md | 9 +++-- README_zh-CN.md | 8 +++-- docker/serve/Dockerfile | 2 +- docs/en/changelog.md | 42 ++++++++++++++++++++++++ docs/en/compatibility.md | 5 +++ docs/en/get_started.md | 63 +++++++++++++++++------------------ docs/en/model_zoo.md | 14 +++++++- docs/zh_cn/compatibility.md | 5 +++ docs/zh_cn/get_started.md | 65 +++++++++++++++++++------------------ mmdet/version.py | 2 +- 10 files changed, 143 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index 9d65d62a57f..5b7d8d37cde 100644 --- a/README.md +++ b/README.md @@ -74,10 +74,11 @@ This project is released under the [Apache 2.0 license](LICENSE). ## Changelog -**2.21.0** was released in 8/2/2022: +**2.22.0** was released in 24/2/2022: -- Support CPU training -- Allow to set parameters about multi-processing to speed up training and testing +- Support [MaskFormer](configs/maskformer), [DyHead](configs/dyhead), [OpenImages Dataset](configs/openimages) and [TIMM backbone](configs/timm_example) +- Support visualization for Panoptic Segmentation +- Release a good recipe of using ResNet in object detectors pre-trained by [ResNet Strikes Back](https://arxiv.org/abs/2110.00476), which consistently brings about 3~4 mAP improvements over RetinaNet, Faster/Mask/Cascade Mask R-CNN Please refer to [changelog.md](docs/en/changelog.md) for details and release history. 
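As a side note on the panoptic visualization added in `maskformer.py` above: the encoded panoptic map in `result['pan_results']` can also be decoded outside of `show_result`. The following is a minimal sketch, assuming `pan_results` is the 2-D array consumed by `show_result` and `num_classes` is the detector attribute set in `__init__` above; the helper name is ours, not part of this patch.

```python
import numpy as np

from mmdet.core import INSTANCE_OFFSET


def decode_pan_results(pan_results, num_classes):
    """Split an encoded panoptic map into per-segment labels and masks."""
    ids = np.unique(pan_results)[::-1]
    ids = ids[ids != num_classes]  # drop the VOID label
    # the class label is stored modulo INSTANCE_OFFSET, as in show_result
    labels = np.array([pan_id % INSTANCE_OFFSET for pan_id in ids],
                      dtype=np.int64)
    segms = pan_results[None] == ids[:, None, None]  # (num_segments, H, W)
    return labels, segms
```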
@@ -162,6 +163,7 @@ Results and models are available in the [model zoo](docs/en/model_zoo.md). @@ -225,6 +227,7 @@ Results and models are available in the [model zoo](docs/en/model_zoo.md).
  • PVT (ICCV'2021)
  • Swin (CVPR'2021)
  • PVTv2 (ArXiv'2021)
+  • ResNet strikes back (ArXiv'2021)
  • diff --git a/README_zh-CN.md b/README_zh-CN.md index 5863911117f..7a339555935 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -73,10 +73,11 @@ MMDetection 是一个基于 PyTorch 的目标检测开源工具箱。它是 [Ope ## 更新日志 -最新的 **2.21.0** 版本已经在 2022.02.08 发布: +最新的 **2.22.0** 版本已经在 2022.02.24 发布: -- 支持了 CPU 训练 -- 允许设置多进程相关的参数来加速训练与推理 +- 支持 [MaskFormer](configs/maskformer),[DyHead](configs/dyhead),[OpenImages Dataset](configs/openimages) 和 [TIMM backbone](configs/timm_example) +- 支持全景分割可视化 +- 发布了一个在目标检测任务中使用 ResNet 的好方法,它是由 [ResNet Strikes Back](https://arxiv.org/abs/2110.00476) 预训练的,并且能稳定的在 RetinaNet, Faster/Mask/Cascade Mask R-CNN 上带来约 3-4 mAP 的提升 如果想了解更多版本更新细节和历史信息,请阅读[更新日志](docs/changelog.md)。 @@ -224,6 +225,7 @@ MMDetection 是一个基于 PyTorch 的目标检测开源工具箱。它是 [Ope
  • PVT (ICCV'2021)
  • Swin (CVPR'2021)
  • PVTv2 (ArXiv'2021)
+  • ResNet strikes back (ArXiv'2021)
  • diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile index 9776ac43096..ffa81c82ceb 100644 --- a/docker/serve/Dockerfile +++ b/docker/serve/Dockerfile @@ -4,7 +4,7 @@ ARG CUDNN="7" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel ARG MMCV="1.3.17" -ARG MMDET="2.21.0" +ARG MMDET="2.22.0" ENV PYTHONUNBUFFERED TRUE diff --git a/docs/en/changelog.md b/docs/en/changelog.md index 012557212ff..7a3b18fcfa2 100644 --- a/docs/en/changelog.md +++ b/docs/en/changelog.md @@ -1,5 +1,47 @@ ## Changelog +### v2.22.0 (24/2/2022) + +#### Highlights + +- Support MaskFormer: [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) (#7212) +- Support DyHead: [Dynamic Head: Unifying Object Detection Heads with Attentions](https://arxiv.org/abs/2106.08322) (#6823) +- Release a good recipe of using ResNet in object detectors pre-trained by [ResNet Strikes Back](https://arxiv.org/abs/2110.00476), which consistently brings about 3~4 mAP improvements over RetinaNet, Faster/Mask/Cascade Mask R-CNN (#7001) +- Support [Open Images Dataset](https://storage.googleapis.com/openimages/web/index.html) (#6331) +- Support TIMM backbone: [PyTorch Image Models](https://github.com/rwightman/pytorch-image-models) (#7020) + +#### New Features + +- Support [MaskFormer](configs/maskformer) (#7212) +- Support [DyHead](configs/dyhead) (#6823) +- Support [ResNet Strikes Back](configs/resnet_strikes_back) (#7001) +- Support [OpenImages Dataset](configs/openimages) (#6331) +- Support [TIMM backbone](configs/timm_example) (#7020) +- Support visualization for Panoptic Segmentation (#7041) + +#### Breaking Changes + +In order to support the visualization for Panoptic Segmentation, the `num_classes` can not be `None` when using the `get_palette` function to determine whether to use the panoptic palette. + +#### Bug Fixes + +- Fix bug for the best checkpoints can not be saved when the `key_score` is None (#7101) +- Fix MixUp transform filter boxes failing case (#7080) +- Add missing properties in SABLHead (#7091) +- Fix bug when NaNs exist in confusion matrix (#7147) +- Fix PALETTE AttributeError in downstream task (#7230) + +#### Improvements + +- Speed up SimOTA matching (#7098) +- Add Chinese translation of `docs_zh-CN/tutorials/init_cfg.md` (#7188) + +#### Contributors + +A total of 20 developers contributed to this release. +Thanks @ZwwWayne, @hhaAndroid, @RangiLyu, @AronLin, @BIGWangYuDong, @jbwang1997, @zytx121, @chhluo, @shinya7y, @LuooChen, @dvansa, @siatwangmin, @del-zhenwu, @vikashranjan26, @haofanwang, @jamiechoi1995, @HJoonKwon, @yarkable, @zhijian-liu, @RangeKing + + ### v2.21.0 (8/2/2022) ### Breaking Changes diff --git a/docs/en/compatibility.md b/docs/en/compatibility.md index 3d6b9459849..a4f297690d9 100644 --- a/docs/en/compatibility.md +++ b/docs/en/compatibility.md @@ -1,5 +1,10 @@ # Compatibility of MMDetection 2.x +## MMDetection 2.21.0 + +In order to support CPU training, the logic of scatter in batch collating has been changed. We recommend to use +MMCV v1.4.4 or higher. For more details, please refer to [MMCV PR #1621](https://github.com/open-mmlab/mmcv/pull/1621). + ## MMDetection 2.18.1 ### MMCV compatibility diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 28bbefaad94..91a8ce6218d 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -9,35 +9,36 @@ Compatible MMDetection and MMCV versions are shown as below. Please install the correct version of MMCV to avoid installation issues. 
-| MMDetection version | MMCV version | -|:-------------------:|:-------------------:| -| master | mmcv-full>=1.3.17, <1.5.0 | -| 2.21.0 | mmcv-full>=1.3.17, <1.5.0 | -| 2.20.0 | mmcv-full>=1.3.17, <1.5.0 | -| 2.19.1 | mmcv-full>=1.3.17, <1.5.0 | -| 2.19.0 | mmcv-full>=1.3.17, <1.5.0 | -| 2.18.0 | mmcv-full>=1.3.17, <1.4.0 | -| 2.17.0 | mmcv-full>=1.3.14, <1.4.0 | -| 2.16.0 | mmcv-full>=1.3.8, <1.4.0 | -| 2.15.1 | mmcv-full>=1.3.8, <1.4.0 | -| 2.15.0 | mmcv-full>=1.3.8, <1.4.0 | -| 2.14.0 | mmcv-full>=1.3.8, <1.4.0 | -| 2.13.0 | mmcv-full>=1.3.3, <1.4.0 | -| 2.12.0 | mmcv-full>=1.3.3, <1.4.0 | -| 2.11.0 | mmcv-full>=1.2.4, <1.4.0 | -| 2.10.0 | mmcv-full>=1.2.4, <1.4.0 | -| 2.9.0 | mmcv-full>=1.2.4, <1.4.0 | -| 2.8.0 | mmcv-full>=1.2.4, <1.4.0 | -| 2.7.0 | mmcv-full>=1.1.5, <1.4.0 | -| 2.6.0 | mmcv-full>=1.1.5, <1.4.0 | -| 2.5.0 | mmcv-full>=1.1.5, <1.4.0 | -| 2.4.0 | mmcv-full>=1.1.1, <1.4.0 | -| 2.3.0 | mmcv-full==1.0.5 | -| 2.3.0rc0 | mmcv-full>=1.0.2 | -| 2.2.1 | mmcv==0.6.2 | -| 2.2.0 | mmcv==0.6.2 | -| 2.1.0 | mmcv>=0.5.9, <=0.6.1| -| 2.0.0 | mmcv>=0.5.1, <=0.5.8| +| MMDetection version | MMCV version | +|:-------------------:|:-------------------------:| +| master | mmcv-full>=1.3.17, <1.5.0 | +| 2.22.0 | mmcv-full>=1.3.17, <1.5.0 | +| 2.21.0 | mmcv-full>=1.3.17, <1.5.0 | +| 2.20.0 | mmcv-full>=1.3.17, <1.5.0 | +| 2.19.1 | mmcv-full>=1.3.17, <1.5.0 | +| 2.19.0 | mmcv-full>=1.3.17, <1.5.0 | +| 2.18.0 | mmcv-full>=1.3.17, <1.4.0 | +| 2.17.0 | mmcv-full>=1.3.14, <1.4.0 | +| 2.16.0 | mmcv-full>=1.3.8, <1.4.0 | +| 2.15.1 | mmcv-full>=1.3.8, <1.4.0 | +| 2.15.0 | mmcv-full>=1.3.8, <1.4.0 | +| 2.14.0 | mmcv-full>=1.3.8, <1.4.0 | +| 2.13.0 | mmcv-full>=1.3.3, <1.4.0 | +| 2.12.0 | mmcv-full>=1.3.3, <1.4.0 | +| 2.11.0 | mmcv-full>=1.2.4, <1.4.0 | +| 2.10.0 | mmcv-full>=1.2.4, <1.4.0 | +| 2.9.0 | mmcv-full>=1.2.4, <1.4.0 | +| 2.8.0 | mmcv-full>=1.2.4, <1.4.0 | +| 2.7.0 | mmcv-full>=1.1.5, <1.4.0 | +| 2.6.0 | mmcv-full>=1.1.5, <1.4.0 | +| 2.5.0 | mmcv-full>=1.1.5, <1.4.0 | +| 2.4.0 | mmcv-full>=1.1.1, <1.4.0 | +| 2.3.0 | mmcv-full==1.0.5 | +| 2.3.0rc0 | mmcv-full>=1.0.2 | +| 2.2.1 | mmcv==0.6.2 | +| 2.2.0 | mmcv==0.6.2 | +| 2.1.0 | mmcv>=0.5.9, <=0.6.1 | +| 2.0.0 | mmcv>=0.5.1, <=0.5.8 | **Note:** You need to run `pip uninstall mmcv` first if you have mmcv installed. If mmcv and mmcv-full are both installed, there will be `ModuleNotFoundError`. @@ -166,7 +167,7 @@ to [official documentation](https://albumentations.ai/docs/getting_started/insta MMDetection can be built for CPU only environment (where CUDA isn't available). -In CPU mode you can run the demo/webcam_demo.py for example. +In CPU mode you can train (requires MMCV version >= 1.4.4), test or inference a model. However some functionality is gone in this mode: - Deformable Convolution @@ -182,7 +183,7 @@ However some functionality is gone in this mode: - sigmoid_focal_loss_cuda - bbox_overlaps -If you try to run inference with a model containing above ops, an error will be raised. +If you try to train/test/inference a model containing above ops, an error will be raised. The following table lists affected algorithms. | Operator | Model | diff --git a/docs/en/model_zoo.md b/docs/en/model_zoo.md index 4ef4aacff45..15bb4894ae0 100644 --- a/docs/en/model_zoo.md +++ b/docs/en/model_zoo.md @@ -242,9 +242,21 @@ Please refer to [SOLO](https://github.com/open-mmlab/mmdetection/blob/master/con Please refer to [QueryInst](https://github.com/open-mmlab/mmdetection/blob/master/configs/queryinst) for details. 
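To complement the updated CPU-mode documentation above, here is a minimal sketch of CPU-only inference; the config, checkpoint and image paths are placeholders rather than files added by this patch, and models relying on the CUDA-only ops listed in `get_started.md` will still raise an error.

```python
from mmdet.apis import inference_detector, init_detector

# Placeholder paths; substitute any config/checkpoint that avoids CUDA-only ops.
config_file = 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py'
checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_coco.pth'

model = init_detector(config_file, checkpoint_file, device='cpu')
result = inference_detector(model, 'demo/demo.jpg')
```

CPU training additionally requires mmcv-full >= 1.4.4, as noted in the compatibility section above.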
+### PanopticFPN + +Please refer to [PanopticFPN](https://github.com/open-mmlab/mmdetection/blob/master/configs/panoptic_fpn) for details. + +### MaskFormer + +Please refer to [MaskFormer](https://github.com/open-mmlab/mmdetection/blob/master/configs/maskformer) for details. + +### DyHead + +Please refer to [DyHead](https://github.com/open-mmlab/mmdetection/blob/master/configs/dyhead) for details. + ### Other datasets -We also benchmark some methods on [PASCAL VOC](https://github.com/open-mmlab/mmdetection/blob/master/configs/pascal_voc), [Cityscapes](https://github.com/open-mmlab/mmdetection/blob/master/configs/cityscapes) and [WIDER FACE](https://github.com/open-mmlab/mmdetection/blob/master/configs/wider_face). +We also benchmark some methods on [PASCAL VOC](https://github.com/open-mmlab/mmdetection/blob/master/configs/pascal_voc), [Cityscapes](https://github.com/open-mmlab/mmdetection/blob/master/configs/cityscapes), [OpenImages](https://github.com/open-mmlab/mmdetection/blob/master/configs/openimages) and [WIDER FACE](https://github.com/open-mmlab/mmdetection/blob/master/configs/wider_face). ### Pre-trained Models diff --git a/docs/zh_cn/compatibility.md b/docs/zh_cn/compatibility.md index 80acf7f1a0e..73e1e628c7d 100644 --- a/docs/zh_cn/compatibility.md +++ b/docs/zh_cn/compatibility.md @@ -1,5 +1,10 @@ # MMDetection v2.x 兼容性说明 +## MMDection 2.21.0 + +为了支持 CPU 训练,MMCV 中进行批处理的 scatter 的代码逻辑已经被修改。我们推荐使用 MMCV v1.4.4 或更高版本, +更多信息请参考 [MMCV PR #1621](https://github.com/open-mmlab/mmcv/pull/1621). + ## MMDetection 2.18.1 ### MMCV compatibility diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index b606a0ad4d8..b6a2c60f396 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -9,36 +9,37 @@ MMDetection 和 MMCV 版本兼容性如下所示,需要安装正确的 MMCV 版本以避免安装出现问题。 -| MMDetection 版本 | MMCV 版本 | -| :--------------: | :----------------------: | -| master | mmcv-full>=1.3.17, <1.5.0 | -| 2.21.0 | mmcv-full>=1.3.17, <1.5.0 | -| 2.20.0 | mmcv-full>=1.3.17, <1.5.0 | -| 2.19.1 | mmcv-full>=1.3.17, <1.5.0 | -| 2.19.0 | mmcv-full>=1.3.17, <1.5.0 | -| 2.18.1 | mmcv-full>=1.3.17, <1.4.0 | -| 2.18.0 | mmcv-full>=1.3.14, <1.4.0 | -| 2.17.0 | mmcv-full>=1.3.14, <1.4.0 | -| 2.16.0 | mmcv-full>=1.3.8, <1.4.0 | -| 2.15.1 | mmcv-full>=1.3.8, <1.4.0 | -| 2.15.0 | mmcv-full>=1.3.8, <1.4.0 | -| 2.14.0 | mmcv-full>=1.3.8, <1.4.0 | -| 2.13.0 | mmcv-full>=1.3.3, <1.4.0 | -| 2.12.0 | mmcv-full>=1.3.3, <1.4.0 | -| 2.11.0 | mmcv-full>=1.2.4, <1.4.0 | -| 2.10.0 | mmcv-full>=1.2.4, <1.4.0 | -| 2.9.0 | mmcv-full>=1.2.4, <1.4.0 | -| 2.8.0 | mmcv-full>=1.2.4, <1.4.0 | -| 2.7.0 | mmcv-full>=1.1.5, <1.4.0 | -| 2.6.0 | mmcv-full>=1.1.5, <1.4.0 | -| 2.5.0 | mmcv-full>=1.1.5, <1.4.0 | -| 2.4.0 | mmcv-full>=1.1.1, <1.4.0 | -| 2.3.0 | mmcv-full==1.0.5 | -| 2.3.0rc0 | mmcv-full>=1.0.2 | -| 2.2.1 | mmcv==0.6.2 | -| 2.2.0 | mmcv==0.6.2 | -| 2.1.0 | mmcv>=0.5.9, <=0.6.1 | -| 2.0.0 | mmcv>=0.5.1, <=0.5.8 | +| MMDetection 版本 | MMCV 版本 | +|:--------------:|:-------------------------:| +| master | mmcv-full>=1.3.17, <1.5.0 | +| 2.22.0 | mmcv-full>=1.3.17, <1.5.0 | +| 2.21.0 | mmcv-full>=1.3.17, <1.5.0 | +| 2.20.0 | mmcv-full>=1.3.17, <1.5.0 | +| 2.19.1 | mmcv-full>=1.3.17, <1.5.0 | +| 2.19.0 | mmcv-full>=1.3.17, <1.5.0 | +| 2.18.1 | mmcv-full>=1.3.17, <1.4.0 | +| 2.18.0 | mmcv-full>=1.3.14, <1.4.0 | +| 2.17.0 | mmcv-full>=1.3.14, <1.4.0 | +| 2.16.0 | mmcv-full>=1.3.8, <1.4.0 | +| 2.15.1 | mmcv-full>=1.3.8, <1.4.0 | +| 2.15.0 | mmcv-full>=1.3.8, <1.4.0 | +| 2.14.0 | mmcv-full>=1.3.8, <1.4.0 | +| 2.13.0 | mmcv-full>=1.3.3, 
<1.4.0 | +| 2.12.0 | mmcv-full>=1.3.3, <1.4.0 | +| 2.11.0 | mmcv-full>=1.2.4, <1.4.0 | +| 2.10.0 | mmcv-full>=1.2.4, <1.4.0 | +| 2.9.0 | mmcv-full>=1.2.4, <1.4.0 | +| 2.8.0 | mmcv-full>=1.2.4, <1.4.0 | +| 2.7.0 | mmcv-full>=1.1.5, <1.4.0 | +| 2.6.0 | mmcv-full>=1.1.5, <1.4.0 | +| 2.5.0 | mmcv-full>=1.1.5, <1.4.0 | +| 2.4.0 | mmcv-full>=1.1.1, <1.4.0 | +| 2.3.0 | mmcv-full==1.0.5 | +| 2.3.0rc0 | mmcv-full>=1.0.2 | +| 2.2.1 | mmcv==0.6.2 | +| 2.2.0 | mmcv==0.6.2 | +| 2.1.0 | mmcv>=0.5.9, <=0.6.1 | +| 2.0.0 | mmcv>=0.5.1, <=0.5.8 | **注意:**如果已经安装了 mmcv,首先需要使用 `pip uninstall mmcv` 卸载已安装的 mmcv,如果同时安装了 mmcv 和 mmcv-full,将会报 `ModuleNotFoundError` 错误。 @@ -166,7 +167,7 @@ MIM 能够自动地安装 OpenMMLab 的项目以及对应的依赖包。 我们的代码能够建立在只使用 CPU 的环境(CUDA 不可用)。 -在CPU模式下,可以运行 `demo/webcam_demo.py` 示例,然而以下功能将在 CPU 模式下不能使用: +在CPU模式下,可以进行模型训练(需要 MMCV 版本 >= 1.4.4)、测试或者推理,然而以下功能将在 CPU 模式下不能使用: - Deformable Convolution - Modulated Deformable Convolution @@ -181,7 +182,7 @@ MIM 能够自动地安装 OpenMMLab 的项目以及对应的依赖包。 - sigmoid_focal_loss_cuda - bbox_overlaps -因此,如果尝试使用包含上述操作的模型进行推理,将会报错。下表列出了由于依赖上述算子而无法在 CPU 上运行的相关模型: +因此,如果尝试使用包含上述操作的模型进行训练/测试/推理,将会报错。下表列出了由于依赖上述算子而无法在 CPU 上运行的相关模型: | 操作 | 模型 | | :-----------------------------------------------------: | :----------------------------------------------------------: | diff --git a/mmdet/version.py b/mmdet/version.py index dd1bcd49098..7618b043734 100644 --- a/mmdet/version.py +++ b/mmdet/version.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -__version__ = '2.21.0' +__version__ = '2.22.0' short_version = __version__ From e7d05e97aa28c7b534bb9649f78970774164078d Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 25 Feb 2022 10:45:28 +0800 Subject: [PATCH 26/27] Maskformer metafile and rsb readme format (#7250) --- README.md | 1 - configs/maskformer/README.md | 2 +- configs/maskformer/metafile.yml | 31 +++++++++++++++++++++++++++ configs/resnet_strikes_back/README.md | 3 +++ 4 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 configs/maskformer/metafile.yml diff --git a/README.md b/README.md index 48b5a548bb6..c1d63cc242e 100644 --- a/README.md +++ b/README.md @@ -327,4 +327,3 @@ If you use this toolbox or benchmark in your research, please cite this project. - [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. - [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. - [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. 
- diff --git a/configs/maskformer/README.md b/configs/maskformer/README.md index 54110004c94..2cb76b1bee3 100644 --- a/configs/maskformer/README.md +++ b/configs/maskformer/README.md @@ -38,7 +38,7 @@ mmdetection | Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | detail | |:--------:|:-------:|:-------:|:--------:|:--------------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:--------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------:| -| R-50 | pytorch | 75e | 16.6 | - | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 | [config](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956.log.json) | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | +| R-50 | pytorch | 75e | 16.2 | - | 46.854 | 80.617 | 57.085 | 51.089 | 81.511 | 61.853 | 40.463 | 79.269 | 49.888 | [config](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956.log.json) | This version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) | ## Citation diff --git a/configs/maskformer/metafile.yml b/configs/maskformer/metafile.yml new file mode 100644 index 00000000000..c1654ae471c --- /dev/null +++ b/configs/maskformer/metafile.yml @@ -0,0 +1,31 @@ +Collections: + - Name: MaskFormer + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Weight Decay + Training Resources: 16x V100 GPUs + Architecture: + - MaskFormer + Paper: + URL: https://arxiv.org/pdf/2107.06278 + Title: 'Per-Pixel Classification is Not All You Need for Semantic Segmentation' + README: configs/maskformer/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/mmdet/models/detectors/maskformer.py#L7 + Version: v2.22.0 + +Models: + - Name: maskformer_r50_mstrain_16x1_75e_coco + In Collection: MaskFormer + Config: configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py + Metadata: + Training Memory (GB): 16.2 + Epochs: 75 + 
Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 46.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth diff --git a/configs/resnet_strikes_back/README.md b/configs/resnet_strikes_back/README.md index 4d7501129e7..eec03826da8 100644 --- a/configs/resnet_strikes_back/README.md +++ b/configs/resnet_strikes_back/README.md @@ -1,5 +1,7 @@ # ResNet strikes back +> [ResNet strikes back: An improved training procedure in timm](https://arxiv.org/abs/2110.00476) + ## Abstract @@ -22,6 +24,7 @@ In this paper, we re-evaluate the performance of the vanilla ResNet-50 when trai | RetinaNet | R-50 rsb | 1x | 3.8 | - | 39.0 (+2.5) | - | [Config](./retinanet_r50_fpn_rsb-pretrain_1x_coco.py)| [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432-bd24aae9.pth) | [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432.log.json)| **Notes:** + - 'rsb' is short for 'resnet strikes back' - We have done some grid searches on learning rate and weight decay and get these optimal hyper-parameters. From 7b2b7fe5161ede7428788aaf60a37654e586bfe5 Mon Sep 17 00:00:00 2001 From: BigDong Date: Fri, 25 Feb 2022 20:28:45 +0800 Subject: [PATCH 27/27] [Fix] Fix Open Images testunit to avoid error in Windows CI (#7252) * [Feature] support openimage group of eval * [Feature] support openimage group of eval * support openimage dataset * support openimage challenge dataset * fully support OpenImages-V6 and OpenImages Challenge 2019 * Fix some logic error * update config file * fix get data_infos error * fully support OpenImages evaluation * update OpenImages config files * [Feature] support OpenImages datasets * fix bug * support load image metas from pipeline * fix bug * fix get classes logic error * update code * support get image metas * support openimags * support collect image metas * support Open Images * fix openimages logic * minor fix * add a new function to compute openimages tpfp * minor fix * fix ci error * minor fix * fix indication * minor fix * fix returns * fix returns * fix returns * fix returns * fix returns * minor fix * update readme * support loading image level labels and fix some logic * minor fix * minor fix * add class names * minor fix * minor fix * minor fix * add openimages test unit * minor fix * minor fix * fix test unit * minor fix * fix logic error * minor fix * fully support openimages * minor fix * fix docstring * fix docstrings in readthedocs * update get image metas script * label_description_file -> label_file * update openimages readme * fix test unit * fix test unit * minor fix * update readme file * Update get_image_metas.py * fix oid testunit to avoid some error in windows * minor fix to avoid some error in windows * minor fix * add comments in oid test unit * minor fix --- mmdet/datasets/openimages.py | 10 +++--- .../test_datasets/test_openimages_dataset.py | 31 +++++++++++++------ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/mmdet/datasets/openimages.py b/mmdet/datasets/openimages.py index 517105cc65f..d601b482403 100644 --- a/mmdet/datasets/openimages.py +++ b/mmdet/datasets/openimages.py @@ -262,10 +262,10 @@ def get_meta_from_file(self, meta_file=''): metas = mmcv.load(meta_file) assert len(metas) 
== len(self) for i in range(len(metas)): - file_name = metas[i]['filename'].split('/')[-1] + file_name = osp.split(metas[i]['filename'])[-1] img_info = self.data_infos[i].get('img_info', None) if img_info is not None: - assert file_name == img_info['filename'].split('/')[-1] + assert file_name == osp.split(img_info['filename'])[-1] else: assert file_name == self.data_infos[i]['filename'] hw = metas[i]['ori_shape'][:2] @@ -286,10 +286,10 @@ def get_img_shape(self, metas): """Set images original shape into data_infos.""" assert len(metas) == len(self) for i in range(len(metas)): - file_name = metas[i].data['filename'].split('/')[-1] + file_name = osp.split(metas[i].data['ori_filename'])[-1] img_info = self.data_infos[i].get('img_info', None) if img_info is not None: - assert file_name == img_info['filename'].split('/')[-1] + assert file_name == osp.split(img_info['filename'])[-1] else: assert file_name == self.data_infos[i]['filename'] hw = metas[i].data['ori_shape'][:2] @@ -519,7 +519,7 @@ def get_image_level_ann(self, image_level_ann_file): img_info = self.data_infos[i].get('img_info', None) if img_info is not None: # for Open Images Challenges - img_id = img_info['filename'].split('/')[-1][:-4] + img_id = osp.split(img_info['filename'])[-1][:-4] else: # for Open Images v6 img_id = self.data_infos[i]['img_id'] diff --git a/tests/test_data/test_datasets/test_openimages_dataset.py b/tests/test_data/test_datasets/test_openimages_dataset.py index 12b2e47d593..d1e17c23b04 100644 --- a/tests/test_data/test_datasets/test_openimages_dataset.py +++ b/tests/test_data/test_datasets/test_openimages_dataset.py @@ -14,7 +14,9 @@ def _create_ids_error_oid_csv( fake_csv_file, ): label_description = ['/m/000002', 'Football'] - with open(label_file, 'w') as f: + # `newline=''` is used to avoid index error of out of bounds + # in Windows system + with open(label_file, 'w', newline='') as f: f_csv = csv.writer(f) f_csv.writerow(label_description) @@ -31,7 +33,9 @@ def _create_ids_error_oid_csv( '000595fe6fee6369', 'xclick', '/m/000000', '1', '0', '1', '0', '1', '0', '0', '1', '0', '0' ]] - with open(fake_csv_file, 'w') as f: + # `newline=''` is used to avoid index error of out of bounds + # in Windows system + with open(fake_csv_file, 'w', newline='') as f: f_csv = csv.writer(f) f_csv.writerow(header) f_csv.writerows(annotations) @@ -41,7 +45,7 @@ def _create_oid_style_ann(label_file, csv_file, label_level_file): label_description = [['/m/000000', 'Sports equipment'], ['/m/000001', 'Ball'], ['/m/000002', 'Football'], ['/m/000004', 'Bicycle']] - with open(label_file, 'w') as f: + with open(label_file, 'w', newline='') as f: f_csv = csv.writer(f) f_csv.writerows(label_description) @@ -60,7 +64,9 @@ def _create_oid_style_ann(label_file, csv_file, label_level_file): 0, 0, 0, 0 ], ] - with open(csv_file, 'w') as f: + # `newline=''` is used to avoid index error of out of bounds + # in Windows system + with open(csv_file, 'w', newline='') as f: f_csv = csv.writer(f) f_csv.writerow(header) f_csv.writerows(annotations) @@ -68,7 +74,9 @@ def _create_oid_style_ann(label_file, csv_file, label_level_file): header = ['ImageID', 'Source', 'LabelName', 'Confidence'] annotations = [['color', 'xclick', '/m/000002', '1'], ['color', 'xclick', '/m/000004', '0']] - with open(label_level_file, 'w') as f: + # `newline=''` is used to avoid index error of out of bounds + # in Windows system + with open(label_level_file, 'w', newline='') as f: f_csv = csv.writer(f) f_csv.writerow(header) f_csv.writerows(annotations) @@ -129,8 +137,9 @@ 
def _creat_oid_challenge_style_ann(txt_file, label_file, label_level_file): '1 0.0333333 0.1 0.0333333 0.1 1\n', '1 0.1 0.166667 0.1 0.166667 0\n', ] - - with open(txt_file, 'w') as f: + # `newline=''` is used to avoid index error of out of bounds + # in Windows system + with open(txt_file, 'w', newline='') as f: f.writelines(bboxes) f.close() @@ -138,13 +147,17 @@ def _creat_oid_challenge_style_ann(txt_file, label_file, label_level_file): ['/m/000001', 'Ball', 2], ['/m/000002', 'Football', 3], ['/m/000004', 'Bicycle', 4]] - with open(label_file, 'w') as f: + # `newline=''` is used to avoid index error of out of bounds + # in Windows system + with open(label_file, 'w', newline='') as f: f_csv = csv.writer(f) f_csv.writerows(label_description) header = ['ImageID', 'LabelName', 'Confidence'] annotations = [['color', '/m/000001', '1'], ['color', '/m/000000', '0']] - with open(label_level_file, 'w') as f: + # `newline=''` is used to avoid index error of out of bounds + # in Windows system + with open(label_level_file, 'w', newline='') as f: f_csv = csv.writer(f) f_csv.writerow(header) f_csv.writerows(annotations)
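A closing note on the Windows-compatibility fixes in this last patch: `os.path.split` extracts the basename for both POSIX and Windows separators, whereas `str.split('/')` only handles the former, and opening the CSV fixtures with `newline=''` follows the `csv` module's recommendation so that no spurious blank rows are produced on Windows. A small illustrative sketch (the file name below is hypothetical):

```python
import os.path as osp

posix_path = 'OpenImages/train/000595fe6fee6369.jpg'  # hypothetical file name

# os.path.split and str.split('/') agree on POSIX-style paths...
assert osp.split(posix_path)[-1] == '000595fe6fee6369.jpg'
assert posix_path.split('/')[-1] == '000595fe6fee6369.jpg'

# ...but on Windows, where image metas may carry back-slash separators, only
# os.path keeps returning the basename:
#   osp.split('OpenImages\\train\\000595fe6fee6369.jpg')[-1]
#       -> '000595fe6fee6369.jpg'
#   'OpenImages\\train\\000595fe6fee6369.jpg'.split('/')[-1]
#       -> the whole path, which breaks the filename checks in openimages.py
```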