mmdetection usage and source code walkthrough

mmdetection is an excellent open-source, PyTorch-based object detection toolbox developed by the Multimedia Laboratory (mmlab) at the Chinese University of Hong Kong (CUHK). It supports essentially all of the current SOTA two-stage detectors, such as Faster R-CNN, Mask R-CNN, R-FCN and Cascade R-CNN, as well as one-stage detectors such as SSD and RetinaNet. I spent a day skimming through the framework's source code; the notes below are a partial write-up of that reading.

Installation

Project: open-mmlab/mmdetection. Installation instructions: mmdetection/INSTALL.md.
The project has already migrated to PyTorch 1.0; if you need to stay on PyTorch 0.4, switch branches with:

git clone https://github.com/open-mmlab/mmdetection.git
cd mmdetection
git checkout pytorch-0.4.1  # switch to the pytorch-0.4.1 branch

The project also depends on open-mmlab/mmcv, another computer vision library open-sourced by CUHK, which mainly provides image/video I/O, preprocessing, visualization and progress display utilities. It can be installed directly with pip:

pip install mmcv
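
As a quick, hedged illustration of the kind of utilities mmcv exposes (the image paths below are hypothetical):

# Hypothetical image paths; a few of mmcv's image utilities.
import mmcv

img = mmcv.imread('demo.jpg')                   # read an image as an ndarray (BGR)
small = mmcv.imrescale(img, 0.5)                # rescale while keeping the aspect ratio
mmcv.imwrite(small, 'demo_small.jpg')           # write it back to disk
mmcv.imshow(small, 'preview', wait_time=1000)   # show it for 1 s (needs a display)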

Training

Using tools/train.py

mmdetection ships with training configs for many models, including Cascade R-CNN, Faster R-CNN, Mask R-CNN, RetinaNet and SSD with different backbones. For example, to train a Mask R-CNN model with a ResNet-50 backbone:

python tools/train.py configs/mask_rcnn_r50_fpn_1x.py

All of the detailed training settings, such as the backbone choice, anchor parameters, train/val splits, dataset paths, optimizer parameters and number of epochs, are configured in the configs/*.py files. train.py also accepts a few other command-line options (a short, hedged sketch of the related config fields follows the list):

  • --work_dir: the output directory for checkpoint files; can also be set in configs/*.py;
  • --resume_from: resume training from a given checkpoint; can also be set in configs/*.py;
  • --validate: whether to evaluate each checkpoint during training; it is a store_true flag, so it is off unless explicitly passed;
  • --gpus: the number of GPUs to use, default 1;
  • --launcher: the job launcher for distributed training; the default none means no distributed training.
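
Most of these options mirror fields in the config file. A minimal sketch of the corresponding configs/*.py fields (field names follow the mmdetection config convention; the values are only illustrative) could look like this:

# Illustrative config fragment; values are made up for this example.
work_dir = './work_dirs/mask_rcnn_r50_fpn_1x'  # where checkpoints and logs are written
resume_from = None                             # set to a checkpoint path to resume training
load_from = None                               # load weights without resuming the epoch counter
log_level = 'INFO'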

mmdetection also provides a script for distributed training, on a single machine or across multiple machines:

./tools/dist_train.sh <config_file> <GPU_NUM> [optional arg]
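
For example, to train the same Mask R-CNN config on 8 GPUs of a single machine (the GPU count here is just an illustration):

./tools/dist_train.sh configs/mask_rcnn_r50_fpn_1x.py 8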

tools/train.py source walkthrough

The source code is as follows:

from __future__ import division

import argparse
from mmcv import Config

from mmdet import __version__
from mmdet.datasets import get_dataset
from mmdet.apis import (train_detector, init_dist, get_root_logger,
                        set_random_seed)
from mmdet.models import build_detector
import torch


def parse_args():
    parser = argparse.ArgumentParser(description='Train a detector')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work_dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume_from', help='the checkpoint file to resume from')
    parser.add_argument(
        '--validate',
        action='store_true',
        help='whether to evaluate the checkpoint during training')
    parser.add_argument(
        '--gpus',
        type=int,
        default=1,
        help='number of gpus to use '
        '(only applicable to non-distributed training)')
    parser.add_argument('--seed', type=int, default=None, help='random seed')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    return args


def main():
    args = parse_args()  # parse the required command-line arguments

    cfg = Config.fromfile(args.config)
    # 1. Config loading: read the configs/*.py file and build an mmcv.Config
    #    object that holds all training parameters
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.gpus = args.gpus
    if cfg.checkpoint_config is not None:
        # save mmdet version in checkpoints as meta data
        cfg.checkpoint_config.meta = dict(
            mmdet_version=__version__, config=cfg.text)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # init logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info('Distributed training: {}'.format(distributed))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    model = build_detector(
        cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
    # 2. Model construction: build_detector reads the model settings from the
    #    config and assembles the detector
    train_dataset = get_dataset(cfg.data.train)
    # 3. Dataset construction: get_dataset builds the training dataset object
    #    from the dataset settings in the config
    train_detector(
        model,
        train_dataset,
        cfg,
        distributed=distributed,
        validate=args.validate,
        logger=logger)
    # 4. Training: train_detector takes the model, dataset and config built
    #    above and starts training


if __name__ == '__main__':
    main()

As you can see, the training entry point is the call to mmdet.apis.train_detector. This interface is implemented in mmdet/apis/train.py; the source code is as follows:

from __future__ import division

from collections import OrderedDict

import torch
from mmcv.runner import Runner, DistSamplerSeedHook
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel

from mmdet.core import (DistOptimizerHook, DistEvalmAPHook,
                        CocoDistEvalRecallHook, CocoDistEvalmAPHook)
from mmdet.datasets import build_dataloader
from mmdet.models import RPN
from .env import get_root_logger


def parse_losses(losses):
    log_vars = OrderedDict()
    for loss_name, loss_value in losses.items():
        if isinstance(loss_value, torch.Tensor):
            log_vars[loss_name] = loss_value.mean()
        elif isinstance(loss_value, list):
            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
        else:
            raise TypeError(
                '{} is not a tensor or list of tensors'.format(loss_name))

    loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key)

    log_vars['loss'] = loss
    for name in log_vars:
        log_vars[name] = log_vars[name].item()

    return loss, log_vars


def batch_processor(model, data, train_mode):
    losses = model(**data)
    loss, log_vars = parse_losses(losses)

    outputs = dict(
        loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))

    return outputs


def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   logger=None):
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # start training
    if distributed:
        _dist_train(model, dataset, cfg, validate=validate)
    else:
        _non_dist_train(model, dataset, cfg, validate=validate)


def _dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            dist=True)
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    # register hooks
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
        else:
            if cfg.data.val.type == 'CocoDataset':
                runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
            else:
                runner.register_hook(DistEvalmAPHook(cfg.data.val))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)


def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)

The implementation of train_detector is very compact: it simply branches on whether training is distributed and dispatches to _dist_train or _non_dist_train, both of which are equally straightforward.
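
For reference, a minimal sketch (field names follow the mmdetection config convention used above; the values are only illustrative) of the config entries that these functions hand to the Runner might look like this:

# Illustrative config fragment consumed by _dist_train/_non_dist_train above.
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))   # optimizer hook settings
lr_config = dict(policy='step', warmup='linear', warmup_iters=500,
                 warmup_ratio=1.0 / 3, step=[8, 11])                # learning-rate schedule hook
checkpoint_config = dict(interval=1)                                # save a checkpoint every epoch
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
workflow = [('train', 1)]                                           # one training epoch per cycle
total_epochs = 12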

Testing

Using tools/test.py

tools/test.py evaluates a trained model. For example, to test a Mask R-CNN model on two GPUs, save the results to results.pkl, and evaluate both bbox and segm:

python tools/test.py configs/mask_rcnn_r50_fpn_1x.py mask_rcnn_r50_fpn_1x_20181010-41d35c05.pth --gpus 2 --out results.pkl --eval bbox segm

For example, the results I got on COCO 2014 are as follows:

Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=397.04s).
Accumulating evaluation results...
DONE (t=62.48s).
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.487
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.725
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.548
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.328
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.540
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.574
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.368
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.591
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.618
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.447
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.670
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.738
Running per image evaluation...
Evaluate annotation type *segm*
DONE (t=436.20s).
Accumulating evaluation results...
DONE (t=62.29s).
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.430
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.683
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.467
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.266
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.476
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.541
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.341
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.533
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.555
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.376
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.605
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.703

This formatted output is the standard set of detection evaluation metrics. It reports AP and AR for both bbox and segm, broken down by small, medium and large objects. The exact meaning of each parameter is documented on the COCO website, here. Under the hood, evaluation in mmdetection is implemented in mmdet.core.evaluation.coco_eval, which follows the COCO protocol and directly calls the pycocotools package from Microsoft's COCO API. The source code is as follows:

import mmcv
import numpy as np
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval


def coco_eval(result_file, result_types, coco, max_dets=(100, 300, 1000)):
    for res_type in result_types:
        assert res_type in [
            'proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'
        ]

    if mmcv.is_str(coco):
        coco = COCO(coco)
    assert isinstance(coco, COCO)

    if result_types == ['proposal_fast']:
        # fast_eval_recall is defined in the same mmdet.core.evaluation module
        ar = fast_eval_recall(result_file, coco, np.array(max_dets))
        for i, num in enumerate(max_dets):
            print('AR@{}\t= {:.4f}'.format(num, ar[i]))
        return

    assert result_file.endswith('.json')
    coco_dets = coco.loadRes(result_file)

    img_ids = coco.getImgIds()
    for res_type in result_types:
        iou_type = 'bbox' if res_type == 'proposal' else res_type
        cocoEval = COCOeval(coco, coco_dets, iou_type)  # build the COCOeval object
        cocoEval.params.imgIds = img_ids
        if res_type == 'proposal':
            cocoEval.params.useCats = 0
            cocoEval.params.maxDets = list(max_dets)
        # call evaluate, accumulate and summarize in turn to score the dataset
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()

As the code shows, coco_eval mainly works by constructing a COCOeval object and then calling evaluate, accumulate and summarize in turn to score the dataset.
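
As a usage sketch (the result file and annotation paths below are hypothetical), evaluating a json result file produced from tools/test.py could look like this:

# Hypothetical paths; coco_eval accepts either a COCO object or the path to an
# annotation json, plus a json result file in COCO result format.
from mmdet.core import coco_eval

coco_eval(
    result_file='results.bbox.json',                        # detection results, COCO json format
    result_types=['bbox'],                                  # evaluate boxes only
    coco='data/coco/annotations/instances_val2014.json')    # ground-truth annotations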

Visualizing predictions

If an X server is available for graphical display, the --show option renders the test images on screen:

python tools/test.py <CONFIG_FILE> <CHECKPOINT_FILE> --show

Here I strongly recommend MobaXterm as a remote-connection tool: it lets you SSH into a Linux server directly from Windows and also forwards remote graphics, so things like plt.show and cv2.imshow just work. A download link is here.
The tools/test.py source itself is fairly concise, so I leave it for you to read on your own.

Model implementation (models)

mmdetection's documentation also gives an overview of how models are implemented; see the TECHNICAL_DETAILS.md file, which most of the content below comes from.
In mmdetection, a model consists of four main parts:

  • backbone: usually a fully convolutional network that extracts feature maps, e.g. ResNet
  • neck: the part connecting the backbone and the heads, e.g. FPN or ASPP; note that only FPN is currently supported, but judging by the open pull requests, models without FPN should be supported soon
  • head: the task-specific part, e.g. bbox prediction or mask prediction
  • ROI extractor: the part that extracts features from the feature maps, e.g. RoI Align

In addition, TECHNICAL_DETAILS.md notes that generic pipelines built from these parts are already implemented, namely SingleStageDetector and TwoStageDetector. Reading these two classes is a good way to understand the basic workings of the framework.
Both SingleStageDetector and TwoStageDetector live in mmdet.models.detectors, implemented in single_stage.py and two_stage.py respectively.
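
To make the four-part split concrete, here is a hedged, abridged sketch of how a config's model dict maps onto backbone / neck / ROI extractor / heads, loosely following the Faster R-CNN R-50-FPN config (real configs carry many more fields; values here are only illustrative):

# Illustrative only -- an abridged model dict in the spirit of configs/faster_rcnn_r50_fpn_1x.py.
model = dict(
    type='FasterRCNN',
    backbone=dict(type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3),
                  frozen_stages=1, style='pytorch'),            # feature extractor
    neck=dict(type='FPN', in_channels=[256, 512, 1024, 2048],
              out_channels=256, num_outs=5),                    # connects backbone and heads
    rpn_head=dict(type='RPNHead', in_channels=256, feat_channels=256),
    bbox_roi_extractor=dict(type='SingleRoIExtractor',
                            roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
                            out_channels=256, featmap_strides=[4, 8, 16, 32]),
    bbox_head=dict(type='SharedFCBBoxHead', num_classes=81))    # task-specific head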

single_stage.py source walkthrough

mmdetection implements a generic single-stage detector in mmdet/models/detectors/single_stage.py; the source code is as follows:

import torch.nn as nn

from .base import BaseDetector
from .. import builder
from ..registry import DETECTORS
from mmdet.core import bbox2result


@DETECTORS.register_module
class SingleStageDetector(BaseDetector):  # inherits from BaseDetector

    def __init__(self,
                 backbone,
                 neck=None,
                 bbox_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        # basic structure: backbone + neck + bbox_head,
        # plus train/test settings: train_cfg + test_cfg
        super(SingleStageDetector, self).__init__()
        self.backbone = builder.build_backbone(backbone)
        if neck is not None:
            self.neck = builder.build_neck(neck)
        self.bbox_head = builder.build_head(bbox_head)
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.init_weights(pretrained=pretrained)

    def init_weights(self, pretrained=None):
        super(SingleStageDetector, self).init_weights(pretrained)
        self.backbone.init_weights(pretrained=pretrained)
        if self.with_neck:
            if isinstance(self.neck, nn.Sequential):
                for m in self.neck:
                    m.init_weights()
            else:
                self.neck.init_weights()
        self.bbox_head.init_weights()

    def extract_feat(self, img):
        x = self.backbone(img)
        if self.with_neck:
            x = self.neck(x)
        return x

    def forward_train(self,
                      img,
                      img_metas,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None):
        # forward pass during training; returns the training losses
        x = self.extract_feat(img)
        outs = self.bbox_head(x)
        loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, self.train_cfg)
        losses = self.bbox_head.loss(
            *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        return losses

    def simple_test(self, img, img_meta, rescale=False):
        # test without augmentation; the returned bbox_results combine
        # det_bboxes and det_labels
        x = self.extract_feat(img)
        outs = self.bbox_head(x)
        bbox_inputs = outs + (img_meta, self.test_cfg, rescale)
        bbox_list = self.bbox_head.get_bboxes(*bbox_inputs)
        bbox_results = [
            bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)
            for det_bboxes, det_labels in bbox_list
        ]
        return bbox_results[0]

    def aug_test(self, imgs, img_metas, rescale=False):
        # test with augmentation (e.g. multi-scale, flipping)
        raise NotImplementedError

two_stage.py source walkthrough

Similarly, mmdetection implements a generic two-stage detector in mmdet/models/detectors/two_stage.py. To keep this post at a reasonable length, only the function signatures are shown below; the longer bodies are omitted:

import torch
import torch.nn as nn

from .base import BaseDetector
from .test_mixins import RPNTestMixin, BBoxTestMixin, MaskTestMixin
from .. import builder
from ..registry import DETECTORS
from mmdet.core import bbox2roi, bbox2result, build_assigner, build_sampler


@DETECTORS.register_module
class TwoStageDetector(BaseDetector, RPNTestMixin, BBoxTestMixin,
                       MaskTestMixin):
    # inherits from BaseDetector, RPNTestMixin, BBoxTestMixin and MaskTestMixin

    def __init__(self,
                 backbone,
                 neck=None,
                 rpn_head=None,
                 bbox_roi_extractor=None,
                 bbox_head=None,
                 mask_roi_extractor=None,
                 mask_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(TwoStageDetector, self).__init__()
        self.backbone = builder.build_backbone(backbone)

        if neck is not None:
            self.neck = builder.build_neck(neck)
        else:
            raise NotImplementedError
        # the neck may not be None for now; models without an FPN-style neck
        # should be supported later

        if rpn_head is not None:
            self.rpn_head = builder.build_head(rpn_head)

        if bbox_head is not None:
            self.bbox_roi_extractor = builder.build_roi_extractor(
                bbox_roi_extractor)
            self.bbox_head = builder.build_head(bbox_head)

        if mask_head is not None:
            self.mask_roi_extractor = builder.build_roi_extractor(
                mask_roi_extractor)
            self.mask_head = builder.build_head(mask_head)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        self.init_weights(pretrained=pretrained)

    @property
    def with_rpn(self):
        return hasattr(self, 'rpn_head') and self.rpn_head is not None

    def init_weights(self, pretrained=None):
        super(TwoStageDetector, self).init_weights(pretrained)
        '''
        not shown here
        '''

    def extract_feat(self, img):
        x = self.backbone(img)
        if self.with_neck:
            x = self.neck(x)
        return x

    def forward_train(self,
                      img,
                      img_meta,
                      gt_bboxes,
                      gt_labels,
                      gt_bboxes_ignore=None,
                      gt_masks=None,
                      proposals=None):
        '''
        not shown here
        '''

    def simple_test(self, img, img_meta, proposals=None, rescale=False):
        """Test without augmentation.

        Returns bbox_results (combining det_bboxes and det_labels) and,
        when a mask head is present, segm_results as well.
        """
        assert self.with_bbox, "Bbox head must be implemented."

        x = self.extract_feat(img)

        proposal_list = self.simple_test_rpn(
            x, img_meta, self.test_cfg.rpn) if proposals is None else proposals

        det_bboxes, det_labels = self.simple_test_bboxes(
            x, img_meta, proposal_list, self.test_cfg.rcnn, rescale=rescale)
        bbox_results = bbox2result(det_bboxes, det_labels,
                                   self.bbox_head.num_classes)

        if not self.with_mask:
            return bbox_results
        else:
            segm_results = self.simple_test_mask(
                x, img_meta, det_bboxes, det_labels, rescale=rescale)
            return bbox_results, segm_results

    def aug_test(self, imgs, img_metas, rescale=False):
        """Test with augmentations.

        If rescale is False, then returned bboxes and masks will fit the
        scale of imgs[0].
        """
        '''
        not shown here
        '''

Custom dataset format walkthrough

mmdetection defines its own annotation format, as described in mmdetection/README.md:
We define a simple annotation format. The annotation of a dataset is a list of dicts, each dict corresponding to an image.
There are 3 fields, filename (relative path), width and height, used for testing,
and an additional field ann for training. ann is also a dict containing at least 2 fields:
bboxes and labels, both of which are numpy arrays. Some datasets may provide
annotations like crowd/difficult/ignored bboxes; we use bboxes_ignore and labels_ignore
to cover them.

Here is an example.

[
    {
        'filename': 'a.jpg',
        'width': 1280,
        'height': 720,
        'ann': {
            'bboxes': <np.ndarray> (n, 4),
            'labels': <np.ndarray> (n, ),
            'bboxes_ignore': <np.ndarray> (k, 4),
            'labels_ignore': <np.ndarray> (k, ) (optional field)
        }
    },
    ...
]

There are two ways to work with custom datasets.

  • online conversion: you can write a new Dataset class inherited from CustomDataset and overwrite two methods, load_annotations(self, ann_file) and get_ann_info(self, idx), like CocoDataset and VOCDataset.

  • offline conversion: you can convert the annotation format to the expected format above and save it to a pickle or json file, like pascal_voc.py, and then simply use CustomDataset (see the sketch after this list).
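
As a hedged sketch of the offline route (the file name and example values are hypothetical; the field names are the middle format listed above), one could dump the converted list with mmcv and point CustomDataset's ann_file at it:

# Hypothetical offline converter: build the middle-format list described
# above and dump it to a pickle file that CustomDataset can load.
import numpy as np
import mmcv

annotations = [
    dict(
        filename='a.jpg',
        width=1280,
        height=720,
        ann=dict(
            bboxes=np.array([[10, 20, 200, 300]], dtype=np.float32),  # (n, 4)
            labels=np.array([1], dtype=np.int64),                     # (n, )
            bboxes_ignore=np.zeros((0, 4), dtype=np.float32),
            labels_ignore=np.zeros((0, ), dtype=np.int64))),
]
mmcv.dump(annotations, 'my_train_annotations.pkl')  # set ann_file to this path in the config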

On top of the CustomDataset base class, the authors provide two concrete dataset classes, CocoDataset and VOCDataset. VOCDataset inherits from XMLDataset, which in turn inherits from CustomDataset. CocoDataset is implemented in mmdet.datasets.coco and contains the members that read COCO annotation information. The source code is as follows:

import numpy as np
from pycocotools.coco import COCO

from .custom import CustomDataset


class CocoDataset(CustomDataset):

    CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
               'train', 'truck', 'boat', 'traffic_light', 'fire_hydrant',
               'stop_sign', 'parking_meter', 'bench', 'bird', 'cat', 'dog',
               'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
               'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
               'skis', 'snowboard', 'sports_ball', 'kite', 'baseball_bat',
               'baseball_glove', 'skateboard', 'surfboard', 'tennis_racket',
               'bottle', 'wine_glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
               'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
               'hot_dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
               'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', 'laptop',
               'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave',
               'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
               'vase', 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush')

    def load_annotations(self, ann_file):
        """Return img_infos.

        img_infos is a list with one dict per image, formatted as:
            {'filename': 'xxx.jpg',
             'width': 1280,
             'height': 720}
        i.e. basic image information. For COCO the dicts also carry extra keys
        such as id, license and url, but those are not needed for training.
        In other words, to train on your own data you only need to override
        load_annotations so that it returns data in the format above.
        """
        self.coco = COCO(ann_file)
        self.cat_ids = self.coco.getCatIds()
        self.cat2label = {
            cat_id: i + 1
            for i, cat_id in enumerate(self.cat_ids)
        }
        self.img_ids = self.coco.getImgIds()
        img_infos = []
        for i in self.img_ids:
            info = self.coco.loadImgs([i])[0]
            info['filename'] = info['file_name']
            img_infos.append(info)
        return img_infos

    def get_ann_info(self, idx):
        """Return the annotation of the image with the given index.

        The annotation is obtained via _parse_ann_info and is a dict that
        must contain:
            {'bboxes': <np.ndarray> (n, 4),
             'labels': <np.ndarray> (n, ),
             'masks': <np.ndarray> (width, height),
             'bboxes_ignore': <np.ndarray> (k, 4),
             'labels_ignore': <np.ndarray> (k, ) (optional; not used in custom.py)}
        """
        img_id = self.img_infos[idx]['id']
        ann_ids = self.coco.getAnnIds(imgIds=[img_id])
        ann_info = self.coco.loadAnns(ann_ids)
        return self._parse_ann_info(ann_info, self.with_mask)

    def _filter_imgs(self, min_size=32):
        """Filter images too small or without ground truths.

        Images with no annotations, or whose shorter side is smaller than
        min_size=32, are dropped.
        """
        valid_inds = []
        ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values())
        for i, img_info in enumerate(self.img_infos):
            if self.img_ids[i] not in ids_with_ann:
                continue
            if min(img_info['width'], img_info['height']) >= min_size:
                valid_inds.append(i)
        return valid_inds

    def _parse_ann_info(self, ann_info, with_mask=True):
        """Parse bbox and mask annotation.

        Args:
            ann_info (list[dict]): Annotation info of an image.
            with_mask (bool): Whether to parse mask annotations.

        Returns:
            dict: A dict containing the following keys: bboxes, bboxes_ignore,
                labels, masks, mask_polys, poly_lens.
        """
        gt_bboxes = []
        gt_labels = []
        gt_bboxes_ignore = []
        # Two formats are provided.
        # 1. mask: a binary map of the same size of the image.
        # 2. polys: each mask consists of one or several polys, each poly is a
        # list of float.
        if with_mask:
            gt_masks = []
            gt_mask_polys = []
            gt_poly_lens = []
        for i, ann in enumerate(ann_info):
            if ann.get('ignore', False):
                continue
            x1, y1, w, h = ann['bbox']
            if ann['area'] <= 0 or w < 1 or h < 1:
                continue
            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]  # x1, y1, x2, y2 (top-left / bottom-right)
            if ann['iscrowd']:
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_labels.append(self.cat2label[ann['category_id']])
            if with_mask:
                gt_masks.append(self.coco.annToMask(ann))
                mask_polys = [
                    p for p in ann['segmentation'] if len(p) >= 6
                ]  # valid polygons have >= 3 points (6 coordinates)
                poly_lens = [len(p) for p in mask_polys]
                gt_mask_polys.append(mask_polys)
                gt_poly_lens.extend(poly_lens)
        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        ann = dict(
            bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore)

        if with_mask:
            ann['masks'] = gt_masks
            # poly format is not used in the current implementation
            ann['mask_polys'] = gt_mask_polys
            ann['poly_lens'] = gt_poly_lens
        return ann

As shown above, CocoDataset overrides the load_annotations and get_ann_info methods of CustomDataset, and these overrides are what convert the COCO format into mmdetection's own dataset format. _parse_ann_info is fairly involved, but the information actually used during training and testing boils down to:

{'bboxes': <np.ndarray> (n, 4),
 'labels': <np.ndarray> (n, ),
 'mask': <np.ndarray> (width, height),
 'bboxes_ignore': <np.ndarray> (k, 4),
}

The VOC conversion is much simpler than the COCO one and can be read on your own. When training on your own data, the recommendation is simply to convert it into the format mmdetection defines above.
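
For the online route, a hedged minimal sketch of a dataset inheriting CustomDataset might look like this (the class name, annotation file contents and class names are hypothetical; only the two methods that CocoDataset also overrides are needed, and they must return the middle format shown above):

# Hypothetical dataset class built on CustomDataset.
import mmcv
import numpy as np
from mmdet.datasets.custom import CustomDataset


class MyDataset(CustomDataset):

    CLASSES = ('cat', 'dog')  # your own class names

    def load_annotations(self, ann_file):
        # ann_file is whatever your own converter produced; parse it into a
        # list of dicts with filename / width / height and keep the raw
        # records around for get_ann_info.
        self.my_annos = mmcv.load(ann_file)  # hypothetical pickle/json written offline
        return [dict(filename=a['filename'], width=a['width'], height=a['height'])
                for a in self.my_annos]

    def get_ann_info(self, idx):
        a = self.my_annos[idx]['ann']
        return dict(
            bboxes=np.array(a['bboxes'], dtype=np.float32),
            labels=np.array(a['labels'], dtype=np.int64),
            bboxes_ignore=np.zeros((0, 4), dtype=np.float32))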

Demo code walkthrough

mmdetection's README.md includes a short snippet for testing images and visualizing the results. Below is demox.py, which runs the test with Mask R-CNN as the model:

import mmcv
from mmcv.runner import load_checkpoint
from mmdet.models import build_detector
from mmdet.apis import inference_detector, show_result

cfg = mmcv.Config.fromfile('configs/mask_rcnn_r50_fpn_1x.py')
cfg.model.pretrained = None

# construct the model and load checkpoint
model = build_detector(cfg.model, test_cfg=cfg.test_cfg)
_ = load_checkpoint(model, 'mask_rcnn_r50_fpn_2x_20181010-41d35c05.pth')

# test a single image
# img = mmcv.imread('img4.jpg')
# result = inference_detector(model, img, cfg)
# print(type(result[0]))
# show_result(img, result[0])

# test a list of images
import glob
imgs = glob.glob('./*.jpg')
for i, result in enumerate(inference_detector(model, imgs, cfg, device='cuda:0')):
    print(i, imgs[i])
    show_result(imgs[i], result)

If running this demo on the pytorch-0.4.1 branch raises the following error,

Traceback (most recent call last):
  File "demox.py", line 23, in <module>
    show_result(imgs[i], result)
  File "/home/niceliu/mmdetection/mmdet/apis/inference.py", line 55, in show_result
    for i, bbox in enumerate(result)
  File "/home/niceliu/mmdetection/mmdet/apis/inference.py", line 55, in <listcomp>
    for i, bbox in enumerate(result)
AttributeError: 'list' object has no attribute 'shape'

you need to modify the show_result function in mmdet/apis/inference.py. The complete modified show_result looks like this:

import mmcv
import numpy as np
import pycocotools.mask as maskUtils

from mmdet.core import get_classes


def show_result(img, result, dataset='coco', score_thr=0.3, out_file=None):
    img = mmcv.imread(img)
    class_names = get_classes(dataset)
    if isinstance(result, tuple):
        bbox_result, segm_result = result
    else:
        bbox_result, segm_result = result, None
    bboxes = np.vstack(bbox_result)
    # draw segmentation masks
    if segm_result is not None:
        segms = mmcv.concat_list(segm_result)
        inds = np.where(bboxes[:, -1] > score_thr)[0]
        for i in inds:
            color_mask = np.random.randint(
                0, 256, (1, 3), dtype=np.uint8)
            mask = maskUtils.decode(segms[i]).astype(np.bool)
            img[mask] = img[mask] * 0.5 + color_mask * 0.5
    # draw bounding boxes
    labels = [
        np.full(bbox.shape[0], i, dtype=np.int32)
        for i, bbox in enumerate(bbox_result)
    ]
    labels = np.concatenate(labels)
    mmcv.imshow_det_bboxes(
        img.copy(),
        bboxes,
        labels,
        class_names=class_names,
        score_thr=score_thr,
        show=out_file is None)

The function to focus on in demox.py is inference_detector, which lives in mmdet/apis/inference.py. The relevant code is:

def _inference_single(model, img, img_transform, cfg, device):
    img = mmcv.imread(img)
    data = _prepare_data(img, img_transform, cfg, device)
    with torch.no_grad():
        result = model(return_loss=False, rescale=True, **data)
    return result


def _inference_generator(model, imgs, img_transform, cfg, device):
    for img in imgs:
        yield _inference_single(model, img, img_transform, cfg, device)


def inference_detector(model, imgs, cfg, device='cuda:0'):
    img_transform = ImageTransform(
        size_divisor=cfg.data.test.size_divisor, **cfg.img_norm_cfg)
    model = model.to(device)
    model.eval()

    if not isinstance(imgs, list):
        return _inference_single(model, imgs, img_transform, cfg, device)
    else:
        return _inference_generator(model, imgs, img_transform, cfg, device)

The code is simple enough to need no further commentary; a few demo result images are shown at the end.