tensorRT-caffe2tensorRT

NVIDIA TensorRT is a high-performance neural network inference engine. It can be applied in areas such as image classification, segmentation, and object detection, and delivers maximum inference efficiency. Below we walk through several of the samples shipped with TensorRT to understand how its acceleration pipeline works.

For image classification, TensorRT provides two samples: MNIST handwritten-digit recognition and GoogLeNet. Both load the model definition and weight files through the caffe parser and then accelerate inference. Here I introduce the first sample, then modify it so that we can test our own images with both TensorRT and caffe and compare the speedup.

sampleMNIST

#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h> //needed because we call the CUDA runtime API directly

#include "NvInfer.h" //主要的头文件
#include "NvCaffeParser.h"//主要的头文件
#include "common.h"
using namespace nvinfer1;
using namespace nvcaffeparser1;

// stuff we know about the network and the caffe input/output blobs
static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;
static Logger gLogger;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
const std::vector<std::string> directories{ "data/samples/mnist/", "data/mnist/" };
std::string locateFile(const std::string& input)
{
return locateFile(input, directories);
}

// simple PGM (portable greyscale map) reader
void readPGMFile(const std::string& fileName, uint8_t buffer[INPUT_H*INPUT_W])
{
readPGMFile(fileName, buffer, INPUT_H, INPUT_W);
}

void caffeToGIEModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory *&gieModelStream) // output buffer for the GIE model
{
// builder
IBuilder* builder = createInferBuilder(gLogger);

// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
locateFile(modelFile, directories).c_str(),
*network,
DataType::kFLOAT);

// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));

// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(1 << 20);

ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);

// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();

// serialize the engine, then close everything down
gieModelStream = engine->serialize();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];

// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);

// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv)
{
//convert the caffe model to a GIE model and create the serialized engine
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel("mnist.prototxt", "mnist.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);

// read one digit image at random
srand(unsigned(time(nullptr)));
uint8_t fileData[INPUT_H*INPUT_W];
int num = rand() % 10;
readPGMFile(locateFile(std::to_string(num) + ".pgm", directories), fileData);

// print an ascii representation
std::cout << "\n\n\n---------------------------" << "\n\n\n" << std::endl;
for (int i = 0; i < INPUT_H*INPUT_W; i++)
std::cout << (" .:-=+*#%@"[fileData[i] / 26]) << (((i + 1) % INPUT_W) ? "" : "\n");

// parse the mean file with the caffe parser and subtract it from the image
ICaffeParser* parser = createCaffeParser();
IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(locateFile("mnist_mean.binaryproto", directories).c_str());
parser->destroy();
const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());

float data[INPUT_H*INPUT_W];
for (int i = 0; i < INPUT_H*INPUT_W; i++)
data[i] = float(fileData[i])-meanData[i];
meanBlob->destroy();

// deserialize gieModelStream into an engine and create the context used to execute it
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
if (gieModelStream) gieModelStream->destroy();

IExecutionContext *context = engine->createExecutionContext();

// run inference
float prob[OUTPUT_SIZE];
doInference(*context, data, prob, 1);//args: context, input, output, batch size

// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();

// print a histogram of the output distribution
std::cout << "\n\n";
float val{0.0f};
int idx{0};
for (unsigned int i = 0; i < 10; i++)
{
val = std::max(val, prob[i]);
if (val == prob[i]) idx = i;
std::cout << i << ": " << std::string(int(std::floor(prob[i] * 10 + 0.5f)), '*') << "\n";
}
std::cout << std::endl;

return (idx == num && val > 0.9f) ? EXIT_SUCCESS : EXIT_FAILURE;
}

Compile and run it, and you get the experimental results shown in the previous post.
The code above boils down to a few key steps (a condensed skeleton follows the list):

  • Step 1: convert the caffe model into a GIE model so the CUDA engine can run inference on it; main function: void caffeToGIEModel();
  • Step 2: deserialize the resulting model stream into a CUDA engine and create the context used to execute inference;
  • Step 3: run the inference computation; main function: void doInference();
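Pulling just those three steps out of main() gives roughly the following skeleton (a minimal sketch against the TensorRT 3.x API used in this sample; error handling omitted):

// Step 1: parse the caffe files, build the engine, and serialize it
IHostMemory* gieModelStream{nullptr};
caffeToGIEModel("mnist.prototxt", "mnist.caffemodel",
                std::vector<std::string>{OUTPUT_BLOB_NAME}, 1, gieModelStream);

// Step 2: deserialize the stream into an engine and create an execution context
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
gieModelStream->destroy();
IExecutionContext* context = engine->createExecutionContext();

// Step 3: run inference on preprocessed input, then tear everything down
float input[INPUT_H * INPUT_W], output[OUTPUT_SIZE];
doInference(*context, input, output, 1);
context->destroy();
engine->destroy();
runtime->destroy();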

Notice that the network-building part above parses caffe's .deploy file directly to obtain the network. In the same folder, TensorRT also ships a sampleMNISTAPI example that instead defines the network directly through the TensorRT API.
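For reference, defining layers through the API looks roughly like this (a hedged sketch against the TensorRT 3.x builder API; the weight buffers conv1WeightData, conv1BiasData, fcWeightData, fcBiasData are hypothetical host arrays loaded elsewhere, and this is not the full sampleMNISTAPI listing):

// Build a tiny MNIST-style network layer by layer instead of parsing a .deploy file.
INetworkDefinition* network = builder->createNetwork();
ITensor* data = network->addInput(INPUT_BLOB_NAME, DataType::kFLOAT, DimsCHW{1, INPUT_H, INPUT_W});

Weights conv1W{DataType::kFLOAT, conv1WeightData, 20 * 1 * 5 * 5}; // hypothetical buffers
Weights conv1B{DataType::kFLOAT, conv1BiasData, 20};
IConvolutionLayer* conv1 = network->addConvolution(*data, 20, DimsHW{5, 5}, conv1W, conv1B);
conv1->setStride(DimsHW{1, 1}); // 28x28 -> 24x24

IPoolingLayer* pool1 = network->addPooling(*conv1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
pool1->setStride(DimsHW{2, 2}); // 24x24 -> 12x12

Weights fcW{DataType::kFLOAT, fcWeightData, 10 * 20 * 12 * 12};
Weights fcB{DataType::kFLOAT, fcBiasData, 10};
IFullyConnectedLayer* fc = network->addFullyConnected(*pool1->getOutput(0), 10, fcW, fcB);

ISoftMaxLayer* prob = network->addSoftMax(*fc->getOutput(0));
prob->getOutput(0)->setName(OUTPUT_BLOB_NAME);
network->markOutput(*prob->getOutput(0));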

Also note that reading the input data does not rely on any third-party library (such as OpenCV), and the data format is .pgm. Next we modify this program slightly to test our own images and measure how long a single image takes.
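The readPGMFile helper in common.h does little more than the following: a binary (P5) PGM file is an ASCII header (magic number, width, height, maximum gray value) followed by raw 8-bit pixels. A minimal sketch, assuming maxval <= 255 and no comment lines in the header:

// Minimal P5 PGM reader (sketch): parse the ASCII header, then read raw pixels.
void readPGM(const std::string& fileName, uint8_t* buffer, int h, int w)
{
    std::ifstream infile(fileName, std::ifstream::binary);
    std::string magic;
    int width, height, maxval;
    infile >> magic >> width >> height >> maxval; // e.g. "P5 28 28 255"
    infile.seekg(1, infile.cur); // skip the single whitespace byte after the header
    infile.read(reinterpret_cast<char*>(buffer), h * w);
}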

sampleMNIST_OpenCV

#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "common.h"
using namespace nvinfer1;
using namespace nvcaffeparser1;

// stuff we know about the network and the caffe input/output blobs
static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;
static Logger gLogger;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
const std::vector<std::string> directories{ "data/samples/mnist/", "data/mnist/" };
std::string locateFile(const std::string& input)
{
return locateFile(input, directories);
}

// simple PGM (portable greyscale map) reader
void readPGMFile(const std::string& fileName, uint8_t buffer[INPUT_H*INPUT_W])
{
readPGMFile(fileName, buffer, INPUT_H, INPUT_W);
}

void caffeToGIEModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory *&gieModelStream) // output buffer for the GIE model
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger);

// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
locateFile(modelFile, directories).c_str(),
*network,
DataType::kFLOAT);

// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));

// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(1 << 20);

ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);

// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();

// serialize the engine, then close everything down
gieModelStream = engine->serialize();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];

// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);

// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}


int main(int argc, char** argv)
{

// create a GIE model from the caffe model and serialize it to a stream
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel("mnist.prototxt", "mnist.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);


// parse the mean file
ICaffeParser* parser = createCaffeParser();
IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(locateFile("mnist_mean.binaryproto", directories).c_str());
parser->destroy();
const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());

// deserialize the engine
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
if (gieModelStream) gieModelStream->destroy();

IExecutionContext *context = engine->createExecutionContext();


if(argc<2){std::cerr<<"usage: "<<argv[0]<<" <image_folder>"<<std::endl;return EXIT_FAILURE;}
const std::string image_folder=argv[1];

for(int i=0;i<10;i++){
const std::string image_name=image_folder+"/"+std::to_string(i)+".png";
cv::Mat mat=cv::imread(image_name,0);
if(!mat.data){std::cerr<<"image read failed: "<<image_name<<std::endl;exit(EXIT_FAILURE);}

cv::resize(mat,mat,cv::Size(INPUT_W,INPUT_H));
mat.convertTo(mat,CV_32FC1);
const float*p=(float*)mat.data;

//subtract it from the image
float data[INPUT_H*INPUT_W];
for (int j = 0; j < INPUT_H*INPUT_W; j++) // j, to avoid shadowing the image loop counter i
data[j] = p[j]-meanData[j];
// run inference
float prob[OUTPUT_SIZE];
long t0=cv::getTickCount();
doInference(*context, data, prob, 1);
float val{0.0f};
int idx{0};
for (unsigned int k = 0; k < 10; k++){
val = std::max(val, prob[k]);
if (val == prob[k]) idx = k;
}
std::cout<<image_name<<":,predicted value: "<<idx<<", probability: "<<val<<std::endl;
long t1=cv::getTickCount();
double secs=(t1-t0)/cv::getTickFrequency();
std::cout<<"********tensorRT takes "<<secs*1000<<"ms **********"<<std::endl;
}

meanBlob->destroy();
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
return 0;
}

Our test images are as follows:

The experimental results are as follows:

niceliu@ise:~/data/TensorRT-3.0.4/tensorRT_MNIST_test/build$ ./TensorRT_MNIST_test ../digit/
../digit//0.png:,predicted value: 2, probability: 0.587642
********tensorRT takes 2.0737ms **********
../digit//1.png:,predicted value: 1, probability: 0.628665
********tensorRT takes 0.218604ms **********
../digit//2.png:,predicted value: 2, probability: 0.999764
********tensorRT takes 0.207929ms **********
../digit//3.png:,predicted value: 3, probability: 0.998379
********tensorRT takes 0.225971ms **********
../digit//4.png:,predicted value: 8, probability: 0.892509
********tensorRT takes 0.204932ms **********
../digit//5.png:,predicted value: 5, probability: 0.997321
********tensorRT takes 0.204456ms **********
../digit//6.png:,predicted value: 5, probability: 0.502507
********tensorRT takes 0.20411ms **********
../digit//7.png:,predicted value: 7, probability: 0.986206
********tensorRT takes 0.203225ms **********
../digit//8.png:,predicted value: 2, probability: 0.781859
********tensorRT takes 0.205882ms **********
../digit//9.png:,predicted value: 1, probability: 0.880363
********tensorRT takes 0.210459ms **********

As you can see, TensorRT averages around 0.2 ms per image. With the same data and the same .caffemodel and .deploy files, running under caffe gives the following results:
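For reference, the caffe-side numbers were obtained with a loop along these lines (a hypothetical sketch built on the standard caffe C++ API, mirroring the TensorRT test above; not the exact benchmark code):

// Sketch: time a single forward pass of the same model under caffe (GPU mode).
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Net<float> net("mnist.prototxt", caffe::TEST);
net.CopyTrainedLayersFrom("mnist.caffemodel");
caffe::Blob<float>* input = net.input_blobs()[0];
memcpy(input->mutable_cpu_data(), data, sizeof(float) * input->count()); // same preprocessed pixels
long t0 = cv::getTickCount();
net.Forward(); // one inference pass
double ms = (cv::getTickCount() - t0) / cv::getTickFrequency() * 1000;
const float* out = net.output_blobs()[0]->cpu_data(); // 10 class probabilities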

Compared with caffe, TensorRT's speedup is quite noticeable: roughly 0.37 ms vs 0.2 ms per image. The experiment code and the CMakeLists.txt file can be found here.

tensorRT_classification

Next we use TensorRT to test a caffemodel directly on image classification, again loading images with OpenCV and converting them to float* before feeding them into TensorRT.

#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "common.h"
using namespace nvinfer1;
using namespace nvcaffeparser1;

// stuff we know about the network and the caffe input/output blobs
static const int N=1;
static const int INPUT_C=3;
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 1000;
static Logger gLogger;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
const std::vector<std::string> directories{ "/home/niceliu/data/mryx_retrieval/models/ResNet/", "data/mnist/" };
std::string locateFile(const std::string& input)
{
return locateFile(input, directories);
}

// simple PGM (portable greyscale map) reader
void readPGMFile(const std::string& fileName, uint8_t buffer[INPUT_H*INPUT_W])
{
readPGMFile(fileName, buffer, INPUT_H, INPUT_W);
}

void caffeToGIEModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory *&gieModelStream) // output buffer for the GIE model
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger);

// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
locateFile(modelFile, directories).c_str(),
*network,
DataType::kFLOAT);

// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));

// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(10 << 20);

ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);

// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();

// serialize the engine, then close everything down
gieModelStream = engine->serialize();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];

// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float))); // NB: 3-channel input here, unlike the MNIST sample
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);

// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}
static bool PairCompare(const std::pair<float, int>& lhs,
const std::pair<float, int>& rhs) {
return lhs.first > rhs.first;
}
/* Return the indices of the top N values of vector v. */
static std::vector<int> Argmax(const std::vector<float>& v, int N) {
std::vector<std::pair<float, int> > pairs;
for (size_t i = 0; i < v.size(); ++i)
pairs.push_back(std::make_pair(v[i], i));
std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare);

std::vector<int> result;
for (int i = 0; i < N; ++i)
result.push_back(pairs[i].second);
return result;
}

int main(int argc, char** argv)
{

// create a GIE model from the caffe model and serialize it to a stream
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel("ResNet-50-deploy.prototxt", "ResNet-50-model.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);


// parse the mean file
ICaffeParser* parser = createCaffeParser();
IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(locateFile("ResNet_mean.binaryproto", directories).c_str());
parser->destroy();
const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());

// deserialize the engine
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
if (gieModelStream) gieModelStream->destroy();

IExecutionContext *context = engine->createExecutionContext();


if(argc<2){std::cerr<<"usage: "<<argv[0]<<" <image>"<<std::endl;return EXIT_FAILURE;}
const std::string image_name=argv[1];
cv::Mat mat=cv::imread(image_name);
if(!mat.data){std::cerr<<"image read failed: "<<image_name<<std::endl;exit(EXIT_FAILURE);}

cv::resize(mat,mat,cv::Size(INPUT_W,INPUT_H));

float* data = new float[N*INPUT_C*INPUT_H*INPUT_W];
// pixel mean used by the Faster R-CNN's author
float pixelMean[3]{ 104.0069879317889f, 116.66876761696767f, 122.6789143406786f }; // also in BGR order
unsigned volCh1=INPUT_H*INPUT_W;
for(int c=0;c<INPUT_C;c++)
{
cv::Mat_<cv::Vec3b>::iterator it=mat.begin<cv::Vec3b>();
// the color image to input should be in BGR order
for(int j=0;j<volCh1;j++)
{
//OpenCV reads images in BGR order by default, so we only need to subtract the per-channel mean
data[c*volCh1+j]=float((*it)[c])-pixelMean[c];
it++;
}

}//the Mat to float* conversion above feels rather time-consuming

// run inference
float prob[OUTPUT_SIZE];
long t0=cv::getTickCount();
doInference(*context, data, prob, 1);
std::vector<float> v_prob(prob,prob+sizeof(prob)/sizeof(float));
std::vector<int> maxN = Argmax(v_prob, 5);//take the top-5 predictions
for (int i = 0; i < 5; ++i) {
int idx = maxN[i];
std::cout<<image_name<<":,predicted value: "<<idx<<", probability: "<<prob[idx]<<std::endl;
}

long t1=cv::getTickCount();
double secs=(t1-t0)/cv::getTickFrequency();
std::cout<<"********tensorRT takes "<<secs*1000<<"ms **********"<<std::endl;
meanBlob->destroy();
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
delete[] data;
return 0;
}

Compiling and running the program gives the following results:

niceliu@ise:~/tensorRT/tensor_cls/build$ ./TensorRT_MNIST_test ../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg
../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg:,predicted value: 951, probability: 0.593339
../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg:,predicted value: 522, probability: 0.206634
../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg:,predicted value: 950, probability: 0.182364
../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg:,predicted value: 852, probability: 0.0130069
../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg:,predicted value: 722, probability: 0.00191788
********tensorRT takes 6.93229ms **********

The test above uses ResNet50, and the top-1 prediction is class 951; checking the ImageNet label file synset_words.txt, line 951 is exactly orange. TensorRT takes about 7 ms per image.
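Mapping the class index back to a label only takes loading that file line by line, e.g. (a small sketch, assuming synset_words.txt is in the working directory and that its line order matches the network's 0-based class indices):

// Look up the human-readable label for the top-1 index (sketch).
std::vector<std::string> labels;
std::ifstream synset("synset_words.txt");
for (std::string line; std::getline(synset, line); )
    labels.push_back(line);
std::cout << "label: " << labels[maxN[0]] << std::endl; // prints something like "n07747607 orange"

Comparing against caffe's stock examples/cpp_classification/classification.bin on the same image gives the following: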

This comparison shows that with the same ResNet50 model, TensorRT is clearly faster than caffe during inference.

Problems

(1) Judging from the test results, the classification confidences TensorRT gives are all much lower than caffe's; I am not sure whether something goes wrong in the cv::Mat to float* conversion, or whether TensorRT loses precision to some degree through weight quantization;
(2) The end-to-end response time when testing an image with TensorRT is slower than caffe's; the data-format conversion efficiency and the way the CMakeLists file is written may both be at fault (see the sketch below);
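For issue (2), the hand-written per-pixel loop can be replaced with OpenCV's vectorized primitives. A sketch (assuming the same mat, data, and pixelMean as above; cv::split writes each BGR plane contiguously, which is exactly the CHW layout TensorRT expects):

// Faster HWC(uint8, BGR) -> CHW(float) conversion with mean subtraction.
cv::Mat floatMat;
mat.convertTo(floatMat, CV_32FC3); // uint8 -> float, still interleaved HWC
std::vector<cv::Mat> chw(INPUT_C);
for (int c = 0; c < INPUT_C; c++) // wrap the CHW buffer in place, no copies
    chw[c] = cv::Mat(INPUT_H, INPUT_W, CV_32FC1, data + c * INPUT_H * INPUT_W);
cv::split(floatMat, chw.data()); // one deinterleaving pass instead of C image scans
for (int c = 0; c < INPUT_C; c++)
    chw[c] -= pixelMean[c]; // per-channel mean subtraction, also vectorized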
Incidentally, I stumbled upon a GitHub project yesterday in which the NVIDIA team implements image recognition, object detection, and image segmentation on top of TensorRT; the code is here.
References:
TensorRT_TX2_20FPS
TensorRT_Tutorial
jetson-inference