tensorRT-caffe2tensorRT

NVIDIA TensorRT is a high-performance neural network inference engine. It can be applied in areas such as image classification, segmentation, and object detection, and delivers maximum inference efficiency. Below we walk through several of the samples shipped with TensorRT to understand how its acceleration pipeline works.

For image classification, TensorRT provides two samples: MNIST handwritten-digit recognition and GoogLeNet. Both load the model definition and weight files through the caffe parser and then accelerate inference. Here I introduce the first sample, then modify it so that we can test our own images with both TensorRT and caffe and compare the speedup.

sampleMNIST

#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h> //needed because we call the CUDA runtime API directly

#include "NvInfer.h" //主要的头文件
#include "NvCaffeParser.h"//主要的头文件
#include "common.h"
using namespace nvinfer1;
using namespace nvcaffeparser1;

// stuff we know about the network and the caffe input/output blobs
static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;
static Logger gLogger;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
const std::vector<std::string> directories{ "data/samples/mnist/", "data/mnist/" };
std::string locateFile(const std::string& input)
{
return locateFile(input, directories);
}

// simple PGM (portable greyscale map) reader
void readPGMFile(const std::string& fileName, uint8_t buffer[INPUT_H*INPUT_W])
{
readPGMFile(fileName, buffer, INPUT_H, INPUT_W);
}

void caffeToGIEModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory *&gieModelStream) // output buffer for the GIE model
{
// builder
IBuilder* builder = createInferBuilder(gLogger);

// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
locateFile(modelFile, directories).c_str(),
*network,
DataType::kFLOAT);

// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));

// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(1 << 20);

ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);

// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();

// serialize the engine, then close everything down
gieModelStream = engine->serialize();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];

// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);

// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv)
{
//convert the caffe model to a GIE model and create the serialized engine
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel("mnist.prototxt", "mnist.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);

// read one digit image at random
srand(unsigned(time(nullptr)));
uint8_t fileData[INPUT_H*INPUT_W];
int num = rand() % 10;
readPGMFile(locateFile(std::to_string(num) + ".pgm", directories), fileData);

// print an ascii representation
std::cout << "\n\n\n---------------------------" << "\n\n\n" << std::endl;
for (int i = 0; i < INPUT_H*INPUT_W; i++)
std::cout << (" .:-=+*#%@"[fileData[i] / 26]) << (((i + 1) % INPUT_W) ? "" : "\n");

// parse the mean file with the caffe parser and subtract it from the image
ICaffeParser* parser = createCaffeParser();
IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(locateFile("mnist_mean.binaryproto", directories).c_str());
parser->destroy();
const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());

float data[INPUT_H*INPUT_W];
for (int i = 0; i < INPUT_H*INPUT_W; i++)
data[i] = float(fileData[i])-meanData[i];
meanBlob->destroy();

// deserialize gieModelStream into an engine and create the context used to execute it
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
if (gieModelStream) gieModelStream->destroy();

IExecutionContext *context = engine->createExecutionContext();

// run inference
float prob[OUTPUT_SIZE];
doInference(*context, data, prob, 1);//args: context, input, output, batch size

// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();

// print a histogram of the output distribution
std::cout << "\n\n";
float val{0.0f};
int idx{0};
for (unsigned int i = 0; i < 10; i++)
{
val = std::max(val, prob[i]);
if (val == prob[i]) idx = i;
std::cout << i << ": " << std::string(int(std::floor(prob[i] * 10 + 0.5f)), '*') << "\n";
}
std::cout << std::endl;

return (idx == num && val > 0.9f) ? EXIT_SUCCESS : EXIT_FAILURE;
}

Compile and run it, and you get the experimental results shown in the previous post.
The code above boils down to a few key steps (a condensed skeleton follows the list):

  • Step 1: convert the caffe model into a GIE model so the CUDA engine can run inference on it; main function: void caffeToGIEModel();
  • Step 2: deserialize the resulting model stream into a CUDA engine and create the context used to execute inference;
  • Step 3: run the inference computation; main function: void doInference();
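Pulling just those three steps out of main() gives roughly the following skeleton (a minimal sketch against the TensorRT 3.x API used in this sample; error handling omitted):

// Step 1: parse the caffe files, build the engine, and serialize it
IHostMemory* gieModelStream{nullptr};
caffeToGIEModel("mnist.prototxt", "mnist.caffemodel",
                std::vector<std::string>{OUTPUT_BLOB_NAME}, 1, gieModelStream);

// Step 2: deserialize the stream into an engine and create an execution context
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
gieModelStream->destroy();
IExecutionContext* context = engine->createExecutionContext();

// Step 3: run inference on preprocessed input, then tear everything down
float input[INPUT_H * INPUT_W], output[OUTPUT_SIZE];
doInference(*context, input, output, 1);
context->destroy();
engine->destroy();
runtime->destroy();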

Notice that the network-building part above parses caffe's .deploy file directly to obtain the network. In the same folder, TensorRT also ships a sampleMNISTAPI example that instead defines the network directly through the TensorRT API.
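For reference, defining layers through the API looks roughly like this (a hedged sketch against the TensorRT 3.x builder API; the weight buffers conv1WeightData, conv1BiasData, fcWeightData, fcBiasData are hypothetical host arrays loaded elsewhere, and this is not the full sampleMNISTAPI listing):

// Build a tiny MNIST-style network layer by layer instead of parsing a .deploy file.
INetworkDefinition* network = builder->createNetwork();
ITensor* data = network->addInput(INPUT_BLOB_NAME, DataType::kFLOAT, DimsCHW{1, INPUT_H, INPUT_W});

Weights conv1W{DataType::kFLOAT, conv1WeightData, 20 * 1 * 5 * 5}; // hypothetical buffers
Weights conv1B{DataType::kFLOAT, conv1BiasData, 20};
IConvolutionLayer* conv1 = network->addConvolution(*data, 20, DimsHW{5, 5}, conv1W, conv1B);
conv1->setStride(DimsHW{1, 1}); // 28x28 -> 24x24

IPoolingLayer* pool1 = network->addPooling(*conv1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
pool1->setStride(DimsHW{2, 2}); // 24x24 -> 12x12

Weights fcW{DataType::kFLOAT, fcWeightData, 10 * 20 * 12 * 12};
Weights fcB{DataType::kFLOAT, fcBiasData, 10};
IFullyConnectedLayer* fc = network->addFullyConnected(*pool1->getOutput(0), 10, fcW, fcB);

ISoftMaxLayer* prob = network->addSoftMax(*fc->getOutput(0));
prob->getOutput(0)->setName(OUTPUT_BLOB_NAME);
network->markOutput(*prob->getOutput(0));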

Also note that reading the input data does not rely on any third-party library (such as OpenCV), and the data format is .pgm. Next we modify this program slightly to test our own images and measure how long a single image takes.
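The readPGMFile helper in common.h does little more than the following: a binary (P5) PGM file is an ASCII header (magic number, width, height, maximum gray value) followed by raw 8-bit pixels. A minimal sketch, assuming maxval <= 255 and no comment lines in the header:

// Minimal P5 PGM reader (sketch): parse the ASCII header, then read raw pixels.
void readPGM(const std::string& fileName, uint8_t* buffer, int h, int w)
{
    std::ifstream infile(fileName, std::ifstream::binary);
    std::string magic;
    int width, height, maxval;
    infile >> magic >> width >> height >> maxval; // e.g. "P5 28 28 255"
    infile.seekg(1, infile.cur); // skip the single whitespace byte after the header
    infile.read(reinterpret_cast<char*>(buffer), h * w);
}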

sampleMNIST_OpenCV

#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "common.h"
using namespace nvinfer1;
using namespace nvcaffeparser1;

// stuff we know about the network and the caffe input/output blobs
static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;
static Logger gLogger;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
const std::vector<std::string> directories{ "data/samples/mnist/", "data/mnist/" };
std::string locateFile(const std::string& input)
{
return locateFile(input, directories);
}

// simple PGM (portable greyscale map) reader
void readPGMFile(const std::string& fileName, uint8_t buffer[INPUT_H*INPUT_W])
{
readPGMFile(fileName, buffer, INPUT_H, INPUT_W);
}

void caffeToGIEModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory *&gieModelStream) // output buffer for the GIE model
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger);

// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
locateFile(modelFile, directories).c_str(),
*network,
DataType::kFLOAT);

// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));

// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(1 << 20);

ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);

// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();

// serialize the engine, then close everything down
gieModelStream = engine->serialize();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];

// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);

// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}


int main(int argc, char** argv)
{

// create a GIE model from the caffe model and serialize it to a stream
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel("mnist.prototxt", "mnist.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);


// parse the mean file
ICaffeParser* parser = createCaffeParser();
IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(locateFile("mnist_mean.binaryproto", directories).c_str());
parser->destroy();
const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());

// deserialize the engine
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
if (gieModelStream) gieModelStream->destroy();

IExecutionContext *context = engine->createExecutionContext();


if(argc<2){std::cerr<<"usage: "<<argv[0]<<" <image_folder>"<<std::endl;return EXIT_FAILURE;}
const std::string image_folder=argv[1];

for(int i=0;i<10;i++){
const std::string image_name=image_folder+"/"+std::to_string(i)+".png";
cv::Mat mat=cv::imread(image_name,0);
if(!mat.data){std::cerr<<"image read failed: "<<image_name<<std::endl;exit(EXIT_FAILURE);}

cv::resize(mat,mat,cv::Size(INPUT_W,INPUT_H));
mat.convertTo(mat,CV_32FC1);
const float*p=(float*)mat.data;

//subtract it from the image
float data[INPUT_H*INPUT_W];
for (int j = 0; j < INPUT_H*INPUT_W; j++) // j, to avoid shadowing the image loop counter i
data[j] = p[j]-meanData[j];
// run inference
float prob[OUTPUT_SIZE];
long t0=cv::getTickCount();
doInference(*context, data, prob, 1);
float val{0.0f};
int idx{0};
for (unsigned int k = 0; k < 10; k++){
val = std::max(val, prob[k]);
if (val == prob[k]) idx = k;
}
std::cout<<image_name<<":,predicted value: "<<idx<<", probability: "<<val<<std::endl;
long t1=cv::getTickCount();
double secs=(t1-t0)/cv::getTickFrequency();
std::cout<<"********tensorRT takes "<<secs*1000<<"ms **********"<<std::endl;
}

meanBlob->destroy();
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
return 0;
}

Our test images are as follows:

The experimental results are as follows:

niceliu@ise:~/data/TensorRT-3.0.4/tensorRT_MNIST_test/build$ ./TensorRT_MNIST_test ../digit/
../digit//0.png:,predicted value: 2, probability: 0.587642
********tensorRT takes 2.0737ms **********
../digit//1.png:,predicted value: 1, probability: 0.628665
********tensorRT takes 0.218604ms **********
../digit//2.png:,predicted value: 2, probability: 0.999764
********tensorRT takes 0.207929ms **********
../digit//3.png:,predicted value: 3, probability: 0.998379
********tensorRT takes 0.225971ms **********
../digit//4.png:,predicted value: 8, probability: 0.892509
********tensorRT takes 0.204932ms **********
../digit//5.png:,predicted value: 5, probability: 0.997321
********tensorRT takes 0.204456ms **********
../digit//6.png:,predicted value: 5, probability: 0.502507
********tensorRT takes 0.20411ms **********
../digit//7.png:,predicted value: 7, probability: 0.986206
********tensorRT takes 0.203225ms **********
../digit//8.png:,predicted value: 2, probability: 0.781859
********tensorRT takes 0.205882ms **********
../digit//9.png:,predicted value: 1, probability: 0.880363
********tensorRT takes 0.210459ms **********

As you can see, TensorRT averages around 0.2 ms per image. With the same data and the same .caffemodel and .deploy files, running under caffe gives the following results:
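For reference, the caffe-side numbers were obtained with a loop along these lines (a hypothetical sketch built on the standard caffe C++ API, mirroring the TensorRT test above; not the exact benchmark code):

// Sketch: time a single forward pass of the same model under caffe (GPU mode).
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Net<float> net("mnist.prototxt", caffe::TEST);
net.CopyTrainedLayersFrom("mnist.caffemodel");
caffe::Blob<float>* input = net.input_blobs()[0];
memcpy(input->mutable_cpu_data(), data, sizeof(float) * input->count()); // same preprocessed pixels
long t0 = cv::getTickCount();
net.Forward(); // one inference pass
double ms = (cv::getTickCount() - t0) / cv::getTickFrequency() * 1000;
const float* out = net.output_blobs()[0]->cpu_data(); // 10 class probabilities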

Compared with caffe, TensorRT's speedup is quite noticeable: roughly 0.37 ms vs 0.2 ms per image. The experiment code and the CMakeLists.txt file can be found here.

tensorRT_classification

Next we use TensorRT to test a caffemodel directly on image classification, again loading images with OpenCV and converting them to float* before feeding them into TensorRT.

#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "common.h"
using namespace nvinfer1;
using namespace nvcaffeparser1;

// stuff we know about the network and the caffe input/output blobs
static const int N=1;
static const int INPUT_C=3;
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 1000;
static Logger gLogger;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
const std::vector<std::string> directories{ "/home/niceliu/data/mryx_retrieval/models/ResNet/", "data/mnist/" };
std::string locateFile(const std::string& input)
{
return locateFile(input, directories);
}

// simple PGM (portable greyscale map) reader
void readPGMFile(const std::string& fileName, uint8_t buffer[INPUT_H*INPUT_W])
{
readPGMFile(fileName, buffer, INPUT_H, INPUT_W);
}

void caffeToGIEModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory *&gieModelStream) // output buffer for the GIE model
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger);

// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
locateFile(modelFile, directories).c_str(),
*network,
DataType::kFLOAT);

// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));

// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(10 << 20);

ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);

// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();

// serialize the engine, then close everything down
gieModelStream = engine->serialize();
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];

// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float))); // NB: 3-channel input here, unlike the MNIST sample
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);

// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}
static bool PairCompare(const std::pair<float, int>& lhs,
const std::pair<float, int>& rhs) {
return lhs.first > rhs.first;
}
/* Return the indices of the top N values of vector v. */
static std::vector<int> Argmax(const std::vector<float>& v, int N) {
std::vector<std::pair<float, int> > pairs;
for (size_t i = 0; i < v.size(); ++i)
pairs.push_back(std::make_pair(v[i], i));
std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare);

std::vector<int> result;
for (int i = 0; i < N; ++i)
result.push_back(pairs[i].second);
return result;
}

int main(int argc, char** argv)
{

// create a GIE model from the caffe model and serialize it to a stream
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel("ResNet-50-deploy.prototxt", "ResNet-50-model.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);


// parse the mean file
ICaffeParser* parser = createCaffeParser();
IBinaryProtoBlob* meanBlob = parser->parseBinaryProto(locateFile("ResNet_mean.binaryproto", directories).c_str());
parser->destroy();
const float *meanData = reinterpret_cast<const float*>(meanBlob->getData());

// deserialize the engine
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
if (gieModelStream) gieModelStream->destroy();

IExecutionContext *context = engine->createExecutionContext();


if(argc<2){std::cerr<<"usage: "<<argv[0]<<" <image>"<<std::endl;return EXIT_FAILURE;}
const std::string image_name=argv[1];
cv::Mat mat=cv::imread(image_name);
if(!mat.data){std::cerr<<"image read failed: "<<image_name<<std::endl;exit(EXIT_FAILURE);}

cv::resize(mat,mat,cv::Size(INPUT_W,INPUT_H));

float* data = new float[N*INPUT_C*INPUT_H*INPUT_W];
// pixel mean used by the Faster R-CNN's author
float pixelMean[3]{ 104.0069879317889f, 116.66876761696767f, 122.6789143406786f }; // also in BGR order
unsigned volCh1=INPUT_H*INPUT_W;
for(int c=0;c<INPUT_C;c++)
{
cv::Mat_<cv::Vec3b>::iterator it=mat.begin<cv::Vec3b>();
// the color image to input should be in BGR order
for(int j=0;j<volCh1;j++)
{
//OpenCV reads images in BGR order by default, so we only need to subtract the per-channel mean
data[c*volCh1+j]=float((*it)[c])-pixelMean[c];
it++;
}

}//the Mat to float* conversion above feels rather time-consuming

// run inference
float prob[OUTPUT_SIZE];
long t0=cv::getTickCount();
doInference(*context, data, prob, 1);
std::vector<float> v_prob(prob,prob+sizeof(prob)/sizeof(float));
std::vector<int> maxN = Argmax(v_prob, 5);//take the top-5 predictions
for (int i = 0; i < 5; ++i) {
int idx = maxN[i];
std::cout<<image_name<<":,predicted value: "<<idx<<", probability: "<<prob[idx]<<std::endl;
}

long t1=cv::getTickCount();
double secs=(t1-t0)/cv::getTickFrequency();
std::cout<<"********tensorRT takes "<<secs*1000<<"ms **********"<<std::endl;
meanBlob->destroy();
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
delete[] data;
return 0;
}

Compiling and running the program gives the following results:

niceliu@ise:~/tensorRT/tensor_cls/build$ ./TensorRT_MNIST_test ../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg
../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg:,predicted value: 951, probability: 0.593339
../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg:,predicted value: 522, probability: 0.206634
../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg:,predicted value: 950, probability: 0.182364
../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg:,predicted value: 852, probability: 0.0130069
../../../data/jetson-inference/build/x86_64/bin/orange_0.jpg:,predicted value: 722, probability: 0.00191788
********tensorRT takes 6.93229ms **********

The test above uses ResNet50, and the top-1 prediction is class 951; checking the ImageNet label file synset_words.txt, line 951 is exactly orange. TensorRT takes about 7 ms per image.
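Mapping the class index back to a label only takes loading that file line by line, e.g. (a small sketch, assuming synset_words.txt is in the working directory and that its line order matches the network's 0-based class indices):

// Look up the human-readable label for the top-1 index (sketch).
std::vector<std::string> labels;
std::ifstream synset("synset_words.txt");
for (std::string line; std::getline(synset, line); )
    labels.push_back(line);
std::cout << "label: " << labels[maxN[0]] << std::endl; // prints something like "n07747607 orange"

Comparing against caffe's stock examples/cpp_classification/classification.bin on the same image gives the following: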

This comparison shows that with the same ResNet50 model, TensorRT is clearly faster than caffe during inference.

Problems

(1) Judging from the test results, the classification confidences TensorRT gives are all much lower than caffe's; I am not sure whether something goes wrong in the cv::Mat to float* conversion, or whether TensorRT loses precision to some degree through weight quantization;
(2) The end-to-end response time when testing an image with TensorRT is slower than caffe's; the data-format conversion efficiency and the way the CMakeLists file is written may both be at fault (see the sketch below);
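For issue (2), the hand-written per-pixel loop can be replaced with OpenCV's vectorized primitives. A sketch (assuming the same mat, data, and pixelMean as above; cv::split writes each BGR plane contiguously, which is exactly the CHW layout TensorRT expects):

// Faster HWC(uint8, BGR) -> CHW(float) conversion with mean subtraction.
cv::Mat floatMat;
mat.convertTo(floatMat, CV_32FC3); // uint8 -> float, still interleaved HWC
std::vector<cv::Mat> chw(INPUT_C);
for (int c = 0; c < INPUT_C; c++) // wrap the CHW buffer in place, no copies
    chw[c] = cv::Mat(INPUT_H, INPUT_W, CV_32FC1, data + c * INPUT_H * INPUT_W);
cv::split(floatMat, chw.data()); // one deinterleaving pass instead of C image scans
for (int c = 0; c < INPUT_C; c++)
    chw[c] -= pixelMean[c]; // per-channel mean subtraction, also vectorized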
Incidentally, I stumbled upon a GitHub project yesterday in which the NVIDIA team implements image recognition, object detection, and image segmentation on top of TensorRT; the code is here.
References:
TensorRT_TX2_20FPS
TensorRT_Tutorial
jetson-inference