In this notebook we walk through, step by step, how to use TensorRT to optimize a trained MLP recommender system and accelerate inference.
After training the model, you should have two new files, one of which is the frozen graph (sampleMovieLens.pb).
Now that you have the frozen graph, convert it to Universal Framework Format (UFF). TensorRT ships with a UFF toolkit that can be called from the command line.
convert-to-uff tensorflow --input-file sampleMovieLens.pb -o sampleMovieLens.uff -O prediction/Sigmoid
After this, you should see a sampleMovieLens.uff file in the '/data' directory.
// Create the UFF parser.
auto parser = nvuffparser::createUffParser();
// Specify the inputs and outputs of the network.
parser->registerInput(config::kUSER_BLOB_NAME, inputDims, nvuffparser::UffInputOrder::kNCHW);
parser->registerInput(config::kITEM_BLOB_NAME, inputDims, nvuffparser::UffInputOrder::kNCHW);
parser->registerOutput(config::kUFF_OUTPUT_NODE);
// Create the builder.
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(config::gLogger);
nvinfer1::INetworkDefinition* network = builder->createNetwork();
// Parse the UFF model and populate the network. kDTYPE can be either kFLOAT (fp32) or kHALF (fp16).
bool parsingSuccessful = parser->parse(config::UFF_MODEL_FILE.c_str(), *network, config::kDTYPE);
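If parsing fails (for example, because the UFF file path is wrong), it is best to stop right away. A minimal sketch of such a check, reusing the sample's config::logThrowError helper (the sample's exact error handling may differ):
// Abort early if the UFF model could not be parsed (sketch).
if (!parsingSuccessful)
    config::logThrowError("Failed to parse UFF model: " + config::UFF_MODEL_FILE);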
By default, the TensorFlow model outputs several recommendations. Because we only want to keep the best ones, we can add a TopK layer to the network. TopK does not modify the output values (it only discards the low-ranking ones), so it is perfectly safe to add this layer to an already trained model.
// Retrieve the last layer of the UFF network.
auto uffLastLayer = network->getLayer(network->getNbLayers()-1);
// Reshape the fully connected layer output from numOfMovies x 1 x 1 x 1 to 1 x numOfMovies x 1.
auto reshapeLayer = network->addShuffle(*uffLastLayer->getOutput(0));
reshapeLayer->setReshapeDimensions(nvinfer1::Dims3{1, config::kNUM_MOVIES_PER_USER, 1});
// Apply a TopK layer to retrieve the item probabilities and corresponding index numbers.
auto topK = network->addTopK(*reshapeLayer->getOutput(0), nvinfer1::TopKOperation::kMAX, config::kTOPK_MOVIES, 0x2);
// Mark the TopK tensors as outputs.
network->markOutput(*topK->getOutput(0));
network->markOutput(*topK->getOutput(1));
// TopK indices are 32-bit integers.
topK->getOutput(1)->setType(nvinfer1::DataType::kINT32);
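At inference time, the TopK outputs are looked up by the binding names config::kTOPK_ITEM_PROB and config::kTOPK_ITEM_NAME. A sketch of how those names could be attached to the tensors, assuming the sample does not assign them elsewhere:
// Name the TopK outputs so they can be found via getBindingIndex later (sketch).
topK->getOutput(0)->setName(config::kTOPK_ITEM_PROB);
topK->getOutput(1)->setName(config::kTOPK_ITEM_NAME);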
Now that our network has a TopK layer, we can build a TensorRT engine. This single step automatically performs:
(1) Layer and tensor fusion
(2) Reduced-precision optimization
(3) Kernel auto-tuning for the target GPU
(4) Dynamic tensor memory management
// Specify the total number of users (the batch size).
builder->setMaxBatchSize(config::kNUM_USERS);
// The workspace size is the amount of device memory available to the builder while building an engine.
builder->setMaxWorkspaceSize(1_GB);
// Build the engine.
nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);
// Now that the engine is built, we can clean up.
network->destroy();
builder->destroy();
parser->destroy();
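If config::kDTYPE is kHALF, the builder also needs to be told to use reduced precision before buildCudaEngine is called. A sketch, assuming a TensorRT version that exposes IBuilder::setFp16Mode (older releases use setHalf2Mode instead):
// Request FP16 kernels at build time when the weights were parsed as kHALF and
// the GPU has fast FP16 support (place this before the buildCudaEngine call above).
if (config::kDTYPE == nvinfer1::DataType::kHALF && builder->platformHasFastFp16())
    builder->setFp16Mode(true);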
Since we need to share the engine with child processes, we now serialize it and save it to a shared memory buffer. Note that you can share the engine as you would share any other data across processes, e.g. by writing to and reading from disk, using sockets, or even using a cloud service if you are so inclined.
// Helper function that creates a shared buffer, then copies data to it.
void createSharedBuffer(const void* data, size_t size, const char* bufferName)
{
    // Create a shared memory object for the serialized engine.
    int fd = shm_open(bufferName, O_RDWR | O_CREAT, 0666);
    if (fd < 0) config::logThrowError("Could not create file descriptor: /dev/shm" + std::string{bufferName});
    // Grow the object to the required size, then map it into this process.
    fallocate(fd, 0, 0, size);
    void* sharedBuffer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    close(fd);
    // Copy the data to the shared buffer.
    std::memcpy(sharedBuffer, data, size);
}
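The helper leaves the mapping in place so the serialized engine stays visible to other processes. Once every process is finished with it, the buffer can be released; a sketch of the corresponding cleanup (not part of the sample excerpt above):
// Unmap this process's view and remove the named buffer from /dev/shm
// (sketch; call only after all processes are done with the shared engine).
void destroySharedBuffer(void* data, size_t size, const char* bufferName)
{
    munmap(data, size);
    shm_unlink(bufferName);
}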
// Serialize the engine for sharing, then destroy it.
nvinfer1::IHostMemory* serializedEngine = engine->serialize();
engine->destroy();
// Copy the serialized engine to a new shared buffer.
createSharedBuffer(serializedEngine->data(), serializedEngine->size(), config::kSHM_SHARED_ENGINE);
// The serialized engine is now in shared memory, so destroy the original buffer.
serializedEngine->destroy();
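With the serialized engine sitting in shared memory, each worker process can map it and build its own runtime. A rough sketch of spawning such a worker with fork (the process-management details here are an assumption, not the sample's exact mechanism):
// Sketch: fork a worker; the child maps the shared engine with the helper shown
// next, deserializes it, runs inference, and exits.
pid_t pid = fork();
if (pid == 0)
{
    // ... load the shared buffer, deserialize the engine, run inference ...
    _exit(0);
}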
// Helper function that opens a shared buffer and sets data/size accordingly.
void loadSharedBuffer(void*& data, size_t& size, const char* bufferName)
{
    // Open a file descriptor for the shared buffer.
    int fd = shm_open(bufferName, O_RDONLY, 0666);
    if (fd < 0) config::logThrowError("Could not open file descriptor: /dev/shm" + std::string{bufferName});
    // Get the size of the shared memory buffer.
    struct stat sb;
    fstat(fd, &sb);
    size = sb.st_size;
    if (size <= 0) config::logThrowError("Failed to fetch model stream from shared memory buffer.");
    // Map the shared buffer and close the file descriptor.
    data = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
    close(fd);
}
// Get the shared buffer.
loadSharedBuffer(serializedEngineData, serializedEngineSize, config::kSHM_SHARED_ENGINE);
Retrieve the engine from the serialized format:
// Use a TensorRT IRuntime to deserialize the engine.
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(config::gLogger);
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(serializedEngineData, serializedEngineSize, nullptr);
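Each process also needs an execution context before it can execute the engine; all the enqueue calls below go through one (presumably stored as mContext on the Batch object used later):
// Create an execution context for the deserialized engine.
nvinfer1::IExecutionContext* context = engine->createExecutionContext();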
Running an engine involves three steps:
1. Transferring inputs to device memory.
2. Executing with context->enqueue.
3. Transferring the outputs back to host memory.
// Copy the inputs from host to device.
config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[userInputIndex], b.mHostMemory[userInputIndex], b.mMemSizes[userInputIndex], cudaMemcpyHostToDevice, b.mStream));
config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[itemInputIndex], b.mHostMemory[itemInputIndex], b.mMemSizes[itemInputIndex], cudaMemcpyHostToDevice, b.mStream));
// Do inference.
b.mContext->enqueue(config::kNUM_USERS, b.mDeviceMemory, b.mStream, nullptr);
// Copy the outputs from device to host.
config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputPredictionIndex], b.mDeviceMemory[outputPredictionIndex], b.mMemSizes[outputPredictionIndex], cudaMemcpyDeviceToHost, b.mStream));
config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemProbIndex], b.mDeviceMemory[outputItemProbIndex], b.mMemSizes[outputItemProbIndex], cudaMemcpyDeviceToHost, b.mStream));
config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemNameIndex], b.mDeviceMemory[outputItemNameIndex], b.mMemSizes[outputItemNameIndex], cudaMemcpyDeviceToHost, b.mStream));
cudaStreamSynchronize(b.mStream);
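The Batch object b already owns pinned host buffers, device buffers, and a CUDA stream, indexed by engine binding; that setup is not shown in the excerpt. A minimal sketch for a single binding (the element type and size are placeholders, not the sample's actual values):
// Sketch: allocate pinned host memory, device memory, and a stream for one binding.
int bindingIndex = b.mEngine->getBindingIndex(config::kUSER_BLOB_NAME);
size_t byteSize = config::kNUM_USERS * sizeof(float);  // Assumed element type and count.
b.mMemSizes[bindingIndex] = byteSize;
config::checkCUDA(cudaMallocHost(&b.mHostMemory[bindingIndex], byteSize));  // Pinned host memory for async copies.
config::checkCUDA(cudaMalloc(&b.mDeviceMemory[bindingIndex], byteSize));
config::checkCUDA(cudaStreamCreate(&b.mStream));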
// Helper function that does inference on a single batch and returns the inference time in microseconds.
float timeBatchInference(Batch& b)
{
    // Get the input binding indices.
    int userInputIndex = b.mEngine->getBindingIndex(config::kUSER_BLOB_NAME);
    int itemInputIndex = b.mEngine->getBindingIndex(config::kITEM_BLOB_NAME);
    // Get the output binding indices.
    int outputPredictionIndex = b.mEngine->getBindingIndex(config::kUFF_OUTPUT_NODE);
    int outputItemProbIndex = b.mEngine->getBindingIndex(config::kTOPK_ITEM_PROB);
    int outputItemNameIndex = b.mEngine->getBindingIndex(config::kTOPK_ITEM_NAME);
    // Run and time inference.
    GPUTimer timer{b.mStream};
    timer.start();
    {
        // Copy the inputs from host to device.
        config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[userInputIndex], b.mHostMemory[userInputIndex], b.mMemSizes[userInputIndex], cudaMemcpyHostToDevice, b.mStream));
        config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[itemInputIndex], b.mHostMemory[itemInputIndex], b.mMemSizes[itemInputIndex], cudaMemcpyHostToDevice, b.mStream));
        // Do inference.
        b.mContext->enqueue(config::kNUM_USERS, b.mDeviceMemory, b.mStream, nullptr);
        // Copy the outputs from device to host.
        config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputPredictionIndex], b.mDeviceMemory[outputPredictionIndex], b.mMemSizes[outputPredictionIndex], cudaMemcpyDeviceToHost, b.mStream));
        config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemProbIndex], b.mDeviceMemory[outputItemProbIndex], b.mMemSizes[outputItemProbIndex], cudaMemcpyDeviceToHost, b.mStream));
        config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemNameIndex], b.mDeviceMemory[outputItemNameIndex], b.mMemSizes[outputItemNameIndex], cudaMemcpyDeviceToHost, b.mStream));
        cudaStreamSynchronize(b.mStream);
    }
    // Inference done; stop the timer.
    timer.stop();
    return timer.microseconds();
}
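The GPUTimer used above is not part of this excerpt; a minimal sketch of a CUDA-event based timer matching the interface used here (an assumption about how the sample implements it):
// Sketch: time work submitted to a stream using CUDA events.
struct GPUTimer
{
    cudaStream_t mStream;
    cudaEvent_t mStart, mStop;
    explicit GPUTimer(cudaStream_t stream) : mStream(stream)
    {
        cudaEventCreate(&mStart);
        cudaEventCreate(&mStop);
    }
    ~GPUTimer()
    {
        cudaEventDestroy(mStart);
        cudaEventDestroy(mStop);
    }
    void start() { cudaEventRecord(mStart, mStream); }
    void stop()
    {
        cudaEventRecord(mStop, mStream);
        cudaEventSynchronize(mStop);
    }
    float microseconds()
    {
        float ms = 0.f;
        cudaEventElapsedTime(&ms, mStart, mStop);  // Reported in milliseconds.
        return ms * 1000.f;
    }
};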