In this notebook we walk through, step by step, how to use TensorRT to optimize a trained MLP recommender system and accelerate inference.
After training the model, you should have two new files, one of which is the frozen graph (sampleMovieLens.pb).
Now that you have the frozen graph, convert it to Universal Framework Format (UFF). TensorRT ships with a UFF toolkit that can be called from the command line.
convert-to-uff tensorflow --input-file sampleMovieLens.pb -o sampleMovieLens.uff -O prediction/Sigmoid
After this, you should see a sampleMovieLens.uff file in the '/data' directory.
// Create the UFF parser.
auto parser = nvuffparser::createUffParser();
// Specify the inputs and outputs of the network.
parser->registerInput(config::kUSER_BLOB_NAME, inputDims, nvuffparser::UffInputOrder::kNCHW);
parser->registerInput(config::kITEM_BLOB_NAME, inputDims, nvuffparser::UffInputOrder::kNCHW);
parser->registerOutput(config::kUFF_OUTPUT_NODE);
// Create the builder.
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(config::gLogger);
nvinfer1::INetworkDefinition* network = builder->createNetwork();
// Parse the UFF model and populate the network. kDTYPE can be either kFLOAT (fp32) or kHALF (fp16).
bool parsingSuccessful = parser->parse(config::UFF_MODEL_FILE.c_str(), *network, config::kDTYPE);
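If parsing fails (for example, because the UFF file path is wrong), it is best to stop right away. A minimal sketch of such a check, reusing the sample's config::logThrowError helper (the sample's exact error handling may differ):
// Abort early if the UFF model could not be parsed (sketch).
if (!parsingSuccessful)
    config::logThrowError("Failed to parse UFF model: " + config::UFF_MODEL_FILE);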
By default, the TensorFlow model outputs several recommendations. Because we only want to keep the best ones, we can add a TopK layer to the network. TopK does not modify the output values (it only discards the low-ranking ones), so it is perfectly safe to add this layer to an already trained model.
// Retrieve the last layer of the UFF network.
auto uffLastLayer = network->getLayer(network->getNbLayers()-1);
// Reshape the fully connected layer output from numOfMovies x 1 x 1 x 1 to 1 x numOfMovies x 1.
auto reshapeLayer = network->addShuffle(*uffLastLayer->getOutput(0));
reshapeLayer->setReshapeDimensions(nvinfer1::Dims3{1, config::kNUM_MOVIES_PER_USER, 1});
// Apply a TopK layer to retrieve the item probabilities and corresponding index numbers.
auto topK = network->addTopK(*reshapeLayer->getOutput(0), nvinfer1::TopKOperation::kMAX, config::kTOPK_MOVIES, 0x2);
// Mark the TopK tensors as outputs.
network->markOutput(*topK->getOutput(0));
network->markOutput(*topK->getOutput(1));
// TopK indices are 32-bit integers.
topK->getOutput(1)->setType(nvinfer1::DataType::kINT32);
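At inference time, the TopK outputs are looked up by the binding names config::kTOPK_ITEM_PROB and config::kTOPK_ITEM_NAME. A sketch of how those names could be attached to the tensors, assuming the sample does not assign them elsewhere:
// Name the TopK outputs so they can be found via getBindingIndex later (sketch).
topK->getOutput(0)->setName(config::kTOPK_ITEM_PROB);
topK->getOutput(1)->setName(config::kTOPK_ITEM_NAME);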
Now that our network has a TopK layer, we can build a TensorRT engine. This single step automatically performs:
(1) Layer and tensor fusion
(2) Reduced-precision optimization
(3) Kernel auto-tuning for the target GPU
(4) Dynamic tensor memory management
// Specify the total number of users (the batch size).
builder->setMaxBatchSize(config::kNUM_USERS);
// The workspace size is the amount of device memory available to the builder while building an engine.
builder->setMaxWorkspaceSize(1_GB);
// Build the engine.
nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);
// Now that the engine is built, we can clean up.
network->destroy();
builder->destroy();
parser->destroy();
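If config::kDTYPE is kHALF, the builder also needs to be told to use reduced precision before buildCudaEngine is called. A sketch, assuming a TensorRT version that exposes IBuilder::setFp16Mode (older releases use setHalf2Mode instead):
// Request FP16 kernels at build time when the weights were parsed as kHALF and
// the GPU has fast FP16 support (place this before the buildCudaEngine call above).
if (config::kDTYPE == nvinfer1::DataType::kHALF && builder->platformHasFastFp16())
    builder->setFp16Mode(true);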
Since we need to share the engine with child processes, we now serialize it and save it to a shared memory buffer. Note that you can share the engine as you would share any other data across processes, e.g. by writing to and reading from disk, using sockets, or even using a cloud service if you are so inclined.
// Helper function that creates a shared buffer, then copies data to it.
void createSharedBuffer(const void* data, size_t size, const char* bufferName)
{
    // Create a shared memory object for the serialized engine.
    int fd = shm_open(bufferName, O_RDWR | O_CREAT, 0666);
    if (fd < 0) config::logThrowError("Could not create file descriptor: /dev/shm" + std::string{bufferName});
    // Grow the object to the required size, then map it into this process.
    fallocate(fd, 0, 0, size);
    void* sharedBuffer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    close(fd);
    // Copy the data to the shared buffer.
    std::memcpy(sharedBuffer, data, size);
}
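The helper leaves the mapping in place so the serialized engine stays visible to other processes. Once every process is finished with it, the buffer can be released; a sketch of the corresponding cleanup (not part of the sample excerpt above):
// Unmap this process's view and remove the named buffer from /dev/shm
// (sketch; call only after all processes are done with the shared engine).
void destroySharedBuffer(void* data, size_t size, const char* bufferName)
{
    munmap(data, size);
    shm_unlink(bufferName);
}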
// Serialize the engine for sharing, then destroy it.
nvinfer1::IHostMemory* serializedEngine = engine->serialize();
engine->destroy();
// Copy the serialized engine to a new shared buffer.
createSharedBuffer(serializedEngine->data(), serializedEngine->size(), config::kSHM_SHARED_ENGINE);
// The serialized engine is now in shared memory, so destroy the original buffer.
serializedEngine->destroy();
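With the serialized engine sitting in shared memory, each worker process can map it and build its own runtime. A rough sketch of spawning such a worker with fork (the process-management details here are an assumption, not the sample's exact mechanism):
// Sketch: fork a worker; the child maps the shared engine with the helper shown
// next, deserializes it, runs inference, and exits.
pid_t pid = fork();
if (pid == 0)
{
    // ... load the shared buffer, deserialize the engine, run inference ...
    _exit(0);
}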
// Helper function that opens a shared buffer and sets data/size accordingly.
void loadSharedBuffer(void*& data, size_t& size, const char* bufferName)
{
    // Open a file descriptor for the shared buffer.
    int fd = shm_open(bufferName, O_RDONLY, 0666);
    if (fd < 0) config::logThrowError("Could not open file descriptor: /dev/shm" + std::string{bufferName});
    // Get the size of the shared memory buffer.
    struct stat sb;
    fstat(fd, &sb);
    size = sb.st_size;
    if (size <= 0) config::logThrowError("Failed to fetch model stream from shared memory buffer.");
    // Map the shared buffer and close the file descriptor.
    data = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
    close(fd);
}
// Get the shared buffer.
loadSharedBuffer(serializedEngineData, serializedEngineSize, config::kSHM_SHARED_ENGINE);
Retrieve the engine from the serialized format:
// Use a TensorRT IRuntime to deserialize the engine.
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(config::gLogger);
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(serializedEngineData, serializedEngineSize, nullptr);
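Each process also needs an execution context before it can execute the engine; all the enqueue calls below go through one (presumably stored as mContext on the Batch object used later):
// Create an execution context for the deserialized engine.
nvinfer1::IExecutionContext* context = engine->createExecutionContext();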
Running an engine involves three steps:
1. Transferring inputs to device memory.
2. Executing with context->enqueue.
3. Transferring the outputs back to host memory.
// Copy the inputs from host to device.
config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[userInputIndex], b.mHostMemory[userInputIndex], b.mMemSizes[userInputIndex], cudaMemcpyHostToDevice, b.mStream));
config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[itemInputIndex], b.mHostMemory[itemInputIndex], b.mMemSizes[itemInputIndex], cudaMemcpyHostToDevice, b.mStream));
// Do inference.
b.mContext->enqueue(config::kNUM_USERS, b.mDeviceMemory, b.mStream, nullptr);
// Copy the outputs from device to host.
config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputPredictionIndex], b.mDeviceMemory[outputPredictionIndex], b.mMemSizes[outputPredictionIndex], cudaMemcpyDeviceToHost, b.mStream));
config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemProbIndex], b.mDeviceMemory[outputItemProbIndex], b.mMemSizes[outputItemProbIndex], cudaMemcpyDeviceToHost, b.mStream));
config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemNameIndex], b.mDeviceMemory[outputItemNameIndex], b.mMemSizes[outputItemNameIndex], cudaMemcpyDeviceToHost, b.mStream));
cudaStreamSynchronize(b.mStream);
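The Batch object b already owns pinned host buffers, device buffers, and a CUDA stream, indexed by engine binding; that setup is not shown in the excerpt. A minimal sketch for a single binding (the element type and size are placeholders, not the sample's actual values):
// Sketch: allocate pinned host memory, device memory, and a stream for one binding.
int bindingIndex = b.mEngine->getBindingIndex(config::kUSER_BLOB_NAME);
size_t byteSize = config::kNUM_USERS * sizeof(float);  // Assumed element type and count.
b.mMemSizes[bindingIndex] = byteSize;
config::checkCUDA(cudaMallocHost(&b.mHostMemory[bindingIndex], byteSize));  // Pinned host memory for async copies.
config::checkCUDA(cudaMalloc(&b.mDeviceMemory[bindingIndex], byteSize));
config::checkCUDA(cudaStreamCreate(&b.mStream));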
// Helper function that does inference on a single batch and returns the inference time in microseconds.
float timeBatchInference(Batch& b)
{
    // Get the input binding indices.
    int userInputIndex = b.mEngine->getBindingIndex(config::kUSER_BLOB_NAME);
    int itemInputIndex = b.mEngine->getBindingIndex(config::kITEM_BLOB_NAME);
    // Get the output binding indices.
    int outputPredictionIndex = b.mEngine->getBindingIndex(config::kUFF_OUTPUT_NODE);
    int outputItemProbIndex = b.mEngine->getBindingIndex(config::kTOPK_ITEM_PROB);
    int outputItemNameIndex = b.mEngine->getBindingIndex(config::kTOPK_ITEM_NAME);
    // Run and time inference.
    GPUTimer timer{b.mStream};
    timer.start();
    {
        // Copy the inputs from host to device.
        config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[userInputIndex], b.mHostMemory[userInputIndex], b.mMemSizes[userInputIndex], cudaMemcpyHostToDevice, b.mStream));
        config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[itemInputIndex], b.mHostMemory[itemInputIndex], b.mMemSizes[itemInputIndex], cudaMemcpyHostToDevice, b.mStream));
        // Do inference.
        b.mContext->enqueue(config::kNUM_USERS, b.mDeviceMemory, b.mStream, nullptr);
        // Copy the outputs from device to host.
        config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputPredictionIndex], b.mDeviceMemory[outputPredictionIndex], b.mMemSizes[outputPredictionIndex], cudaMemcpyDeviceToHost, b.mStream));
        config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemProbIndex], b.mDeviceMemory[outputItemProbIndex], b.mMemSizes[outputItemProbIndex], cudaMemcpyDeviceToHost, b.mStream));
        config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemNameIndex], b.mDeviceMemory[outputItemNameIndex], b.mMemSizes[outputItemNameIndex], cudaMemcpyDeviceToHost, b.mStream));
        cudaStreamSynchronize(b.mStream);
    }
    // Inference done; stop the timer.
    timer.stop();
    return timer.microseconds();
}
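The GPUTimer used above is not part of this excerpt; a minimal sketch of a CUDA-event based timer matching the interface used here (an assumption about how the sample implements it):
// Sketch: time work submitted to a stream using CUDA events.
struct GPUTimer
{
    cudaStream_t mStream;
    cudaEvent_t mStart, mStop;
    explicit GPUTimer(cudaStream_t stream) : mStream(stream)
    {
        cudaEventCreate(&mStart);
        cudaEventCreate(&mStop);
    }
    ~GPUTimer()
    {
        cudaEventDestroy(mStart);
        cudaEventDestroy(mStop);
    }
    void start() { cudaEventRecord(mStart, mStream); }
    void stop()
    {
        cudaEventRecord(mStop, mStream);
        cudaEventSynchronize(mStop);
    }
    float microseconds()
    {
        float ms = 0.f;
        cudaEventElapsedTime(&ms, mStart, mStop);  // Reported in milliseconds.
        return ms * 1000.f;
    }
};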