Optimize a Multi-Layer Perceptron (MLP) Based Recommender System with TensorRT

In this notebook we show a step-by-step procedure for using TensorRT to optimize a trained MLP recommender system and accelerate inference.

After training the model, you should have two new files:

  1. movielens_ratings.txt
  2. sampleMovieLens.pb

STEP-1: Import the TensorFlow Model into TensorRT

Now that you have the frozen graph, convert it to Universal Framework Format (UFF). TensorRT ships with a UFF toolkit that can be called from the command line.

  • 1. Converting the Trained Model to UFF
    convert-to-uff tensorflow --input-file sampleMovieLens.pb -o sampleMovieLens.uff -O prediction/Sigmoid

    After this, you should see sampleMovieLens.uff in the '/data' directory.

  • 2. Parse the UFF file into a TensorRT Network.
    #Create UFF parser.
    auto parser = nvuffparser::createUffParser();
    #Specify inputs and outputs of the network.
    parser->registerInput(config::kUSER_BLOB_NAME, inputDims, nvuffparser::UffInputOrder::kNCHW);
    parser->registerInput(config::kITEM_BLOB_NAME, inputDims, nvuffparser::UffInputOrder::kNCHW);
    #Create the builder.
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(config::gLogger);
    nvinfer1::INetworkDefinition* network = builder->createNetwork();
    #Parse the UFF model and populate the network. kDTYPE can be either kFLOAT (fp32) or kHALF (fp16).
    bool parsingSuccessful = parser->parse(config::UFF_MODEL_FILE.c_str(), *network, config::kDTYPE);

    STEP-2: Adding a TopK Layer to the Network

    By default, the TensorFlow model outputs several recommendations. Since we only want to keep the best recommendations, we can add a TopK layer to the network. Since TopK doesn’t modify the output values (only removes low ranking ones), it is perfectly safe to add this layer to an already trained model.

    #Retrieve last layer of UFF Network.
    auto uffLastLayer = network->getLayer(network->getNbLayers()-1);
    #Reshape output of fully connected layer from numOfMovies x 1 x 1 x 1 to 1 x numOfMovies x 1, so that the movie scores lie along axis 1 (the axis TopK reduces over below).
    auto reshapeLayer = network->addShuffle(*uffLastLayer->getOutput(0));
    reshapeLayer->setReshapeDimensions(nvinfer1::Dims3{1, config::kNUM_MOVIES_PER_USER, 1});
    #Apply TopK layer to retrieve item probabilities and corresponding index number. 
    auto topK = network->addTopK(*reshapeLayer->getOutput(0), nvinfer1::TopKOperation::kMAX, config::kTOPK_MOVIES, 0x2);
    #Specify topK tensors as outputs.
    #TopK indices are 32-bit integers.

    STEP-3: Build the TensorRT inference engine

    This single step automatically performs:
    (1) Tensor fusions
    (2) Reduced precision
    (3) Target autotuning
    (4) Tensor memory management

    Now that our network has a TopK layer, we can build a TensorRT engine.

    #Specify total number of users(batch size)
    #Workspace size refers to the amount of device memory available to the builder when building an engine.
    #Build the engine
    nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);
    #Since we’ve finished building the engine, we can clean up

    STEP-4: Serializing the Engine to a Shared Memory Buffer

    Since we need to share the engine with child processes, we now need to serialize it and save it to a shared memory buffer. Note that you can share the engine as you would share any other data across processes, e.g., by writing to and reading from disk, using sockets, or even using cloud services if you are so inclined.

    #Helper function that creates a shared buffer, then copies data to it.
    void createSharedBuffer(const void* data, size_t size, const char* bufferName)
            #Create a shared buffer for the serialized engine.
            int fd = shm_open(bufferName, O_RDWR | O_CREAT, 0666);
            if (fd <= 0) config::logThrowError("Could not create file descriptor: /dev/shm" + std::string{bufferName});
            fallocate(fd, 0, 0, size);
            void* sharedBuffer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            #Copy the data to the shared buffer.
            std::memcpy(sharedBuffer, data, size);
    #Serialize the engine for sharing then destroy it.
    nvinfer1::IHostMemory* serializedEngine = engine->serialize();
    #Copy to a new shared buffer.
    createSharedBuffer(serializedEngine->data(), serializedEngine->size(), config::kSHM_SHARED_ENGINE);
    #Serialized engine is now copied to shared memory, destroy the original buffer.

    STEP-5: Retrieving the Engine from Shared Memory

    # Helper function that opens a shared buffer and sets data/size accordingly.
    void loadSharedBuffer(void*& data, size_t& size, const char* bufferName)
            #Open a file descriptor for the shared buffer.
            int fd = shm_open(bufferName, O_RDONLY, 0666);
            if (fd <= 0) config::logThrowError("Could not open file descriptor: /dev/shm" + std::string{bufferName});
            #Get size of shared memory buffer.
            struct stat sb; fstat(fd, &sb);
            size = sb.st_size;
            if (size <= 0) config::logThrowError("Failed to fetch model stream from shared memory buffer.");
            #Retrieve the shared buffer and close the file descriptor.
            data = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
    #Get the shared buffer
    loadSharedBuffer(serializedEngineData, serializedEngineSize, config::kSHM_SHARED_ENGINE);

    Retrieve the engine from the serialized format:

    #Use a TensorRT IRuntime for engine deserialization.
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(config::gLogger);
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(serializedEngineData, serializedEngineSize, nullptr);

    STEP-6: Running Inference

    Running an engine involves three steps:

      1. Transferring inputs to device memory.
      2. Executing with context->enqueue.
      3. Transferring the outputs back to host memory.
    #Copy input from host to device.
    config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[userInputIndex], b.mHostMemory[userInputIndex], b.mMemSizes[userInputIndex], cudaMemcpyHostToDevice, b.mStream));
    config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[itemInputIndex], b.mHostMemory[itemInputIndex], b.mMemSizes[itemInputIndex], cudaMemcpyHostToDevice, b.mStream));
    #Do inference.
    b.mContext->enqueue(config::kNUM_USERS, b.mDeviceMemory, b.mStream, nullptr);
    #Copy output from device to host.
    config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputPredictionIndex], b.mDeviceMemory[outputPredictionIndex], b.mMemSizes[outputPredictionIndex], cudaMemcpyDeviceToHost, b.mStream));
    config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemProbIndex], b.mDeviceMemory[outputItemProbIndex], b.mMemSizes[outputItemProbIndex], cudaMemcpyDeviceToHost, b.mStream));
    config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemNameIndex], b.mDeviceMemory[outputItemNameIndex], b.mMemSizes[outputItemNameIndex], cudaMemcpyDeviceToHost, b.mStream));
    #Helper function that runs inference on a single batch and returns the inference time in microseconds.
    float timeBatchInference(Batch& b)
            #Get input binding indices.
            int userInputIndex = b.mEngine->getBindingIndex(config::kUSER_BLOB_NAME);
            int itemInputIndex = b.mEngine->getBindingIndex(config::kITEM_BLOB_NAME);
            #Get output binding indices.
            int outputPredictionIndex = b.mEngine->getBindingIndex(config::kUFF_OUTPUT_NODE);
            int outputItemProbIndex = b.mEngine->getBindingIndex(config::kTOPK_ITEM_PROB);
            int outputItemNameIndex = b.mEngine->getBindingIndex(config::kTOPK_ITEM_NAME);
            #Run and time inference.
            GPUTimer timer{b.mStream};
            #Copy input from host to device.
            config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[userInputIndex], b.mHostMemory[userInputIndex], b.mMemSizes[userInputIndex], cudaMemcpyHostToDevice, b.mStream));
            config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[itemInputIndex], b.mHostMemory[itemInputIndex], b.mMemSizes[itemInputIndex], cudaMemcpyHostToDevice, b.mStream));
            #Do inference.
            b.mContext->enqueue(config::kNUM_USERS, b.mDeviceMemory, b.mStream, nullptr);
            #Copy output from device to host.
            config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputPredictionIndex], b.mDeviceMemory[outputPredictionIndex], b.mMemSizes[outputPredictionIndex], cudaMemcpyDeviceToHost, b.mStream));
            config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemProbIndex], b.mDeviceMemory[outputItemProbIndex], b.mMemSizes[outputItemProbIndex], cudaMemcpyDeviceToHost, b.mStream));
            config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemNameIndex], b.mDeviceMemory[outputItemNameIndex], b.mMemSizes[outputItemNameIndex], cudaMemcpyDeviceToHost, b.mStream));
        #Done inference.
        return timer.microseconds();