Optimize a Multi-Layer Perceptron (MLP) Based Recommender System with TensorRT

In this notebook we walk through the step-by-step procedure for using TensorRT to optimize a trained MLP recommender system and accelerate inference.

After training the model, you should have two new files:

  1. movielens_ratings.txt
  2. sampleMovieLens.pb

STEP-1: Import the TensorFlow Model into TensorRT

Now that you have the frozen graph, convert it to Universal Framework Format (UFF). TensorRT ships with a UFF toolkit that can be called from the command line.

  1. Converting the Trained Model to UFF

    In [ ]:
    convert-to-uff tensorflow --input-file sampleMovieLens.pb -o sampleMovieLens.uff -O prediction/Sigmoid

    After this, you should see a sampleMovieLens.uff file in the '/data' directory.

  2. Parse the UFF File into a TensorRT Network

    In [ ]:
    // Create the UFF parser.
    auto parser = nvuffparser::createUffParser();

    // Specify the inputs and outputs of the network.
    parser->registerInput(config::kUSER_BLOB_NAME, inputDims, nvuffparser::UffInputOrder::kNCHW);
    parser->registerInput(config::kITEM_BLOB_NAME, inputDims, nvuffparser::UffInputOrder::kNCHW);
    parser->registerOutput(config::kUFF_OUTPUT_NODE);

    // Create the builder and an empty network definition.
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(config::gLogger);
    nvinfer1::INetworkDefinition* network = builder->createNetwork();

    // Parse the UFF model and populate the network. kDTYPE can be either kFLOAT (fp32) or kHALF (fp16).
    bool parsingSuccessful = parser->parse(config::UFF_MODEL_FILE.c_str(), *network, config::kDTYPE);
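
The cell above assumes an inputDims variable describing the per-user input shape (declared before the registerInput calls) and does not check whether parsing succeeded. A minimal sketch of both, where the input shape is an assumption rather than something taken from the sample:

    // Illustrative only: assume each input tensor carries one index per movie scored for a user.
    nvinfer1::Dims3 inputDims{config::kNUM_MOVIES_PER_USER, 1, 1};

    // Abort early if the parser could not populate the network.
    if (!parsingSuccessful)
        config::logThrowError("Failed to parse UFF file: " + config::UFF_MODEL_FILE);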
    

STEP-2: Adding a TopK Layer to the Network

By default, the TensorFlow model outputs a score for every candidate movie. Since we only want to keep the best recommendations, we add a TopK layer to the network. Because TopK does not modify the output values (it only discards the low-ranking ones), it is safe to add this layer to an already trained model.

    In [ ]:
    // Retrieve the last layer of the UFF network.
    auto uffLastLayer = network->getLayer(network->getNbLayers()-1);

    // Reshape the fully connected layer's output from numOfMovies x 1 x 1 x 1 to numOfMovies x 1 x 1.
    auto reshapeLayer = network->addShuffle(*uffLastLayer->getOutput(0));
    reshapeLayer->setReshapeDimensions(nvinfer1::Dims3{1, config::kNUM_MOVIES_PER_USER, 1});

    // Apply a TopK layer to retrieve the item probabilities and their corresponding indices.
    auto topK = network->addTopK(*reshapeLayer->getOutput(0), nvinfer1::TopKOperation::kMAX, config::kTOPK_MOVIES, 0x2);

    // Mark the TopK tensors as network outputs.
    network->markOutput(*topK->getOutput(0));
    network->markOutput(*topK->getOutput(1));

    // TopK indices are 32-bit integers.
    topK->getOutput(1)->setType(nvinfer1::DataType::kINT32);
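
STEP-6 below looks the TopK outputs up by name (config::kTOPK_ITEM_PROB and config::kTOPK_ITEM_NAME). If those names are not assigned elsewhere in the sample, they can be set directly on the TopK output tensors; a small sketch, assuming the two config constants are plain C strings:

    // Name the TopK outputs so they can later be found with getBindingIndex().
    topK->getOutput(0)->setName(config::kTOPK_ITEM_PROB);  // top-k probabilities
    topK->getOutput(1)->setName(config::kTOPK_ITEM_NAME);  // top-k item indices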
    

STEP-3: Build the TensorRT Inference Engine

This single step automatically performs:

  1. Layer and tensor fusion
  2. Reduced-precision optimization
  3. Kernel auto-tuning for the target GPU
  4. Dynamic tensor memory management

Now that our network has a TopK layer, we can build a TensorRT engine.

    In [ ]:
    // Specify the total number of users (the batch size).
    builder->setMaxBatchSize(config::kNUM_USERS);

    // The workspace size is the amount of device memory available to the builder while building the engine.
    builder->setMaxWorkspaceSize(1_GB);

    // Build the engine.
    nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);

    // Now that the engine is built, the network, builder, and parser can be cleaned up.
    network->destroy();
    builder->destroy();
    parser->destroy();
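
buildCudaEngine returns a null pointer if engine construction fails, so it is worth verifying the result right after the build call. A minimal sketch reusing the notebook's error helper:

    // Make sure the build succeeded before the engine is serialized in the next step.
    if (!engine)
        config::logThrowError("Failed to build the TensorRT engine.");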
    

STEP-4: Serializing the Engine to a Shared Memory Buffer

Since the engine must be shared with child processes, we serialize it and save it to a shared memory buffer. Note that you can share the engine as you would share any other data across processes, e.g., by writing it to and reading it from disk, by using sockets, or even by using a cloud service if you are so inclined.

    In [ ]:
    // Helper function that creates a shared buffer, then copies data to it.
    void createSharedBuffer(const void* data, size_t size, const char* bufferName)
    {
        // Create a shared memory object for the serialized engine.
        int fd = shm_open(bufferName, O_RDWR | O_CREAT, 0666);
        if (fd <= 0) config::logThrowError("Could not create file descriptor: /dev/shm" + std::string{bufferName});
        fallocate(fd, 0, 0, size);
        void* sharedBuffer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        close(fd);
        // Copy the data to the shared buffer.
        std::memcpy(sharedBuffer, data, size);
    }
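
Note that createSharedBuffer never removes the shared memory object it creates; the object persists under /dev/shm until it is explicitly unlinked. A hypothetical cleanup step for whichever process outlives the others, once the engine is no longer needed:

    // Illustrative cleanup: remove the named shared memory object once every process is done with it.
    shm_unlink(config::kSHM_SHARED_ENGINE);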
    
    In [ ]:
    // Serialize the engine for sharing, then destroy it.
    nvinfer1::IHostMemory* serializedEngine = engine->serialize();
    engine->destroy();

    // Copy the serialized engine to a new shared buffer.
    createSharedBuffer(serializedEngine->data(), serializedEngine->size(), config::kSHM_SHARED_ENGINE);

    // The serialized engine now lives in shared memory, so the original buffer can be destroyed.
    serializedEngine->destroy();
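
With the serialized engine sitting in shared memory, worker processes can now be launched; each one reopens the buffer (STEP-5) and runs inference (STEP-6) independently. A hypothetical sketch using fork(), where kNumChildren and runChildInference() are illustrative names, not part of the sample:

    #include <sys/wait.h>
    #include <unistd.h>

    // Launch a few worker processes; each deserializes the shared engine and runs inference.
    const int kNumChildren = 4;  // illustrative value
    for (int i = 0; i < kNumChildren; ++i)
    {
        if (fork() == 0)
        {
            runChildInference();  // hypothetical helper covering STEP-5 and STEP-6
            _exit(0);
        }
    }

    // The parent waits for all children to finish.
    while (wait(nullptr) > 0) {}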
    

STEP-5: Retrieving the Engine from Shared Memory

    In [ ]:
    // Helper function that opens a shared buffer and sets data/size accordingly.
    void loadSharedBuffer(void*& data, size_t& size, const char* bufferName)
    {
        // Open a file descriptor for the shared memory object.
        int fd = shm_open(bufferName, O_RDONLY, 0666);
        if (fd <= 0) config::logThrowError("Could not open file descriptor: /dev/shm" + std::string{bufferName});

        // Get the size of the shared memory buffer.
        struct stat sb; fstat(fd, &sb);
        size = sb.st_size;
        if (size <= 0) config::logThrowError("Failed to fetch model stream from shared memory buffer.");

        // Map the shared buffer and close the file descriptor.
        data = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
        close(fd);
    }
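
The call below fills in two out-parameters that are assumed to be declared beforehand, for example:

    // Assumed declarations for the pointer and size that loadSharedBuffer fills in.
    void* serializedEngineData{nullptr};
    size_t serializedEngineSize{0};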
    
    In [ ]:
    // Load the shared buffer containing the serialized engine.
    loadSharedBuffer(serializedEngineData, serializedEngineSize, config::kSHM_SHARED_ENGINE);
    

Retrieve the engine from the serialized format:

    In [ ]:
    // Use a TensorRT IRuntime to deserialize the engine.
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(config::gLogger);
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(serializedEngineData, serializedEngineSize, nullptr);
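
Before running inference, the deserialized engine needs an execution context, a CUDA stream, and host/device buffers for each binding. The Batch struct used in STEP-6 is assumed to bundle these; a minimal setup sketch under that assumption (field names and buffer sizing are illustrative):

    // Illustrative setup for the hypothetical Batch struct used in STEP-6.
    Batch b;
    b.mEngine = engine;
    b.mContext = engine->createExecutionContext();
    config::checkCUDA(cudaStreamCreate(&b.mStream));

    // Allocate one host and one device buffer per engine binding.
    for (int i = 0; i < engine->getNbBindings(); ++i)
    {
        nvinfer1::Dims dims = engine->getBindingDimensions(i);
        size_t volume = config::kNUM_USERS;  // implicit batch dimension
        for (int d = 0; d < dims.nbDims; ++d)
            volume *= dims.d[d];
        size_t elementSize = (engine->getBindingDataType(i) == nvinfer1::DataType::kINT32)
                ? sizeof(int32_t) : sizeof(float);
        b.mMemSizes[i] = volume * elementSize;
        b.mHostMemory[i] = malloc(b.mMemSizes[i]);
        config::checkCUDA(cudaMalloc(&b.mDeviceMemory[i], b.mMemSizes[i]));
    }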
    

STEP-6: Running Inference

Running the engine involves three steps:

  1. Transferring the inputs to device memory.
  2. Executing the network with context->enqueue.
  3. Transferring the outputs back to host memory.

    In [ ]:
    // Copy the inputs from host to device.
    config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[userInputIndex], b.mHostMemory[userInputIndex], b.mMemSizes[userInputIndex], cudaMemcpyHostToDevice, b.mStream));
    config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[itemInputIndex], b.mHostMemory[itemInputIndex], b.mMemSizes[itemInputIndex], cudaMemcpyHostToDevice, b.mStream));

    // Run inference.
    b.mContext->enqueue(config::kNUM_USERS, b.mDeviceMemory, b.mStream, nullptr);

    // Copy the outputs from device to host.
    config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputPredictionIndex], b.mDeviceMemory[outputPredictionIndex], b.mMemSizes[outputPredictionIndex], cudaMemcpyDeviceToHost, b.mStream));
    config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemProbIndex], b.mDeviceMemory[outputItemProbIndex], b.mMemSizes[outputItemProbIndex], cudaMemcpyDeviceToHost, b.mStream));
    config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemNameIndex], b.mDeviceMemory[outputItemNameIndex], b.mMemSizes[outputItemNameIndex], cudaMemcpyDeviceToHost, b.mStream));
    cudaStreamSynchronize(b.mStream);
    
    In [ ]:
    // Helper function that runs inference on a single batch and returns the inference time in microseconds.
    float timeBatchInference(Batch& b)
    {
        // Get the input binding indices.
        int userInputIndex = b.mEngine->getBindingIndex(config::kUSER_BLOB_NAME);
        int itemInputIndex = b.mEngine->getBindingIndex(config::kITEM_BLOB_NAME);
        // Get the output binding indices.
        int outputPredictionIndex = b.mEngine->getBindingIndex(config::kUFF_OUTPUT_NODE);
        int outputItemProbIndex = b.mEngine->getBindingIndex(config::kTOPK_ITEM_PROB);
        int outputItemNameIndex = b.mEngine->getBindingIndex(config::kTOPK_ITEM_NAME);

        // Run and time inference.
        GPUTimer timer{b.mStream};
        timer.start();
        {
            // Copy the inputs from host to device.
            config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[userInputIndex], b.mHostMemory[userInputIndex], b.mMemSizes[userInputIndex], cudaMemcpyHostToDevice, b.mStream));
            config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[itemInputIndex], b.mHostMemory[itemInputIndex], b.mMemSizes[itemInputIndex], cudaMemcpyHostToDevice, b.mStream));

            // Run inference.
            b.mContext->enqueue(config::kNUM_USERS, b.mDeviceMemory, b.mStream, nullptr);

            // Copy the outputs from device to host.
            config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputPredictionIndex], b.mDeviceMemory[outputPredictionIndex], b.mMemSizes[outputPredictionIndex], cudaMemcpyDeviceToHost, b.mStream));
            config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemProbIndex], b.mDeviceMemory[outputItemProbIndex], b.mMemSizes[outputItemProbIndex], cudaMemcpyDeviceToHost, b.mStream));
            config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputItemNameIndex], b.mDeviceMemory[outputItemNameIndex], b.mMemSizes[outputItemNameIndex], cudaMemcpyDeviceToHost, b.mStream));
            cudaStreamSynchronize(b.mStream);
        }
        // Inference is done; stop the timer and report the elapsed time.
        timer.stop();
        return timer.microseconds();
    }
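
One possible way to use the helper, assuming a fully populated Batch (as sketched at the end of STEP-5) and <iostream> available:

    // Time a single batch covering all users and report the latency.
    float micros = timeBatchInference(b);
    std::cout << "Inference for " << config::kNUM_USERS << " users took "
              << micros / 1000.0f << " ms" << std::endl;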