// buffers.h — TensorRT sample common code: host/device buffer management utilities.
/*
|
|
* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
#ifndef TENSORRT_BUFFERS_H
|
|
#define TENSORRT_BUFFERS_H
|
|
|
|
#include "NvInfer.h"
#include "common.h"
#include "half.h"

#include <cassert>
#include <cuda_runtime_api.h>
#include <iostream>
#include <iterator>
#include <memory>
#include <new>
#include <numeric>
#include <string>
#include <vector>
|
|
|
|
namespace samplesCommon
|
|
{
|
|
|
|
//!
|
|
//! \brief The GenericBuffer class is a templated class for buffers.
|
|
//!
|
|
//! \details This templated RAII (Resource Acquisition Is Initialization) class handles the allocation,
|
|
//! deallocation, querying of buffers on both the device and the host.
|
|
//! It can handle data of arbitrary types because it stores byte buffers.
|
|
//! The template parameters AllocFunc and FreeFunc are used for the
|
|
//! allocation and deallocation of the buffer.
|
|
//! AllocFunc must be a functor that takes in (void** ptr, size_t size)
|
|
//! and returns bool. ptr is a pointer to where the allocated buffer address should be stored.
|
|
//! size is the amount of memory in bytes to allocate.
|
|
//! The boolean indicates whether or not the memory allocation was successful.
|
|
//! FreeFunc must be a functor that takes in (void* ptr) and returns void.
|
|
//! ptr is the allocated buffer address. It must work with nullptr input.
|
|
//!
|
|
template <typename AllocFunc, typename FreeFunc>
|
|
class GenericBuffer
|
|
{
|
|
public:
|
|
//!
|
|
//! \brief Construct an empty buffer.
|
|
//!
|
|
GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT)
|
|
: mSize(0)
|
|
, mCapacity(0)
|
|
, mType(type)
|
|
, mBuffer(nullptr)
|
|
{
|
|
}
|
|
|
|
//!
|
|
//! \brief Construct a buffer with the specified allocation size in bytes.
|
|
//!
|
|
GenericBuffer(size_t size, nvinfer1::DataType type)
|
|
: mSize(size)
|
|
, mCapacity(size)
|
|
, mType(type)
|
|
{
|
|
if (!allocFn(&mBuffer, this->nbBytes()))
|
|
{
|
|
throw std::bad_alloc();
|
|
}
|
|
}
|
|
|
|
GenericBuffer(GenericBuffer&& buf)
|
|
: mSize(buf.mSize)
|
|
, mCapacity(buf.mCapacity)
|
|
, mType(buf.mType)
|
|
, mBuffer(buf.mBuffer)
|
|
{
|
|
buf.mSize = 0;
|
|
buf.mCapacity = 0;
|
|
buf.mType = nvinfer1::DataType::kFLOAT;
|
|
buf.mBuffer = nullptr;
|
|
}
|
|
|
|
GenericBuffer& operator=(GenericBuffer&& buf)
|
|
{
|
|
if (this != &buf)
|
|
{
|
|
freeFn(mBuffer);
|
|
mSize = buf.mSize;
|
|
mCapacity = buf.mCapacity;
|
|
mType = buf.mType;
|
|
mBuffer = buf.mBuffer;
|
|
// Reset buf.
|
|
buf.mSize = 0;
|
|
buf.mCapacity = 0;
|
|
buf.mBuffer = nullptr;
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
//!
|
|
//! \brief Returns pointer to underlying array.
|
|
//!
|
|
void* data()
|
|
{
|
|
return mBuffer;
|
|
}
|
|
|
|
//!
|
|
//! \brief Returns pointer to underlying array.
|
|
//!
|
|
const void* data() const
|
|
{
|
|
return mBuffer;
|
|
}
|
|
|
|
//!
|
|
//! \brief Returns the size (in number of elements) of the buffer.
|
|
//!
|
|
size_t size() const
|
|
{
|
|
return mSize;
|
|
}
|
|
|
|
//!
|
|
//! \brief Returns the size (in bytes) of the buffer.
|
|
//!
|
|
size_t nbBytes() const
|
|
{
|
|
return this->size() * samplesCommon::getElementSize(mType);
|
|
}
|
|
|
|
//!
|
|
//! \brief Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
|
|
//!
|
|
void resize(size_t newSize)
|
|
{
|
|
mSize = newSize;
|
|
if (mCapacity < newSize)
|
|
{
|
|
freeFn(mBuffer);
|
|
if (!allocFn(&mBuffer, this->nbBytes()))
|
|
{
|
|
throw std::bad_alloc{};
|
|
}
|
|
mCapacity = newSize;
|
|
}
|
|
}
|
|
|
|
//!
|
|
//! \brief Overload of resize that accepts Dims
|
|
//!
|
|
void resize(const nvinfer1::Dims& dims)
|
|
{
|
|
return this->resize(samplesCommon::volume(dims));
|
|
}
|
|
|
|
~GenericBuffer()
|
|
{
|
|
freeFn(mBuffer);
|
|
}
|
|
|
|
private:
|
|
size_t mSize{0}, mCapacity{0};
|
|
nvinfer1::DataType mType;
|
|
void* mBuffer;
|
|
AllocFunc allocFn;
|
|
FreeFunc freeFn;
|
|
};
|
|
|
|
class DeviceAllocator
|
|
{
|
|
public:
|
|
bool operator()(void** ptr, size_t size) const
|
|
{
|
|
return cudaMalloc(ptr, size) == cudaSuccess;
|
|
}
|
|
};
|
|
|
|
class DeviceFree
|
|
{
|
|
public:
|
|
void operator()(void* ptr) const
|
|
{
|
|
cudaFree(ptr);
|
|
}
|
|
};
|
|
|
|
//! Allocation functor for pageable host memory; satisfies GenericBuffer's AllocFunc contract.
class HostAllocator
{
public:
    //! Allocate \p size bytes with malloc and store the address in *ptr.
    //! \return true when the allocation succeeded.
    bool operator()(void** ptr, size_t size) const
    {
        void* const mem = malloc(size);
        *ptr = mem;
        return mem != nullptr;
    }
};
|
|
|
|
//! Deallocation functor for pageable host memory; satisfies GenericBuffer's FreeFunc contract.
class HostFree
{
public:
    //! Release memory previously obtained from malloc (nullptr is a safe no-op).
    void operator()(void* ptr) const
    {
        free(ptr);
    }
};
|
|
|
|
// Convenience aliases: GenericBuffer specialized for device memory
// (cudaMalloc/cudaFree) and for pageable host memory (malloc/free).
using DeviceBuffer = GenericBuffer<DeviceAllocator, DeviceFree>;
using HostBuffer = GenericBuffer<HostAllocator, HostFree>;
|
|
|
|
//!
|
|
//! \brief The ManagedBuffer class groups together a pair of corresponding device and host buffers.
|
|
//!
|
|
class ManagedBuffer
{
public:
    DeviceBuffer deviceBuffer; //!< Buffer in device (GPU) memory.
    HostBuffer hostBuffer;     //!< Corresponding staging buffer in host (CPU) memory.
};
|
|
|
|
//!
|
|
//! \brief The BufferManager class handles host and device buffer allocation and deallocation.
|
|
//!
|
|
//! \details This RAII class handles host and device buffer allocation and deallocation,
|
|
//! memcpy between host and device buffers to aid with inference,
|
|
//! and debugging dumps to validate inference. The BufferManager class is meant to be
|
|
//! used to simplify buffer management and any interactions between buffers and the engine.
|
|
//!
|
|
class BufferManager
|
|
{
|
|
public:
|
|
static const size_t kINVALID_SIZE_VALUE = ~size_t(0);
|
|
|
|
//!
|
|
//! \brief Create a BufferManager for handling buffer interactions with engine.
|
|
//!
|
|
BufferManager(std::shared_ptr<nvinfer1::ICudaEngine> engine, const int batchSize = 0,
|
|
const nvinfer1::IExecutionContext* context = nullptr)
|
|
: mEngine(engine)
|
|
, mBatchSize(batchSize)
|
|
{
|
|
// Full Dims implies no batch size.
|
|
assert(engine->hasImplicitBatchDimension() || mBatchSize == 0);
|
|
// Create host and device buffers
|
|
for (int i = 0; i < mEngine->getNbBindings(); i++)
|
|
{
|
|
auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i);
|
|
size_t vol = context || !mBatchSize ? 1 : static_cast<size_t>(mBatchSize);
|
|
nvinfer1::DataType type = mEngine->getBindingDataType(i);
|
|
int vecDim = mEngine->getBindingVectorizedDim(i);
|
|
if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector
|
|
{
|
|
int scalarsPerVec = mEngine->getBindingComponentsPerElement(i);
|
|
dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec);
|
|
vol *= scalarsPerVec;
|
|
}
|
|
vol *= samplesCommon::volume(dims);
|
|
std::unique_ptr<ManagedBuffer> manBuf{new ManagedBuffer()};
|
|
manBuf->deviceBuffer = DeviceBuffer(vol, type);
|
|
manBuf->hostBuffer = HostBuffer(vol, type);
|
|
mDeviceBindings.emplace_back(manBuf->deviceBuffer.data());
|
|
mManagedBuffers.emplace_back(std::move(manBuf));
|
|
}
|
|
}
|
|
|
|
//!
|
|
//! \brief Returns a vector of device buffers that you can use directly as
|
|
//! bindings for the execute and enqueue methods of IExecutionContext.
|
|
//!
|
|
std::vector<void*>& getDeviceBindings()
|
|
{
|
|
return mDeviceBindings;
|
|
}
|
|
|
|
//!
|
|
//! \brief Returns a vector of device buffers.
|
|
//!
|
|
const std::vector<void*>& getDeviceBindings() const
|
|
{
|
|
return mDeviceBindings;
|
|
}
|
|
|
|
//!
|
|
//! \brief Returns the device buffer corresponding to tensorName.
|
|
//! Returns nullptr if no such tensor can be found.
|
|
//!
|
|
void* getDeviceBuffer(const std::string& tensorName) const
|
|
{
|
|
return getBuffer(false, tensorName);
|
|
}
|
|
|
|
//!
|
|
//! \brief Returns the host buffer corresponding to tensorName.
|
|
//! Returns nullptr if no such tensor can be found.
|
|
//!
|
|
void* getHostBuffer(const std::string& tensorName) const
|
|
{
|
|
return getBuffer(true, tensorName);
|
|
}
|
|
|
|
//!
|
|
//! \brief Returns the size of the host and device buffers that correspond to tensorName.
|
|
//! Returns kINVALID_SIZE_VALUE if no such tensor can be found.
|
|
//!
|
|
size_t size(const std::string& tensorName) const
|
|
{
|
|
int index = mEngine->getBindingIndex(tensorName.c_str());
|
|
if (index == -1)
|
|
return kINVALID_SIZE_VALUE;
|
|
return mManagedBuffers[index]->hostBuffer.nbBytes();
|
|
}
|
|
|
|
//!
|
|
//! \brief Templated print function that dumps buffers of arbitrary type to std::ostream.
|
|
//! rowCount parameter controls how many elements are on each line.
|
|
//! A rowCount of 1 means that there is only 1 element on each line.
|
|
//!
|
|
template <typename T>
|
|
void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount)
|
|
{
|
|
assert(rowCount != 0);
|
|
assert(bufSize % sizeof(T) == 0);
|
|
T* typedBuf = static_cast<T*>(buf);
|
|
size_t numItems = bufSize / sizeof(T);
|
|
for (int i = 0; i < static_cast<int>(numItems); i++)
|
|
{
|
|
// Handle rowCount == 1 case
|
|
if (rowCount == 1 && i != static_cast<int>(numItems) - 1)
|
|
os << typedBuf[i] << std::endl;
|
|
else if (rowCount == 1)
|
|
os << typedBuf[i];
|
|
// Handle rowCount > 1 case
|
|
else if (i % rowCount == 0)
|
|
os << typedBuf[i];
|
|
else if (i % rowCount == rowCount - 1)
|
|
os << " " << typedBuf[i] << std::endl;
|
|
else
|
|
os << " " << typedBuf[i];
|
|
}
|
|
}
|
|
|
|
//!
|
|
//! \brief Copy the contents of input host buffers to input device buffers synchronously.
|
|
//!
|
|
void copyInputToDevice()
|
|
{
|
|
memcpyBuffers(true, false, false);
|
|
}
|
|
|
|
//!
|
|
//! \brief Copy the contents of output device buffers to output host buffers synchronously.
|
|
//!
|
|
void copyOutputToHost()
|
|
{
|
|
memcpyBuffers(false, true, false);
|
|
}
|
|
|
|
//!
|
|
//! \brief Copy the contents of input host buffers to input device buffers asynchronously.
|
|
//!
|
|
void copyInputToDeviceAsync(const cudaStream_t& stream = 0)
|
|
{
|
|
memcpyBuffers(true, false, true, stream);
|
|
}
|
|
|
|
//!
|
|
//! \brief Copy the contents of output device buffers to output host buffers asynchronously.
|
|
//!
|
|
void copyOutputToHostAsync(const cudaStream_t& stream = 0)
|
|
{
|
|
memcpyBuffers(false, true, true, stream);
|
|
}
|
|
|
|
~BufferManager() = default;
|
|
|
|
private:
|
|
void* getBuffer(const bool isHost, const std::string& tensorName) const
|
|
{
|
|
int index = mEngine->getBindingIndex(tensorName.c_str());
|
|
if (index == -1)
|
|
return nullptr;
|
|
return (isHost ? mManagedBuffers[index]->hostBuffer.data() : mManagedBuffers[index]->deviceBuffer.data());
|
|
}
|
|
|
|
void memcpyBuffers(const bool copyInput, const bool deviceToHost, const bool async, const cudaStream_t& stream = 0)
|
|
{
|
|
for (int i = 0; i < mEngine->getNbBindings(); i++)
|
|
{
|
|
void* dstPtr
|
|
= deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data();
|
|
const void* srcPtr
|
|
= deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data();
|
|
const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes();
|
|
const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice;
|
|
if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i)))
|
|
{
|
|
if (async)
|
|
CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream));
|
|
else
|
|
CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType));
|
|
}
|
|
}
|
|
}
|
|
|
|
std::shared_ptr<nvinfer1::ICudaEngine> mEngine; //!< The pointer to the engine
|
|
int mBatchSize; //!< The batch size for legacy networks, 0 otherwise.
|
|
std::vector<std::unique_ptr<ManagedBuffer>> mManagedBuffers; //!< The vector of pointers to managed buffers
|
|
std::vector<void*> mDeviceBindings; //!< The vector of device buffers needed for engine execution
|
|
};
|
|
|
|
} // namespace samplesCommon
|
|
|
|
#endif // TENSORRT_BUFFERS_H
|