/* * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TRT_SAMPLE_DEVICE_H #define TRT_SAMPLE_DEVICE_H #include #include #include #include #include #include "sampleUtils.h" namespace sample { inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr) { if (ret != cudaSuccess) { err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl; abort(); } } class TrtCudaEvent; namespace { void cudaSleep(void* sleep) { std::this_thread::sleep_for(std::chrono::duration(*static_cast(sleep))); } } // namespace //! //! \class TrtCudaStream //! \brief Managed CUDA stream //! class TrtCudaStream { public: TrtCudaStream() { cudaCheck(cudaStreamCreate(&mStream)); } TrtCudaStream(const TrtCudaStream&) = delete; TrtCudaStream& operator=(const TrtCudaStream&) = delete; TrtCudaStream(TrtCudaStream&&) = delete; TrtCudaStream& operator=(TrtCudaStream&&) = delete; ~TrtCudaStream() { cudaCheck(cudaStreamDestroy(mStream)); } cudaStream_t get() const { return mStream; } void synchronize() { cudaCheck(cudaStreamSynchronize(mStream)); } void wait(TrtCudaEvent& event); void sleep(float* ms) { cudaCheck(cudaLaunchHostFunc(mStream, cudaSleep, ms)); } private: cudaStream_t mStream{}; }; //! //! \class TrtCudaEvent //! \brief Managed CUDA event //! class TrtCudaEvent { public: explicit TrtCudaEvent(bool blocking = true) { const uint32_t flags = blocking ? cudaEventBlockingSync : cudaEventDefault; cudaCheck(cudaEventCreateWithFlags(&mEvent, flags)); } TrtCudaEvent(const TrtCudaEvent&) = delete; TrtCudaEvent& operator=(const TrtCudaEvent&) = delete; TrtCudaEvent(TrtCudaEvent&&) = delete; TrtCudaEvent& operator=(TrtCudaEvent&&) = delete; ~TrtCudaEvent() { cudaCheck(cudaEventDestroy(mEvent)); } cudaEvent_t get() const { return mEvent; } void record(const TrtCudaStream& stream) { cudaCheck(cudaEventRecord(mEvent, stream.get())); } void synchronize() { cudaCheck(cudaEventSynchronize(mEvent)); } // Returns time elapsed time in milliseconds float operator-(const TrtCudaEvent& e) const { float time{0}; cudaCheck(cudaEventElapsedTime(&time, e.get(), get())); return time; } private: cudaEvent_t mEvent{}; }; inline void TrtCudaStream::wait(TrtCudaEvent& event) { cudaCheck(cudaStreamWaitEvent(mStream, event.get(), 0)); } //! //! \class TrtCudaGraph //! \brief Managed CUDA graph //! class TrtCudaGraph { public: explicit TrtCudaGraph() = default; TrtCudaGraph(const TrtCudaGraph&) = delete; TrtCudaGraph& operator=(const TrtCudaGraph&) = delete; TrtCudaGraph(TrtCudaGraph&&) = delete; TrtCudaGraph& operator=(TrtCudaGraph&&) = delete; ~TrtCudaGraph() { if (mGraphExec) { cudaGraphExecDestroy(mGraphExec); } } void beginCapture(TrtCudaStream& stream) { cudaCheck(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal)); } bool launch(TrtCudaStream& stream) { return cudaGraphLaunch(mGraphExec, stream.get()) == cudaSuccess; } void endCapture(TrtCudaStream& stream) { cudaCheck(cudaStreamEndCapture(stream.get(), &mGraph)); cudaCheck(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0)); cudaCheck(cudaGraphDestroy(mGraph)); } void endCaptureOnError(TrtCudaStream& stream) { // There are two possibilities why stream capture would fail: // (1) stream is in cudaErrorStreamCaptureInvalidated state. // (2) TRT reports a failure. // In case (1), the returning mGraph should be nullptr. // In case (2), the returning mGraph is not nullptr, but it should not be used. const auto ret = cudaStreamEndCapture(stream.get(), &mGraph); if (ret == cudaErrorStreamCaptureInvalidated) { assert(mGraph == nullptr); } else { assert(ret == cudaSuccess); assert(mGraph != nullptr); cudaCheck(cudaGraphDestroy(mGraph)); mGraph = nullptr; } // Clean up any CUDA error. cudaGetLastError(); sample::gLogWarning << "The CUDA graph capture on the stream has failed." << std::endl; } private: cudaGraph_t mGraph{}; cudaGraphExec_t mGraphExec{}; }; //! //! \class TrtCudaBuffer //! \brief Managed buffer for host and device //! template class TrtCudaBuffer { public: TrtCudaBuffer() = default; TrtCudaBuffer(const TrtCudaBuffer&) = delete; TrtCudaBuffer& operator=(const TrtCudaBuffer&) = delete; TrtCudaBuffer(TrtCudaBuffer&& rhs) { reset(rhs.mPtr); rhs.mPtr = nullptr; } TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs) { if (this != &rhs) { reset(rhs.mPtr); rhs.mPtr = nullptr; } return *this; } ~TrtCudaBuffer() { reset(); } TrtCudaBuffer(size_t size) { A()(&mPtr, size); } void allocate(size_t size) { reset(); A()(&mPtr, size); } void reset(void* ptr = nullptr) { if (mPtr) { D()(mPtr); } mPtr = ptr; } void* get() const { return mPtr; } private: void* mPtr{nullptr}; }; struct DeviceAllocator { void operator()(void** ptr, size_t size) { cudaCheck(cudaMalloc(ptr, size)); } }; struct DeviceDeallocator { void operator()(void* ptr) { cudaCheck(cudaFree(ptr)); } }; struct ManagedAllocator { void operator()(void** ptr, size_t size) { cudaCheck(cudaMallocManaged(ptr, size)); } }; struct HostAllocator { void operator()(void** ptr, size_t size) { cudaCheck(cudaMallocHost(ptr, size)); } }; struct HostDeallocator { void operator()(void* ptr) { cudaCheck(cudaFreeHost(ptr)); } }; using TrtDeviceBuffer = TrtCudaBuffer; using TrtManagedBuffer = TrtCudaBuffer; using TrtHostBuffer = TrtCudaBuffer; //! //! \class MirroredBuffer //! \brief Coupled host and device buffers //! class IMirroredBuffer { public: //! //! Allocate memory for the mirrored buffer give the size //! of the allocation. //! virtual void allocate(size_t size) = 0; //! //! Get the pointer to the device side buffer. //! //! \return pointer to device memory or nullptr if uninitialized. //! virtual void* getDeviceBuffer() const = 0; //! //! Get the pointer to the host side buffer. //! //! \return pointer to host memory or nullptr if uninitialized. //! virtual void* getHostBuffer() const = 0; //! //! Copy the memory from host to device. //! virtual void hostToDevice(TrtCudaStream& stream) = 0; //! //! Copy the memory from device to host. //! virtual void deviceToHost(TrtCudaStream& stream) = 0; //! //! Interface to get the size of the memory //! //! \return the size of memory allocated. //! virtual size_t getSize() const = 0; //! //! Virtual destructor declaraion //! virtual ~IMirroredBuffer() = default; }; // class IMirroredBuffer //! //! Class to have a separate memory buffer for discrete device and host allocations. //! class DiscreteMirroredBuffer : public IMirroredBuffer { public: void allocate(size_t size) override { mSize = size; mHostBuffer.allocate(size); mDeviceBuffer.allocate(size); } void* getDeviceBuffer() const override { return mDeviceBuffer.get(); } void* getHostBuffer() const override { return mHostBuffer.get(); } void hostToDevice(TrtCudaStream& stream) override { cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get())); } void deviceToHost(TrtCudaStream& stream) override { cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get())); } size_t getSize() const override { return mSize; } private: size_t mSize{0}; TrtHostBuffer mHostBuffer; TrtDeviceBuffer mDeviceBuffer; }; // class DiscreteMirroredBuffer //! //! Class to have a unified memory buffer for embedded devices. //! class UnifiedMirroredBuffer : public IMirroredBuffer { public: void allocate(size_t size) override { mSize = size; mBuffer.allocate(size); } void* getDeviceBuffer() const override { return mBuffer.get(); } void* getHostBuffer() const override { return mBuffer.get(); } void hostToDevice(TrtCudaStream& stream) override { // Does nothing since we are using unified memory. } void deviceToHost(TrtCudaStream& stream) override { // Does nothing since we are using unified memory. } size_t getSize() const override { return mSize; } private: size_t mSize{0}; TrtManagedBuffer mBuffer; }; // class UnifiedMirroredBuffer //! //! Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is //! not possible. //! class OutputAllocator : public nvinfer1::IOutputAllocator { public: OutputAllocator(IMirroredBuffer* buffer) : mBuffer(buffer) { } void* reallocateOutput( char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept override { // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr // even for empty tensors, so allocate a dummy byte. size = std::max(size, static_cast(1)); if (size > mSize) { mBuffer->allocate(roundUp(size, alignment)); mSize = size; } return mBuffer->getDeviceBuffer(); } void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override {} IMirroredBuffer* getBuffer() { return mBuffer.get(); } ~OutputAllocator() override {} private: std::unique_ptr mBuffer; uint64_t mSize{}; }; inline void setCudaDevice(int device, std::ostream& os) { cudaCheck(cudaSetDevice(device)); cudaDeviceProp properties; cudaCheck(cudaGetDeviceProperties(&properties, device)); // clang-format off os << "=== Device Information ===" << std::endl; os << "Selected Device: " << properties.name << std::endl; os << "Compute Capability: " << properties.major << "." << properties.minor << std::endl; os << "SMs: " << properties.multiProcessorCount << std::endl; os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl; os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl; os << "Application Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; os << "Application Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; os << std::endl; os << "Note: The application clock rates do not reflect the actual clock rates that the GPU is " << "currently running at." << std::endl; // clang-format on } inline int32_t getCudaDriverVersion() { int32_t version{-1}; cudaCheck(cudaDriverGetVersion(&version)); return version; } inline int32_t getCudaRuntimeVersion() { int32_t version{-1}; cudaCheck(cudaRuntimeGetVersion(&version)); return version; } } // namespace sample #endif // TRT_SAMPLE_DEVICE_H