/*
 * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "sampleUtils.h"
#include "half.h"

using namespace nvinfer1;

namespace sample
{

size_t dataTypeSize(nvinfer1::DataType dataType)
{
    switch (dataType)
    {
    case nvinfer1::DataType::kINT32:
    case nvinfer1::DataType::kFLOAT: return 4U;
    case nvinfer1::DataType::kHALF: return 2U;
    case nvinfer1::DataType::kBOOL:
    case nvinfer1::DataType::kUINT8:
    case nvinfer1::DataType::kINT8:
    case nvinfer1::DataType::kFP8: return 1U;
    }
    return 0;
}
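
// Usage sketch: dataTypeSize(nvinfer1::DataType::kHALF) returns 2, so a tensor of
// volume(dims) half elements needs 2 * volume(dims) bytes of storage.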

int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch)
{
    int32_t maxNbElems = 1;
    for (int32_t i = 0; i < dims.nbDims; ++i)
    {
        // Get effective length of axis.
        int32_t d = dims.d[i];
        // If any dimension is 0, the tensor is empty.
        if (d == 0)
        {
            return 0;
        }
        if (i == vecDim)
        {
            d = samplesCommon::divUp(d, comps);
        }
        maxNbElems = std::max(maxNbElems, d * strides.d[i]);
    }
    return static_cast<int64_t>(maxNbElems) * batch * (vecDim < 0 ? 1 : comps);
}
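
// Worked example (illustrative numbers): for dims = {2, 3, 8}, strides = {24, 8, 1},
// vecDim = 2, comps = 4, and batch = 1, the vectorized axis shrinks to divUp(8, 4) = 2,
// maxNbElems = max(2 * 24, 3 * 8, 2 * 1) = 48, and the result is 48 * 1 * 4 = 192 elements.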

nvinfer1::Dims toDims(std::vector<int32_t> const& vec)
{
    int32_t limit = static_cast<int32_t>(nvinfer1::Dims::MAX_DIMS);
    if (static_cast<int32_t>(vec.size()) > limit)
    {
        sample::gLogWarning << "Vector too long, only first " << limit << " elements are used in dimension."
                            << std::endl;
    }
    // Pick first nvinfer1::Dims::MAX_DIMS elements
    nvinfer1::Dims dims{std::min(static_cast<int32_t>(vec.size()), limit), {}};
    std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d));
    return dims;
}
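
// Usage sketch: toDims({1, 3, 224, 224}) yields Dims{4, {1, 3, 224, 224}}; vectors longer
// than nvinfer1::Dims::MAX_DIMS are truncated with a warning, as handled above.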

void loadFromFile(std::string const& fileName, char* dst, size_t size)
{
    ASSERT(dst);

    std::ifstream file(fileName, std::ios::in | std::ios::binary);
    if (file.is_open())
    {
        file.read(dst, size);
        size_t const nbBytesRead = file.gcount();
        file.close();
        if (nbBytesRead != size)
        {
            std::ostringstream msg;
            msg << "Unexpected file size for input file: " << fileName << ". Expected: " << size
                << " bytes but only read: " << nbBytesRead << " bytes";
            throw std::invalid_argument(msg.str());
        }
    }
    else
    {
        std::ostringstream msg;
        msg << "Cannot open file " << fileName << "!";
        throw std::invalid_argument(msg.str());
    }
}
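
// Usage sketch (hypothetical file name): read exactly `size` bytes into a preallocated buffer.
//     std::vector<char> data(expectedBytes);
//     loadFromFile("weights.bin", data.data(), data.size()); // throws std::invalid_argument on mismatch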

std::vector<std::string> splitToStringVec(std::string const& s, char separator)
{
    std::vector<std::string> splitted;

    for (size_t start = 0; start < s.length();)
    {
        size_t separatorIndex = s.find(separator, start);
        if (separatorIndex == std::string::npos)
        {
            separatorIndex = s.length();
        }
        splitted.emplace_back(s.substr(start, separatorIndex - start));
        start = separatorIndex + 1;
    }

    return splitted;
}
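
// Usage sketch: splitToStringVec("fp16:chw,int8:chw", ',') returns {"fp16:chw", "int8:chw"}.
// A trailing separator produces no empty final element because the scan stops at s.length().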

bool broadcastIOFormats(std::vector<IOFormat> const& formats, size_t nbBindings, bool isInput /*= true*/)
{
    bool broadcast = formats.size() == 1;
    bool validFormatsCount = broadcast || (formats.size() == nbBindings);
    if (!formats.empty() && !validFormatsCount)
    {
        if (isInput)
        {
            throw std::invalid_argument(
                "The number of inputIOFormats must match network's inputs or be one for broadcasting.");
        }

        throw std::invalid_argument(
            "The number of outputIOFormats must match network's outputs or be one for broadcasting.");
    }
    return broadcast;
}
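
// Usage sketch (hypothetical counts): with one IOFormat and nbBindings == 3 this returns true,
// and the caller applies that single format to all three bindings; with three formats it
// returns false and each binding consumes its own entry.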

void sparsifyMatMulKernelWeights(nvinfer1::INetworkDefinition& network, std::vector<std::vector<int8_t>>& sparseWeights)
{
    using TensorToLayer = std::unordered_map<nvinfer1::ITensor*, nvinfer1::ILayer*>;
    using LayerToTensor = std::unordered_map<nvinfer1::ILayer*, nvinfer1::ITensor*>;

    // 1. Collect layer and tensor information from the network.
    TensorToLayer matmulI2L;
    TensorToLayer constO2L;
    TensorToLayer shuffleI2L;
    LayerToTensor shuffleL2O;
    auto collectMappingInfo = [&](int32_t const idx)
    {
        ILayer* l = network.getLayer(idx);
        switch (l->getType())
        {
        case nvinfer1::LayerType::kMATRIX_MULTIPLY:
        {
            // Assume the weights are on the second input.
            matmulI2L.insert({l->getInput(1), l});
            break;
        }
        case nvinfer1::LayerType::kCONSTANT:
        {
            DataType const dtype = static_cast<nvinfer1::IConstantLayer*>(l)->getWeights().type;
            if (dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF)
            {
                // Sparsify float weights only.
                constO2L.insert({l->getOutput(0), l});
            }
            break;
        }
        case nvinfer1::LayerType::kSHUFFLE:
        {
            shuffleI2L.insert({l->getInput(0), l});
            shuffleL2O.insert({l, l->getOutput(0)});
            break;
        }
        default: break;
        }
    };
    int32_t const nbLayers = network.getNbLayers();
    for (int32_t i = 0; i < nbLayers; ++i)
    {
        collectMappingInfo(i);
    }
    if (matmulI2L.size() == 0 || constO2L.size() == 0)
    {
        // No MatrixMultiply or Constant layer found, no weights to sparsify.
        return;
    }

    // Helpers for the analysis below.
    auto isTranspose
        = [](nvinfer1::Permutation const& perm) -> bool { return (perm.order[0] == 1 && perm.order[1] == 0); };
    auto is2D = [](nvinfer1::Dims const& dims) -> bool { return dims.nbDims == 2; };
    auto isIdenticalReshape = [](nvinfer1::Dims const& dims) -> bool
    {
        for (int32_t i = 0; i < dims.nbDims; ++i)
        {
            // The reshape is an identity only when every dimension matches its index or is the
            // inferred placeholder (-1).
            if (dims.d[i] != i && dims.d[i] != -1)
            {
                return false;
            }
        }
        return true;
    };
    auto tensorReachedViaTranspose = [&](nvinfer1::ITensor* t, bool& needTranspose) -> ITensor*
    {
        while (shuffleI2L.find(t) != shuffleI2L.end())
        {
            nvinfer1::IShuffleLayer* s = static_cast<nvinfer1::IShuffleLayer*>(shuffleI2L.at(t));
            if (!is2D(s->getInput(0)->getDimensions()) || !is2D(s->getReshapeDimensions())
                || !isIdenticalReshape(s->getReshapeDimensions()))
            {
                break;
            }

            if (isTranspose(s->getFirstTranspose()))
            {
                needTranspose = !needTranspose;
            }
            if (isTranspose(s->getSecondTranspose()))
            {
                needTranspose = !needTranspose;
            }

            t = shuffleL2O.at(s);
        }
        return t;
    };

    // 2. Forward analysis to collect the Constant layers connected to MatMul via Transpose.
    std::unordered_map<nvinfer1::IConstantLayer*, bool> constantLayerToSparse;
    for (auto& o2l : constO2L)
    {
        // Whether the weights of the Constant layer need to be transposed.
        // Transpose by default due to the semantic difference.
        bool needTranspose{true};
        ITensor* t = tensorReachedViaTranspose(o2l.first, needTranspose);
        if (matmulI2L.find(t) == matmulI2L.end())
        {
            continue;
        }

        // Check the MatMul parameters.
        IMatrixMultiplyLayer* mm = static_cast<nvinfer1::IMatrixMultiplyLayer*>(matmulI2L.at(t));
        bool const twoInputs = mm->getNbInputs() == 2;
        bool const all2D = is2D(mm->getInput(0)->getDimensions()) && is2D(mm->getInput(1)->getDimensions());
        bool const isSimple = mm->getOperation(0) == nvinfer1::MatrixOperation::kNONE
            && mm->getOperation(1) != nvinfer1::MatrixOperation::kVECTOR;
        if (!(twoInputs && all2D && isSimple))
        {
            continue;
        }
        if (mm->getOperation(1) == nvinfer1::MatrixOperation::kTRANSPOSE)
        {
            needTranspose = !needTranspose;
        }

        constantLayerToSparse.insert({static_cast<IConstantLayer*>(o2l.second), needTranspose});
    }

    // 3. Finally, sparsify the weights.
    auto sparsifyConstantWeights = [&sparseWeights](nvinfer1::IConstantLayer* layer, bool const needTranspose)
    {
        Dims dims = layer->getOutput(0)->getDimensions();
        ASSERT(dims.nbDims == 2);
        int32_t const idxN = needTranspose ? 1 : 0;
        int32_t const n = dims.d[idxN];
        int32_t const k = dims.d[1 - idxN];
        sparseWeights.emplace_back();
        std::vector<int8_t>& spw = sparseWeights.back();
        Weights w = layer->getWeights();
        DataType const dtype = w.type;
        ASSERT(dtype == nvinfer1::DataType::kFLOAT
            || dtype == nvinfer1::DataType::kHALF); // Non-float weights should have been ignored.

        if (needTranspose)
        {
            if (dtype == nvinfer1::DataType::kFLOAT)
            {
                spw.resize(w.count * sizeof(float));
                transpose2DWeights<float>(spw.data(), w.values, k, n);
            }
            else if (dtype == nvinfer1::DataType::kHALF)
            {
                spw.resize(w.count * sizeof(half_float::half));
                transpose2DWeights<half_float::half>(spw.data(), w.values, k, n);
            }

            w.values = spw.data();
            std::vector<int8_t> tmpW;
            sparsify(w, n, 1, tmpW);

            if (dtype == nvinfer1::DataType::kFLOAT)
            {
                transpose2DWeights<float>(spw.data(), tmpW.data(), n, k);
            }
            else if (dtype == nvinfer1::DataType::kHALF)
            {
                transpose2DWeights<half_float::half>(spw.data(), tmpW.data(), n, k);
            }
        }
        else
        {
            sparsify(w, n, 1, spw);
        }

        w.values = spw.data();
        layer->setWeights(w);
    };
    for (auto& l : constantLayerToSparse)
    {
        sparsifyConstantWeights(l.first, l.second);
    }
}
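
// Pattern handled above (sketch): Constant -> [2D Shuffle chain] -> MatrixMultiply input 1.
// Each transpose along the chain, and a kTRANSPOSE operation on the MatMul itself, flips
// needTranspose, so the 2:4 pruning is applied along the K (reduction) axis regardless of
// how the weights are laid out in the Constant layer.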

template <typename L>
void setSparseWeights(L& l, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights)
{
    auto weights = l.getKernelWeights();
    sparsify(weights, k, trs, sparseWeights);
    weights.values = sparseWeights.data();
    l.setKernelWeights(weights);
}

// Explicit instantiation
template void setSparseWeights<IConvolutionLayer>(
    IConvolutionLayer& l, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights);
template void setSparseWeights<IFullyConnectedLayer>(
    IFullyConnectedLayer& l, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights);

void sparsify(nvinfer1::INetworkDefinition& network, std::vector<std::vector<int8_t>>& sparseWeights)
{
    for (int32_t l = 0; l < network.getNbLayers(); ++l)
    {
        auto* layer = network.getLayer(l);
        auto const t = layer->getType();
        if (t == nvinfer1::LayerType::kCONVOLUTION)
        {
            auto& conv = *static_cast<IConvolutionLayer*>(layer);
            auto const& dims = conv.getKernelSizeNd();
            ASSERT(dims.nbDims == 2 || dims.nbDims == 3);
            auto const k = conv.getNbOutputMaps();
            auto const trs = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<int32_t>());
            sparseWeights.emplace_back();
            setSparseWeights(conv, k, trs, sparseWeights.back());
        }
        else if (t == nvinfer1::LayerType::kFULLY_CONNECTED)
        {
            auto& fc = *static_cast<nvinfer1::IFullyConnectedLayer*>(layer);
            auto const k = fc.getNbOutputChannels();
            sparseWeights.emplace_back();
            setSparseWeights(fc, k, 1, sparseWeights.back());
        }
    }

    sparsifyMatMulKernelWeights(network, sparseWeights);
}
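
// Usage sketch: after constructing the network, force eligible layers into a 2:4 pattern.
//     std::vector<std::vector<int8_t>> sparseWeights; // storage must outlive engine building
//     sparsify(network, sparseWeights);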

void sparsify(Weights const& weights, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights)
{
    switch (weights.type)
    {
    case DataType::kFLOAT:
        sparsify(static_cast<float const*>(weights.values), weights.count, k, trs, sparseWeights);
        break;
    case DataType::kHALF:
        sparsify(static_cast<half_float::half const*>(weights.values), weights.count, k, trs, sparseWeights);
        break;
    case DataType::kINT8:
    case DataType::kINT32:
    case DataType::kUINT8:
    case DataType::kBOOL:
    case DataType::kFP8: break;
    }
}

template <typename T>
void print(std::ostream& os, T v)
{
    os << v;
}

// Print int8_t as an integer rather than as a character.
void print(std::ostream& os, int8_t v)
{
    os << static_cast<int32_t>(v);
}

template <typename T>
void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
    Dims const& strides, int32_t vectorDim, int32_t spv)
{
    auto const vol = volume(dims);
    T const* typedBuffer = static_cast<T const*>(buffer);
    std::string sep;
    for (int64_t v = 0; v < vol; ++v)
    {
        int64_t curV = v;
        int32_t dataOffset = 0;
        for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex)
        {
            int32_t dimVal = curV % dims.d[dimIndex];
            if (dimIndex == vectorDim)
            {
                dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv;
            }
            else
            {
                dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 1 : spv);
            }
            curV /= dims.d[dimIndex];
            ASSERT(curV >= 0);
        }

        os << sep;
        sep = separator;
        print(os, typedBuffer[dataOffset]);
    }
}
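
// Worked example (illustrative): with dims = {2, 4}, strides = {4, 1}, vectorDim = 1, and
// spv = 2, the element at (i = 1, j = 3) lands at offset
// (3 / 2) * 1 * 2 + 3 % 2 + 1 * 4 * 2 = 2 + 1 + 8 = 11 in the vectorized buffer.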

// Explicit instantiation
template void dumpBuffer<bool>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
    Dims const& strides, int32_t vectorDim, int32_t spv);
template void dumpBuffer<int32_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
    Dims const& strides, int32_t vectorDim, int32_t spv);
template void dumpBuffer<int8_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
    Dims const& strides, int32_t vectorDim, int32_t spv);
template void dumpBuffer<float>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
    Dims const& strides, int32_t vectorDim, int32_t spv);
template void dumpBuffer<__half>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
    Dims const& strides, int32_t vectorDim, int32_t spv);
template void dumpBuffer<uint8_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
    Dims const& strides, int32_t vectorDim, int32_t spv);

template <typename T>
void sparsify(T const* values, int64_t count, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights)
{
    auto const c = count / (k * trs);
    sparseWeights.resize(count * sizeof(T));
    auto* sparseValues = reinterpret_cast<T*>(sparseWeights.data());

    // 2:4 structured sparsity: at most `nonzeros` nonzero values in every `window`
    // consecutive elements along the C axis.
    constexpr int32_t window = 4;
    constexpr int32_t nonzeros = 2;

    int32_t const crs = c * trs;
    auto const getIndex = [=](int32_t ki, int32_t ci, int32_t rsi) { return ki * crs + ci * trs + rsi; };

    for (int64_t ki = 0; ki < k; ++ki)
    {
        for (int64_t rsi = 0; rsi < trs; ++rsi)
        {
            int32_t w = 0;
            int32_t nz = 0;
            for (int64_t ci = 0; ci < c; ++ci)
            {
                auto const index = getIndex(ki, ci, rsi);
                // Keep the first `nonzeros` values in each window; zero the rest.
                if (nz < nonzeros)
                {
                    sparseValues[index] = values[index];
                    ++nz;
                }
                else
                {
                    sparseValues[index] = 0;
                }
                if (++w == window)
                {
                    w = 0;
                    nz = 0;
                }
            }
        }
    }
}
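
// Worked example (illustrative values): along each window of 4 consecutive C entries the first
// 2 are kept and the last 2 are zeroed, so {5, 7, 1, 3} becomes {5, 7, 0, 0}. Keeping the first
// two values rather than the two largest is enough to satisfy the 2:4 structured-sparsity
// constraint that sparse kernels require.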

// Explicit instantiation
template void sparsify<float>(
    float const* values, int64_t count, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights);
template void sparsify<half_float::half>(
    half_float::half const* values, int64_t count, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights);

template <typename T>
void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n)
{
    ASSERT(dst != src);
    T* tdst = reinterpret_cast<T*>(dst);
    T const* tsrc = reinterpret_cast<T const*>(src);
    for (int32_t mi = 0; mi < m; ++mi)
    {
        for (int32_t ni = 0; ni < n; ++ni)
        {
            int32_t const isrc = mi * n + ni;
            int32_t const idst = ni * m + mi;
            tdst[idst] = tsrc[isrc];
        }
    }
}
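
// Worked example (illustrative values): transposing the 2x3 row-major matrix {1, 2, 3, 4, 5, 6}
// with m = 2 and n = 3 yields the 3x2 matrix {1, 4, 2, 5, 3, 6}.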

// Explicit instantiation
template void transpose2DWeights<float>(void* dst, void const* src, int32_t const m, int32_t const n);
template void transpose2DWeights<half_float::half>(void* dst, void const* src, int32_t const m, int32_t const n);

// Integral types: draw uniformly distributed integers in [min, max].
template <typename T, typename std::enable_if<std::is_integral<T>::value, bool>::type>
void fillBuffer(void* buffer, int64_t volume, T min, T max)
{
    T* typedBuffer = static_cast<T*>(buffer);
    std::default_random_engine engine;
    std::uniform_int_distribution<int32_t> distribution(min, max);
    auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
    std::generate(typedBuffer, typedBuffer + volume, generator);
}

// Floating-point types: draw uniformly distributed reals in [min, max).
template <typename T, typename std::enable_if<!std::is_integral<T>::value, int32_t>::type>
void fillBuffer(void* buffer, int64_t volume, T min, T max)
{
    T* typedBuffer = static_cast<T*>(buffer);
    std::default_random_engine engine;
    std::uniform_real_distribution<float> distribution(min, max);
    auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
    std::generate(typedBuffer, typedBuffer + volume, generator);
}
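
// Usage sketch (hypothetical host buffer): fill 100 floats uniformly in [-1, 1).
//     std::vector<float> host(100);
//     fillBuffer<float>(host.data(), host.size(), -1.0F, 1.0F);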

// Explicit instantiation
template void fillBuffer<bool>(void* buffer, int64_t volume, bool min, bool max);
template void fillBuffer<float>(void* buffer, int64_t volume, float min, float max);
template void fillBuffer<int32_t>(void* buffer, int64_t volume, int32_t min, int32_t max);
template void fillBuffer<int8_t>(void* buffer, int64_t volume, int8_t min, int8_t max);
template void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max);
template void fillBuffer<uint8_t>(void* buffer, int64_t volume, uint8_t min, uint8_t max);

} // namespace sample